In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MultiLabelBinarizer
from pandas.api.types import CategoricalDtype

#### **Variables & Description**

|   variable_name  | type |                     desc                     |
|:----------------:|:----:|:--------------------------------------------:|
| `proj`           | df   | main DataFrame                               |
| `services`       | list | list of all subscription services            |
| `all_gen`        | list | list of all available genres                 |
| `countries_list` | list | list of all countries                        |
| `service_pipe`   | df   | proj joined with `Services` one hot encoding |
| `genre_pipe`     | df   | proj joined with `Genres` one hot encoding   |
|                  |      |                                              |

#### Initial Set-up and reading the data:

In [2]:
#initial set-up
proj = pd.read_csv('project.csv', names=['ShowName', 'Links', 'Imdb', 'Reelgood', 'Genres', 'MaturityRating', 'RunningDate', 'Seasons', 'ReturningDate', 'Service', 'Country'], header=0)

# for some fucking reason... pag ni iterate mo what looks like a god damn list... turns out.. it's not
# for i in proj.Country: #Same for Genres and Services
#     for j in i:
#         time.sleep(1)
#         print(j)

services = ['Netflix', 'Hulu', 'Prime Video', 'Disney+', 'HBO', 'HBO MAX', 
            'Peacock', 'Apple TV+', 'fuboTV', 'Showtime', 'Starz', 
            'CBS All Access','Epix', 'Crunchyroll', 'Funimation', 'AMC Premiere', 
            'Kanopy', 'Hoopla', 'The Criterion Channel', 'BritBox', 'DC Universe', 
            'Mubi', 'Cinemax', 'Fandor', 'AcornTV', 'Hallmark Movies Now', 'BET+', 
            'Youtube Premium','IndieFlix']

all_gen = ['Action & Adventure', 'Animation', 'Anime', 'Children', 'Comedy', 'Crime', 
           'Documentary', 'Drama', 'Family', 'Fantasy', 'Food', 'Game Show','Home & Garden',
           'Horror','LGBTQ', 'Mystery', 'Reality', 'Science-Fiction', 'Sport', 'Stand-up & Talk', 'Travel']

countries_list = ['America','Argentina','Australia','Austria','Belgium','Brazil','British Indian Ocean Territory','Bulgaria','Canada',
                  'Chile','China','Colombia','Columbia','Croatia','Czechia','Denmark',
                  'Egypt','Finland','France','Germany','Hong Kong','Hungary',
                  'Iceland','India','Iraq','Ireland','Israel','Italy','Japan',
                  'Luxembourg','Malaysia','Mexico','Netherlands','New Zealand',
                  'Norway','Philippines','Poland','Puerto Rico','Russia','South Africa',
                  'South Korea','Spain','Sweden','Switzerland','Taiwan','Thailand','Turkey','United Kingdom']

def New_Services(serve):
    new_list=[]
    for service in services:
        if service in serve:
            new_list.append(service)
    return new_list

def New_Genres(more_gen):
    Genre = []
    for gen in all_gen:
        if gen in more_gen:
            Genre.append(gen)
    return Genre

def New_Countries(cntry):
    new_country = []
    for i in countries_list:
        if i in cntry:
            new_country.append(i)
    return new_country

proj['Service'] = proj['Service'].astype(str).apply(New_Services)
proj['Genres'] = proj['Genres'].astype(str).apply(New_Genres)
proj['Country'] = proj['Country'].astype(str).apply(New_Countries)

#### Manipulating RunningDate
- separate `StartDate` and `EndDate`
- new column `RunYears`
    - `EndDate` - `StartDate`
    - if present 2020
    
|    StartDate     |    EndDate     |       RunYears       |
|:----------------:|:--------------:|:--------------------:|
| RunningDate[0]   | RunningDate[1] | EndDate - StartDate  |

In [3]:
#handling running dates
proj['StartDate'] = proj['RunningDate'].apply(lambda x: int(x.split(' - ')[0]))
proj['EndDate'] = proj['RunningDate'].apply(lambda x: 2020 if x.split(' - ')[1] == 'Present' else int(x.split(' - ')[1]))
proj['RunYears'] = proj['EndDate'] - proj['StartDate']

#### Manipulating Maturity Rating
- set type as category (Order)
    - Rated: All (TV-G)
    - Rated: 7+ (TV-PG)
    - Rated: 13
    - Rated: 14+ (TV-14)
    - Rated: 18+ (TV-MA)

In [4]:
cat_type = CategoricalDtype(categories=['Rated: All (TV-G)','Rated: 7+ (TV-PG)',
                                        'Rated: 13','Rated: 14+ (TV-14)','Rated: 18+ (TV-MA)'], ordered=True)
proj['MaturityRating'] = proj.MaturityRating.astype(cat_type)

In [5]:
proj['MaturityRating'].unique()

[Rated: 18+ (TV-MA), Rated: 14+ (TV-14), Rated: 7+ (TV-PG), NaN, Rated: All (TV-G)]
Categories (4, object): [Rated: All (TV-G) < Rated: 7+ (TV-PG) < Rated: 14+ (TV-14) < Rated: 18+ (TV-MA)]

### Manipulating Seasons
- Making Seasons Categorical (Ordered)
    - idea here is for easy manipulation like using [cut()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html#pandas.cut) to make bins
    - also categorical data is smaller and faster

In [6]:
proj['Seasons'].unique()

array(['5 Seasons', '8 Seasons', '4 Seasons', '3 Seasons', '2 Seasons',
       '1 Season', '10 Seasons', '9 Seasons', '15 Seasons', '31 Seasons',
       '6 Seasons', '7 Seasons', '23 Seasons', '12 Seasons', '11 Seasons',
       '14 Seasons', '16 Seasons', '24 Seasons', '27 Seasons',
       '19 Seasons', '18 Seasons', '17 Seasons', '21 Seasons',
       '28 Seasons', '13 Seasons', '29 Seasons', '25 Seasons',
       '45 Seasons', '40 Seasons', '20 Seasons', '51 Seasons',
       '35 Seasons', '62 Seasons', '44 Seasons', '54 Seasons',
       '32 Seasons', '22 Seasons', '26 Seasons', '36 Seasons',
       '34 Seasons', '52 Seasons', '41 Seasons', '37 Seasons',
       '42 Seasons', '33 Seasons', '38 Seasons', '39 Seasons',
       '43 Seasons', '48 Seasons', '57 Seasons'], dtype=object)

In [7]:
sea_type = CategoricalDtype(categories=['1 Season', '2 Seasons','3 Seasons','4 Seasons','5 Seasons',
                                        '6 Seasons','7 Seasons','8 Seasons','9 Seasons','10 Seasons',
                                        '11 Seasons','12 Seasons','13 Seasons','14 Seasons','15 Seasons',
                                        '16 Seasons','17 Seasons','18 Seasons','19 Seasons','20 Seasons',
                                        '21 Seasons','22 Seasons','23 Seasons','24 Seasons','25 Seasons',
                                        '26 Seasons','27 Seasons','28 Seasons','29 Seasons','31 Seasons',
                                        '32 Seasons', '33 Seasons','34 Seasons','35 Seasons','36 Seasons',
                                        '37 Seasons','38 Seasons','39 Seasons','40 Seasons','41 Seasons',
                                        '42 Seasons','43 Seasons','44 Seasons','45 Seasons','48 Seasons',
                                        '51 Seasons','52 Seasons','54 Seasons','57 Seasons','62 Seasons'], ordered=True)
proj['Seasons'] = proj.Seasons.astype(sea_type)     

### Possible Questions:

- Running dates:
    - inspect how many years did the show ran and it's ratings
    - how many shows are still running.
        - ratings
        - genres
    - how many shows has ended.
        - ratings
        - genres
- Genres:
    - how many shows per genre: %
    - highest rated per genre
- Maturity rating:
    - how many shows per rating %
    - highest rated per rating
- Services:
    - shows available per Service
    - top shows per service
- Seasons:
    - heatmap no of seasons per service
        - what services have a higher retention of shows
- Correlation:
    - ratings vs running time
    - ratings vs genres
    - ratings vs countries
    - ratings vs maturity ratings
    - does ratings affect the no of seasons

### Descriptive Stats

In [8]:
proj.head()

Unnamed: 0,ShowName,Links,Imdb,Reelgood,Genres,MaturityRating,RunningDate,Seasons,ReturningDate,Service,Country,StartDate,EndDate,RunYears
0,Breaking Bad,https://reelgood.com/show/breaking-bad-2008,9.5,100.0,"[Crime, Drama]",Rated: 18+ (TV-MA),2008 - 2013,5 Seasons,Series Ended,[Netflix],[America],2008,2013,5
1,Game of Thrones,https://reelgood.com/show/game-of-thrones-2011,9.3,99.0,"[Action & Adventure, Drama, Fantasy, Science-F...",Rated: 18+ (TV-MA),2011 - 2019,8 Seasons,Series Ended,"[HBO, HBO MAX]",[America],2011,2019,8
2,Rick and Morty,https://reelgood.com/show/rick-and-morty-2013,9.2,97.0,"[Action & Adventure, Animation, Comedy, Fantas...",Rated: 18+ (TV-MA),2013 - Present,4 Seasons,Returning: Date TBA,"[Hulu, HBO, HBO MAX, fuboTV, Hoopla]",[America],2013,2020,7
3,Stranger Things,https://reelgood.com/show/stranger-things-2016,8.8,96.0,"[Action & Adventure, Drama, Fantasy, Horror, M...",Rated: 14+ (TV-14),2016 - Present,3 Seasons,Returning: Date TBA,[Netflix],[America],2016,2020,4
4,Dark,https://reelgood.com/show/dark-2017,8.8,95.0,"[Crime, Drama, Fantasy, Mystery, Science-Fiction]",Rated: 14+ (TV-14),2017 - 2020,3 Seasons,Series Ended,[Netflix],[Germany],2017,2020,3


In [9]:
proj.columns 

Index(['ShowName', 'Links', 'Imdb', 'Reelgood', 'Genres', 'MaturityRating',
       'RunningDate', 'Seasons', 'ReturningDate', 'Service', 'Country',
       'StartDate', 'EndDate', 'RunYears'],
      dtype='object')

In [10]:
proj.shape

(9778, 14)

In [11]:
proj.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9778 entries, 0 to 9777
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   ShowName        9778 non-null   object  
 1   Links           9778 non-null   object  
 2   Imdb            8282 non-null   float64 
 3   Reelgood        9294 non-null   float64 
 4   Genres          9778 non-null   object  
 5   MaturityRating  6012 non-null   category
 6   RunningDate     9778 non-null   object  
 7   Seasons         9778 non-null   category
 8   ReturningDate   9778 non-null   object  
 9   Service         9778 non-null   object  
 10  Country         9778 non-null   object  
 11  StartDate       9778 non-null   int64   
 12  EndDate         9778 non-null   int64   
 13  RunYears        9778 non-null   int64   
dtypes: category(2), float64(2), int64(3), object(7)
memory usage: 5.4 MB


In [12]:
proj.dtypes

ShowName            object
Links               object
Imdb               float64
Reelgood           float64
Genres              object
MaturityRating    category
RunningDate         object
Seasons           category
ReturningDate       object
Service             object
Country             object
StartDate            int64
EndDate              int64
RunYears             int64
dtype: object

In [13]:
proj.describe()

Unnamed: 0,Imdb,Reelgood,StartDate,EndDate,RunYears
count,8282.0,9294.0,9778.0,9778.0,9778.0
mean,7.042272,49.406606,2010.544794,2014.788403,4.243608
std,1.116074,17.560583,11.51823,10.053876,6.770087
min,1.0,10.0,1901.0,1939.0,-40.0
25%,6.5,39.0,2009.0,2014.0,1.0
50%,7.2,50.0,2014.0,2020.0,3.0
75%,7.8,61.0,2017.0,2020.0,5.0
max,9.6,100.0,2020.0,2020.0,119.0


In [14]:
# Daming NaN Countries
proj.isna().sum()

ShowName             0
Links                0
Imdb              1496
Reelgood           484
Genres               0
MaturityRating    3766
RunningDate          0
Seasons              0
ReturningDate        0
Service              0
Country              0
StartDate            0
EndDate              0
RunYears             0
dtype: int64

#### 1. What are the top 10 shows on IMDB that are available on each streaming service? And what are their ratings

In [15]:
# One Hot Encoding
def one_hot_encoding(dataframe, col):
    '''just run once you bitch'''
    mlb = MultiLabelBinarizer()
    new = dataframe.join(pd.DataFrame(mlb.fit_transform(dataframe.pop(col)),
                                      columns=mlb.classes_,index=dataframe.index))
    return new
# mlb.classes_

# Get encoded then sort while filtering columns
def get_encoded_sort_desc(dataframe, col_one_hot_encoded, sort_by_col, filter_cols):
    sorteddf = dataframe[dataframe[col_one_hot_encoded] == 1].sort_values(by=sort_by_col,ascending=False).filter(items=filter_cols)
    return sorteddf.reset_index(drop=True)

In [16]:
# pipelines note: run once cuz you poppin
service_pipe = proj.pipe(one_hot_encoding,col='Service')
genres_pipe = proj.pipe(one_hot_encoding,col='Genres')
country_pipe = proj.pipe(one_hot_encoding,col='Country')

In [17]:
# IMDB - sorry katamaran lolz
for element in services:
    print(f"{element} on IMDB\nservice_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='{element}',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)\n") 

Netflix on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Netflix',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Hulu on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Hulu',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Prime Video on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Prime Video',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Disney+ on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Disney+',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

HBO on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='HBO',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

HBO MAX on IMDB
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='HBO MAX',sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Peacock on IMDB
service_pipe.pipe(get_encoded_

In [18]:
# paste code below from ^
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Cinemax',
                  sort_by_col='Imdb',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Unnamed: 0,ShowName,Imdb,Reelgood
0,The Knick,8.5,80.0
1,Mike Judge Presents: Tales From the Tour Bus,8.5,64.0
2,Banshee,8.4,83.0
3,Warrior,8.2,76.0
4,Strike Back,8.2,74.0
5,Warrior,8.2,76.0
6,Strike,7.9,67.0
7,GATE,7.5,66.0
8,Outcast,7.4,72.0
9,Jett,7.4,69.0


#### 2. What are the top 10 shows on Reelgood that are available on each streaming service? And what are their ratings

In [19]:
# Reelgood - sorry katamaran lolz
for element in services:
    print(f"{element}\nservice_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='{element}',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)\n") 

Netflix
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Netflix',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Hulu
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Hulu',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Prime Video
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Prime Video',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Disney+
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Disney+',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

HBO
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='HBO',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

HBO MAX
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='HBO MAX',sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Peacock
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='P

In [20]:
# paste code below from ^
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='IndieFlix',
                  sort_by_col='Reelgood',filter_cols=['ShowName','Imdb', 'Reelgood']).head(10)

Unnamed: 0,ShowName,Imdb,Reelgood
0,The Beverly Hillbillies,7.2,63.0
1,Hamatora,7.2,63.0
2,A Haunting,8.0,63.0
3,Bonanza,7.3,62.0
4,The Lucy Show,7.2,57.0
5,The Adventures of Ozzie and Harriet,7.4,53.0
6,Nazi Hunters,7.7,44.0
7,American Guns,6.6,44.0
8,THURSTON-The Western Web Series,5.7,34.0


#### 3. Plot the number of shows categorized per year.

#### 4. How many shows are there per streaming service (based from Reelgood.com)?

In [21]:
# please note that some titles are available on different services
service_pipe.filter(items=services).sum().reset_index().rename(columns={'index' : 'Service',0:'Count'})

Unnamed: 0,Service,Count
0,Netflix,2146
1,Hulu,2067
2,Prime Video,2535
3,Disney+,267
4,HBO,569
5,HBO MAX,519
6,Peacock,46
7,Apple TV+,38
8,fuboTV,1328
9,Showtime,96


#### 5. For each streaming service, what is the targeted age group?

In [22]:
# wait ang labo hahahha
# basically, count of maturity rating per service?
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Netflix',sort_by_col='MaturityRating',
                  filter_cols=['ShowName','MaturityRating']).groupby('MaturityRating').count().rename(columns={'ShowName':'Count'})

Unnamed: 0_level_0,Count
MaturityRating,Unnamed: 1_level_1
Rated: All (TV-G),189
Rated: 7+ (TV-PG),357
Rated: 13,0
Rated: 14+ (TV-14),496
Rated: 18+ (TV-MA),442


In [23]:
# Maturity
for element in services:
    print(f"{element}\nservice_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='{element}',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={{'ShowName':'Count'}})\n") 

Netflix
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Netflix',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={'ShowName':'Count'})

Hulu
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Hulu',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={'ShowName':'Count'})

Prime Video
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Prime Video',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={'ShowName':'Count'})

Disney+
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='Disney+',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={'ShowName':'Count'})

HBO
service_pipe.pipe(get_encoded_sort_desc,col_one_hot_encoded='HBO',sort_by_col='MaturityRating',filter_cols=['ShowName','MaturityRating']).count().rename(columns={'ShowName':'Count'})

HBO MAX
service_pipe.pipe(

#### 6. What is the average rating of all shows available per streaming services?

In [24]:
#omaygahd and haba... pag madaming variables naliito na ako :(
pd.pivot_table(service_pipe.filter(items=['Imdb']+services).dropna().melt(id_vars=['Imdb']).\
               rename(columns={'variable':'Services'}).set_index('value').filter(like='1', axis=0),
               index=['Services'],aggfunc=np.mean)

Unnamed: 0_level_0,Imdb
Services,Unnamed: 1_level_1
AMC Premiere,7.43913
AcornTV,7.387692
Apple TV+,6.889189
BET+,6.287356
BritBox,7.441853
CBS All Access,6.929851
Cinemax,7.526667
Crunchyroll,6.910714
DC Universe,7.377083
Disney+,6.950202


#### 7. Based on the list, which region has the most number of produced shows?

#### 9. Which online streaming service is worth the money in terms of quality of shows? 
 how would we quantify this?
 - Ave rating for IMDB and Reelgood
 - Quantity of Aavailable shows (refer to #4)
 - longlasting shows? total run dates

### Additional Questions:

#### Running dates:
- inspect how many years the show ran and it's ratings
- ave running time per service
    - RunYears
    - Seasons
- how many shows are still running.
    - ratings
    - genres
- how many shows has ended.
    - ratings
    - genres