In [2]:
import pandas as pd
import csv
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Reading TV Time CSV Data
tvshows_df = pd.read_csv("Data/tvtimeshows.csv")
tvshows_df.head()

Unnamed: 0,id,name,followers,nb_rates,runtime,number_of_seasons,network,mean_rate,poster_image,seasons
0,288128,X-Ray & Vav,230,10,9,2,Rooster Teeth,4.2,https://dg31sz3gwrwan.cloudfront.net/poster/28...,"[{'number': 1, 'nb_episodes': 4}, {'number': 2..."
1,313803,Sister's Slam Dunk,1298,100,80,2,KBS TV2,4.25,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 33}, {'number': ..."
2,325462,Reverse,270,15,60,1,Tokyo Broadcasting System,4.63,https://dg31sz3gwrwan.cloudfront.net/poster/32...,"[{'number': 1, 'nb_episodes': 10}]"
3,325198,Frame Arms Girl,900,24,25,1,Tokyo MX,3.54,https://dg31sz3gwrwan.cloudfront.net/poster/32...,"[{'number': 1, 'nb_episodes': 12}]"
4,349743,Mr Inbetween,7123,118,25,2,FX,4.42,https://dg31sz3gwrwan.cloudfront.net/poster/34...,"[{'number': 1, 'nb_episodes': 6}, {'number': 2..."


In [4]:
# Reading All Episodes CSV Data
episode_df = pd.read_csv("Data/all_episodes.csv")
episode_df.head()

Unnamed: 0,show_id,episode_id,time,episode,title,description,rating,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad
0,315103,5697528,2016-10-05T03:00,S01E01,Idiots with Numbers!,Idiots with Numbers! (S01E01) is the first epi...,9.48,12307.0,175.0,388.0,136.0,1.0,16.0,3.0
1,315103,5774664,2016-10-12T03:00,S01E02,The Inmates Are Stupid! The Guards Are Kind of...,The Inmates Are Stupid! The Guards Are Kind of...,9.52,11423.0,156.0,349.0,82.0,0.0,5.0,3.0
2,315103,5774665,2016-10-19T03:00,S01E03,Another Idiot Has Come!!,Another Idiot Has Come!! (S01E03) is the third...,9.36,11067.0,145.0,339.0,63.0,0.0,7.0,1.0
3,315103,5774666,2016-10-26T03:00,S01E04,Happy New Year! The New Year's Tournament Is W...,Happy New Year! The New Year's Tournament Is W...,8.36,10552.0,134.0,270.0,109.0,0.0,3.0,3.0
4,315103,5774667,2016-11-02T03:00,S01E05,A Fraud and a Hero,A Fraud and a Hero (S01E05) is the fifth episo...,8.94,10340.0,106.0,183.0,223.0,1.0,2.0,2.0


In [5]:
pd.set_option('display.max_columns', None)

In [6]:
# Merging both dataframe on Show Id
merged_df = pd.merge(episode_df, tvshows_df, how="inner", left_on="show_id", right_on="id")
merged_df.head()

Unnamed: 0,show_id,episode_id,time,episode,title,description,rating,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,id,name,followers,nb_rates,runtime,number_of_seasons,network,mean_rate,poster_image,seasons
0,315103,5697528,2016-10-05T03:00,S01E01,Idiots with Numbers!,Idiots with Numbers! (S01E01) is the first epi...,9.48,12307.0,175.0,388.0,136.0,1.0,16.0,3.0,315103,Nanbaka,16617,620,25,2,MBS,4.57,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 13}, {'number': ..."
1,315103,5774664,2016-10-12T03:00,S01E02,The Inmates Are Stupid! The Guards Are Kind of...,The Inmates Are Stupid! The Guards Are Kind of...,9.52,11423.0,156.0,349.0,82.0,0.0,5.0,3.0,315103,Nanbaka,16617,620,25,2,MBS,4.57,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 13}, {'number': ..."
2,315103,5774665,2016-10-19T03:00,S01E03,Another Idiot Has Come!!,Another Idiot Has Come!! (S01E03) is the third...,9.36,11067.0,145.0,339.0,63.0,0.0,7.0,1.0,315103,Nanbaka,16617,620,25,2,MBS,4.57,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 13}, {'number': ..."
3,315103,5774666,2016-10-26T03:00,S01E04,Happy New Year! The New Year's Tournament Is W...,Happy New Year! The New Year's Tournament Is W...,8.36,10552.0,134.0,270.0,109.0,0.0,3.0,3.0,315103,Nanbaka,16617,620,25,2,MBS,4.57,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 13}, {'number': ..."
4,315103,5774667,2016-11-02T03:00,S01E05,A Fraud and a Hero,A Fraud and a Hero (S01E05) is the fifth episo...,8.94,10340.0,106.0,183.0,223.0,1.0,2.0,2.0,315103,Nanbaka,16617,620,25,2,MBS,4.57,https://dg31sz3gwrwan.cloudfront.net/poster/31...,"[{'number': 1, 'nb_episodes': 13}, {'number': ..."


In [7]:
merged_df["network"].value_counts().head(20)

Rede Globo                26076
YouTube                   20754
TV Tokyo                  12529
Canal de las Estrellas    10343
CBS                        9075
NBC                        8324
Fuji TV                    8177
MBC                        8002
ABC (US)                   7843
Tokyo MX                   7062
Telemundo                  6716
TV Asahi                   6322
KBS TV2                    5920
Netflix                    5384
SBS (KR)                   5082
FOX                        4771
Cartoon Network            4721
TF1                        4642
SBT                        4431
TVN                        4045
Name: network, dtype: int64

In [8]:
# Reading only American Network Names and sorting it by count of titles
us_network = pd.read_csv("Data/network_names.csv")
us_network = us_network.sort_values(by='Count of title', ascending=False, ignore_index=True)
us_network.head(20)

Unnamed: 0,USA Networks,Count of title
0,CBS,9075
1,NBC,8324
2,ABC (US),7843
3,Netflix,5384
4,FOX,4771
5,Cartoon Network,4721
6,Nickelodeon,4147
7,Disney Channel,2803
8,BBC One,2754
9,PBS,2661


In [9]:
# Inner Merge on Merged Dataframe and Top 15 American Network Data
merged_us_data = pd.merge(merged_df, us_network, 
                          how="inner", 
                          left_on="network", 
                          right_on="USA Networks").drop(columns=['id', 'USA Networks', 'Count of title', 
                                                                 'description','seasons', 'nb_rates', 
                                                                 'rating', 'title'])

merged_us_data

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image
0,79029,375789,2005-11-07T21:30,S01E01,74.0,5.0,3.0,5.0,0.0,0.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...
1,79029,375790,2005-11-14T21:30,S01E02,94.0,14.0,0.0,5.0,0.0,2.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...
2,79029,375791,2005-11-21T21:30,S01E03,76.0,6.0,1.0,2.0,0.0,2.0,1.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...
3,79029,375792,2005-11-28T21:30,S01E04,66.0,4.0,0.0,0.0,0.0,4.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...
4,251999,4173236,2011-02-21T01:00,S01E01,58.0,1.0,0.0,0.0,0.0,0.0,0.0,Mrs. Brown's Boys,193,180,1,BBC One,3.0,https://d36rlb2fgh8cjd.cloudfront.net/default-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81534,252486,4183419,T21:00,S01E08,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...
81535,252486,4183421,T21:00,S01E09,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...
81536,252486,4183422,T21:00,S01E10,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...
81537,248584,4098218,2011-01-27T20:00,S01E01,38.0,3.0,0.0,0.0,0.0,0.0,0.0,Time Paladin Sakura,167,25,1,Discovery Channel,3.0,https://dg31sz3gwrwan.cloudfront.net/poster/24...


In [10]:
merged_us_data.dtypes

show_id                int64
episode_id             int64
time                  object
episode               object
times_watched        float64
mood-good            float64
mood-fun             float64
mood-wow             float64
mood-sad             float64
mood-so-so           float64
mood-bad             float64
name                  object
followers              int64
runtime                int64
number_of_seasons      int64
network               object
mean_rate            float64
poster_image          object
dtype: object

In [11]:
# Converting Object datatype of Time column to String
merged_us_data['time'] = merged_us_data['time'].astype('string')

In [12]:
# Parsing timeslot from the time data using string split function
merged_us_data['timeslot'] = merged_us_data['time'].str.split(pat='T').str[1]

In [13]:
# Parsing year from the time data using string split function
year = merged_us_data['time'].str.split(pat='T').str[0]
merged_us_data['year'] = year.str.split(pat='-').str[0]

In [14]:
merged_us_data

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
0,79029,375789,2005-11-07T21:30,S01E01,74.0,5.0,3.0,5.0,0.0,0.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...,21:30,2005
1,79029,375790,2005-11-14T21:30,S01E02,94.0,14.0,0.0,5.0,0.0,2.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...,21:30,2005
2,79029,375791,2005-11-21T21:30,S01E03,76.0,6.0,1.0,2.0,0.0,2.0,1.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...,21:30,2005
3,79029,375792,2005-11-28T21:30,S01E04,66.0,4.0,0.0,0.0,0.0,4.0,0.0,ShakespeaRe-Told,242,120,1,BBC One,3.4,https://dg31sz3gwrwan.cloudfront.net/poster/79...,21:30,2005
4,251999,4173236,2011-02-21T01:00,S01E01,58.0,1.0,0.0,0.0,0.0,0.0,0.0,Mrs. Brown's Boys,193,180,1,BBC One,3.0,https://d36rlb2fgh8cjd.cloudfront.net/default-...,01:00,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81534,252486,4183419,T21:00,S01E08,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...,21:00,
81535,252486,4183421,T21:00,S01E09,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...,21:00,
81536,252486,4183422,T21:00,S01E10,76.0,0.0,0.0,0.0,0.0,0.0,0.0,Dual Survival,458,1,2,Discovery Channel,0.0,https://dg31sz3gwrwan.cloudfront.net/poster/25...,21:00,
81537,248584,4098218,2011-01-27T20:00,S01E01,38.0,3.0,0.0,0.0,0.0,0.0,0.0,Time Paladin Sakura,167,25,1,Discovery Channel,3.0,https://dg31sz3gwrwan.cloudfront.net/poster/24...,20:00,2011


In [15]:
# Filtering Data between runtime (15 - 90 mins) for tv shows
runtime_df = merged_us_data.loc[(merged_us_data['runtime'] >= 15) & (merged_us_data['runtime'] <= 90)]
runtime_df

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
10,348204,7380549,2019-11-17T21:00,S01E01,2859.0,60.0,0.0,13.0,0.0,5.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,2019
11,348204,7380550,T21:00,S01E02,1879.0,41.0,0.0,7.0,0.0,6.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,
12,348204,7380551,T21:00,S01E03,1082.0,18.0,0.0,1.0,1.0,4.0,1.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,
13,76332,435387,1999-12-25T19:00,S01E01,184.0,13.0,1.0,4.0,3.0,1.0,0.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999
14,76332,435388,1999-12-26T19:00,S01E02,173.0,12.0,1.0,2.0,0.0,1.0,1.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81524,253412,4211482,T21:00,S02E04,147.0,1.0,0.0,1.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,
81525,253412,4211483,T21:00,S02E05,145.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,
81526,253412,4211484,T21:00,S02E06,147.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,
81537,248584,4098218,2011-01-27T20:00,S01E01,38.0,3.0,0.0,0.0,0.0,0.0,0.0,Time Paladin Sakura,167,25,1,Discovery Channel,3.00,https://dg31sz3gwrwan.cloudfront.net/poster/24...,20:00,2011


In [16]:
runtime_df['mood-good'].value_counts().sort_index(ascending=True)

0.0        13041
1.0         7365
2.0         5540
3.0         3997
4.0         2941
           ...  
43454.0        1
43958.0        1
45286.0        1
46890.0        1
54405.0        1
Name: mood-good, Length: 3518, dtype: int64

In [17]:
runtime_df['mood-fun'].value_counts().sort_index(ascending=True)

-1.0            3
 0.0        36868
 1.0         8256
 2.0         4257
 3.0         2783
            ...  
 7436.0         1
 7571.0         1
 7644.0         1
 8637.0         1
 10377.0        1
Name: mood-fun, Length: 1412, dtype: int64

In [18]:
runtime_df['mood-wow'].value_counts().sort_index(ascending=True)

-1.0             1
 0.0         32075
 1.0          8945
 2.0          4444
 3.0          2827
             ...  
 83821.0         1
 85571.0         1
 87258.0         1
 89154.0         1
 104161.0        1
Name: mood-wow, Length: 2415, dtype: int64

In [19]:
runtime_df['mood-sad'].value_counts().sort_index(ascending=True)

-3.0            1
-1.0            5
 0.0        60234
 1.0         4587
 2.0         1864
            ...  
 9964.0         1
 11023.0        1
 13115.0        1
 17344.0        1
 28397.0        1
Name: mood-sad, Length: 425, dtype: int64

In [20]:
runtime_df['mood-so-so'].value_counts().sort_index(ascending=True)

-1.0           7
 0.0       45213
 1.0        8363
 2.0        3571
 3.0        1993
           ...  
 3100.0        1
 3201.0        1
 3392.0        1
 5941.0        1
 7226.0        1
Name: mood-so-so, Length: 597, dtype: int64

In [21]:
runtime_df['mood-bad'].value_counts().sort_index(ascending=True)

-11.0          1
-2.0           3
-1.0           2
 0.0       55189
 1.0        6465
           ...  
 1014.0        1
 1047.0        1
 1620.0        1
 2317.0        1
 3103.0        1
Name: mood-bad, Length: 284, dtype: int64

In [22]:
# Create dataframe for all episode entries where all mood reactions are zero
mood_df = runtime_df.loc[(runtime_df['mood-good'] == 0) & 
                         (runtime_df['mood-fun'] == 0) & 
                         (runtime_df['mood-wow'] == 0) & 
                         (runtime_df['mood-sad'] == 0) & 
                         (runtime_df['mood-so-so'] == 0) & 
                         (runtime_df['mood-bad'] == 0)]

mood_df

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
93,73854,130901,1981-10-30T00:00,S01E07,470.0,0.0,0.0,0.0,0.0,0.0,0.0,Postman Pat,466,15,8,BBC One,4.44,https://dg31sz3gwrwan.cloudfront.net/poster/73...,00:00,1981
94,73854,130902,1981-11-06T00:00,S01E08,470.0,0.0,0.0,0.0,0.0,0.0,0.0,Postman Pat,466,15,8,BBC One,4.44,https://dg31sz3gwrwan.cloudfront.net/poster/73...,00:00,1981
95,73854,130903,1981-11-13T00:00,S01E09,470.0,0.0,0.0,0.0,0.0,0.0,0.0,Postman Pat,466,15,8,BBC One,4.44,https://dg31sz3gwrwan.cloudfront.net/poster/73...,00:00,1981
96,73854,130904,1981-11-20T00:00,S01E10,468.0,0.0,0.0,0.0,0.0,0.0,0.0,Postman Pat,466,15,8,BBC One,4.44,https://dg31sz3gwrwan.cloudfront.net/poster/73...,00:00,1981
97,73854,130905,1981-11-27T00:00,S01E11,467.0,0.0,0.0,0.0,0.0,0.0,0.0,Postman Pat,466,15,8,BBC One,4.44,https://dg31sz3gwrwan.cloudfront.net/poster/73...,00:00,1981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81516,254313,4220794,2005-06-28T12:00,S01E40,177.0,0.0,0.0,0.0,0.0,0.0,0.0,Ventdelpla,218,50,7,Discovery Channel,2.09,https://dg31sz3gwrwan.cloudfront.net/poster/25...,12:00,2005
81517,254313,4220795,2005-07-04T12:00,S01E41,177.0,0.0,0.0,0.0,0.0,0.0,0.0,Ventdelpla,218,50,7,Discovery Channel,2.09,https://dg31sz3gwrwan.cloudfront.net/poster/25...,12:00,2005
81518,254313,4220796,2005-07-05T12:00,S01E42,177.0,0.0,0.0,0.0,0.0,0.0,0.0,Ventdelpla,218,50,7,Discovery Channel,2.09,https://dg31sz3gwrwan.cloudfront.net/poster/25...,12:00,2005
81519,254313,4220797,2005-07-11T12:00,S01E43,177.0,0.0,0.0,0.0,0.0,0.0,0.0,Ventdelpla,218,50,7,Discovery Channel,2.09,https://dg31sz3gwrwan.cloudfront.net/poster/25...,12:00,2005


In [23]:
# Filtering mood_df data from runtime_df
mood_filter_df = runtime_df.merge(mood_df, how="left", indicator=True)
mood_filter_df = mood_filter_df[mood_filter_df['_merge'] == 'left_only']
mood_filter_df

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year,_merge
0,348204,7380549,2019-11-17T21:00,S01E01,2859.0,60.0,0.0,13.0,0.0,5.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,2019,left_only
1,348204,7380550,T21:00,S01E02,1879.0,41.0,0.0,7.0,0.0,6.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,,left_only
2,348204,7380551,T21:00,S01E03,1082.0,18.0,0.0,1.0,1.0,4.0,1.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,,left_only
3,76332,435387,1999-12-25T19:00,S01E01,184.0,13.0,1.0,4.0,3.0,1.0,0.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999,left_only
4,76332,435388,1999-12-26T19:00,S01E02,173.0,12.0,1.0,2.0,0.0,1.0,1.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73121,253412,4211482,T21:00,S02E04,147.0,1.0,0.0,1.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
73122,253412,4211483,T21:00,S02E05,145.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
73123,253412,4211484,T21:00,S02E06,147.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
73124,248584,4098218,2011-01-27T20:00,S01E01,38.0,3.0,0.0,0.0,0.0,0.0,0.0,Time Paladin Sakura,167,25,1,Discovery Channel,3.00,https://dg31sz3gwrwan.cloudfront.net/poster/24...,20:00,2011,left_only


In [30]:
del mood_filter_df['_merge']

In [32]:
mood_filter_df['network'].value_counts().head(20)

CBS                6907
ABC (US)           6785
NBC                6094
Netflix            4860
FOX                4075
Nickelodeon        2512
Cartoon Network    2418
BBC One            2342
Disney Channel     1893
HBO                1562
MTV                1362
Syfy               1158
BBC Two            1139
The CW             1057
PBS                1049
WOWOW               995
Discovery           974
Comedy Central      952
TLC                 918
Amazon              915
Name: network, dtype: int64

In [42]:
mood_filter_df['runtime'].value_counts().sort_index(ascending=True)

15     1573
16       21
17       11
18        1
20      929
25    22896
29       12
30     6484
35      335
40      420
45    21015
50     1279
55      492
60     5657
65      362
70       60
75       71
80       37
85      835
90      433
Name: runtime, dtype: int64

In [44]:
mood_filter_df.loc[mood_filter_df['runtime'] == 80]

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
1835,109411,5086767,1964-08-22T22:15,S1964E01,332.0,4.0,1.0,0.0,0.0,0.0,0.0,Match of the Day,891,80,56,BBC One,3.13,https://dg31sz3gwrwan.cloudfront.net/poster/10...,22:15,1964
4493,252870,4189417,2011-12-11T21:00,S01E01,4522.0,238.0,4.0,104.0,5.0,55.0,14.0,Bag of Bones,9677,80,1,A&E,3.31,https://dg31sz3gwrwan.cloudfront.net/poster/25...,21:00,2011
4494,252870,4189418,2011-12-12T21:00,S01E02,4362.0,211.0,5.0,117.0,5.0,58.0,10.0,Bag of Bones,9677,80,1,A&E,3.31,https://dg31sz3gwrwan.cloudfront.net/poster/25...,21:00,2011
12134,94131,642881,2009-01-27T00:00,S01E01,909.0,38.0,4.0,43.0,0.0,3.0,1.0,HULK VS,1129,80,1,CBS,4.7,https://dg31sz3gwrwan.cloudfront.net/poster/94...,00:00,2009
12135,94131,642891,2009-01-27T00:00,S01E02,914.0,23.0,3.0,62.0,0.0,1.0,0.0,HULK VS,1129,80,1,CBS,4.7,https://dg31sz3gwrwan.cloudfront.net/poster/94...,00:00,2009
21129,322229,5911418,2017-01-06T21:00,S01E01,136.0,10.0,0.0,4.0,2.0,0.0,0.0,Truth and Lies,233,80,1,ABC (US),3.26,https://dg31sz3gwrwan.cloudfront.net/poster/32...,21:00,2017
21130,322229,6009259,2017-03-17T21:00,S01E02,97.0,9.0,0.0,0.0,1.0,0.0,1.0,Truth and Lies,233,80,1,ABC (US),3.26,https://dg31sz3gwrwan.cloudfront.net/poster/32...,21:00,2017
21131,322229,6316040,2017-06-16T21:00,S01E03,73.0,2.0,0.0,1.0,0.0,1.0,0.0,Truth and Lies,233,80,1,ABC (US),3.26,https://dg31sz3gwrwan.cloudfront.net/poster/32...,21:00,2017
21132,322229,6315542,2017-09-14T21:00,S01E04,82.0,2.0,0.0,0.0,0.0,1.0,0.0,Truth and Lies,233,80,1,ABC (US),3.26,https://dg31sz3gwrwan.cloudfront.net/poster/32...,21:00,2017
21133,322229,6487087,2018-01-04T21:00,S01E05,78.0,4.0,0.0,1.0,0.0,1.0,0.0,Truth and Lies,233,80,1,ABC (US),3.26,https://dg31sz3gwrwan.cloudfront.net/poster/32...,21:00,2018


In [53]:
mood_filter_df.loc[mood_filter_df['name'].str.contains('World Cup', case=False, regex=False)]

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
29047,348431,6711409,2018-06-14T00:00,S2018E01,58600.0,1196.0,484.0,550.0,314.0,103.0,325.0,World Cup 2018,72488,90,1,FOX,3.96,https://dg31sz3gwrwan.cloudfront.net/poster/34...,00:00,2018
29048,348431,6711422,2018-06-15T00:00,S2018E02,55792.0,1099.0,34.0,194.0,137.0,193.0,48.0,World Cup 2018,72488,90,1,FOX,3.96,https://dg31sz3gwrwan.cloudfront.net/poster/34...,00:00,2018
29049,348431,6711426,2018-06-15T00:00,S2018E03,55149.0,865.0,83.0,262.0,91.0,236.0,97.0,World Cup 2018,72488,90,1,FOX,3.96,https://dg31sz3gwrwan.cloudfront.net/poster/34...,00:00,2018
29050,348431,6711430,2018-06-15T00:00,S2018E04,59442.0,432.0,13.0,2038.0,10.0,18.0,15.0,World Cup 2018,72488,90,1,FOX,3.96,https://dg31sz3gwrwan.cloudfront.net/poster/34...,00:00,2018
29051,348431,6712693,2018-06-16T00:00,S2018E05,57395.0,1091.0,22.0,183.0,17.0,280.0,99.0,World Cup 2018,72488,90,1,FOX,3.96,https://dg31sz3gwrwan.cloudfront.net/poster/34...,00:00,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52715,282503,4909416,2002-07-25T01:00,S01E17,172.0,6.0,0.0,1.0,0.0,0.0,0.0,FIFA World Cup Official Film,224,90,1,BBC Two,4.02,https://dg31sz3gwrwan.cloudfront.net/poster/28...,01:00,2002
52716,282503,4919629,T01:00,S01E18,174.0,5.0,0.0,2.0,0.0,0.0,0.0,FIFA World Cup Official Film,224,90,1,BBC Two,4.02,https://dg31sz3gwrwan.cloudfront.net/poster/28...,01:00,
52717,282503,4919630,2007-01-23T01:00,S01E18,184.0,5.0,0.0,3.0,0.0,0.0,0.0,FIFA World Cup Official Film,224,90,1,BBC Two,4.02,https://dg31sz3gwrwan.cloudfront.net/poster/28...,01:00,2007
52718,282503,4919633,2010-11-16T01:00,S01E19,183.0,6.0,1.0,2.0,0.0,0.0,0.0,FIFA World Cup Official Film,224,90,1,BBC Two,4.02,https://dg31sz3gwrwan.cloudfront.net/poster/28...,01:00,2010


In [54]:
# Remove entries that are not TV shows
mask1 = mood_filter_df['name'].str.contains('Christmas', case=False, regex=False)
mask2 = mood_filter_df['name'].str.contains('Awards', case=False, regex=False)
mask3 = mood_filter_df['name'].str.contains('WWE', case=False, regex=False)
mask4 = mood_filter_df['name'].str.contains('Presidential', case=False, regex=False)
mask5 = mood_filter_df['name'].str.contains('Thanksgiving', case=False, regex=False)
mask6 = mood_filter_df['name'].str.contains('World cup', case=False, regex=False)
mask7 = mood_filter_df['name'].str.contains('Boxing', case=False, regex=False)
mask8 = mood_filter_df['name'].str.contains('Miss Universe', case=False, regex=False)
# mask9 = mood_filter_df['name'].str.contains('', case=False, regex=False)

In [55]:
unwanted_df = mood_filter_df[mask1 | mask2 | mask3 | mask4 | mask5 | mask7 | mask8]
unwanted_df

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year
4909,354860,6887071,2018-11-04T21:00,S01E01,274.0,16.0,1.0,0.0,0.0,1.0,0.0,Ultimate Thanksgiving Challenge,341,45,2,Food Network,4.09,https://dg31sz3gwrwan.cloudfront.net/poster/35...,21:00,2018
4910,354860,6887073,2018-11-11T21:00,S01E02,263.0,17.0,0.0,2.0,1.0,0.0,0.0,Ultimate Thanksgiving Challenge,341,45,2,Food Network,4.09,https://dg31sz3gwrwan.cloudfront.net/poster/35...,21:00,2018
4911,354860,6887074,2018-11-18T21:00,S01E03,259.0,19.0,0.0,1.0,0.0,0.0,0.0,Ultimate Thanksgiving Challenge,341,45,2,Food Network,4.09,https://dg31sz3gwrwan.cloudfront.net/poster/35...,21:00,2018
5026,337326,6399953,2017-11-06T22:00,S01E01,414.0,17.0,0.0,0.0,0.0,1.0,0.0,Christmas Cookie Challenge,645,45,3,Food Network,2.72,https://dg31sz3gwrwan.cloudfront.net/poster/33...,22:00,2017
5027,337326,6400023,2017-11-13T22:00,S01E02,407.0,17.0,0.0,0.0,0.0,2.0,0.0,Christmas Cookie Challenge,645,45,3,Food Network,2.72,https://dg31sz3gwrwan.cloudfront.net/poster/33...,22:00,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68826,279029,4917272,2014-06-24T22:00,S01E16,169.0,3.0,0.0,1.0,0.0,0.0,0.0,WWE Countdown,435,60,2,WWE Network,4.52,https://dg31sz3gwrwan.cloudfront.net/poster/27...,22:00,2014
68827,279029,4928833,2014-07-01T22:00,S01E17,168.0,2.0,0.0,1.0,0.0,0.0,0.0,WWE Countdown,435,60,2,WWE Network,4.52,https://dg31sz3gwrwan.cloudfront.net/poster/27...,22:00,2014
68828,279029,4928834,2014-07-08T22:00,S01E18,168.0,1.0,0.0,1.0,0.0,1.0,0.0,WWE Countdown,435,60,2,WWE Network,4.52,https://dg31sz3gwrwan.cloudfront.net/poster/27...,22:00,2014
68829,279029,4938292,2014-07-22T22:00,S01E19,167.0,0.0,0.0,2.0,0.0,0.0,0.0,WWE Countdown,435,60,2,WWE Network,4.52,https://dg31sz3gwrwan.cloudfront.net/poster/27...,22:00,2014


In [56]:
us_tv_shows = mood_filter_df.merge(unwanted_df, how="left", indicator=True)
us_tv_shows = us_tv_shows[us_tv_shows['_merge'] == 'left_only']
us_tv_shows

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year,_merge
0,348204,7380549,2019-11-17T21:00,S01E01,2859.0,60.0,0.0,13.0,0.0,5.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,2019,left_only
1,348204,7380550,T21:00,S01E02,1879.0,41.0,0.0,7.0,0.0,6.0,0.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,,left_only
2,348204,7380551,T21:00,S01E03,1082.0,18.0,0.0,1.0,1.0,4.0,1.0,The War of the Worlds,12893,60,1,BBC One,0.00,https://dg31sz3gwrwan.cloudfront.net/poster/34...,21:00,,left_only
3,76332,435387,1999-12-25T19:00,S01E01,184.0,13.0,1.0,4.0,3.0,1.0,0.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999,left_only
4,76332,435388,1999-12-26T19:00,S01E02,173.0,12.0,1.0,2.0,0.0,1.0,1.0,David Copperfield (1999),325,90,1,BBC One,3.72,https://dg31sz3gwrwan.cloudfront.net/poster/76...,19:00,1999,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62918,253412,4211482,T21:00,S02E04,147.0,1.0,0.0,1.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
62919,253412,4211483,T21:00,S02E05,145.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
62920,253412,4211484,T21:00,S02E06,147.0,2.0,0.0,0.0,0.0,0.0,0.0,Gold Rush: Alaska,827,60,1,Discovery Channel,4.76,https://d36rlb2fgh8cjd.cloudfront.net/default-...,21:00,,left_only
62921,248584,4098218,2011-01-27T20:00,S01E01,38.0,3.0,0.0,0.0,0.0,0.0,0.0,Time Paladin Sakura,167,25,1,Discovery Channel,3.00,https://dg31sz3gwrwan.cloudfront.net/poster/24...,20:00,2011,left_only


In [69]:
# Top 15 US Networks
top_us_network = us_tv_shows['network'].value_counts().head(15).to_frame().reset_index().rename(columns={'index':'network', 'network':'count'})
top_us_network

Unnamed: 0,network,count
0,CBS,6834
1,ABC (US),6719
2,NBC,6055
3,Netflix,4860
4,FOX,4000
5,Nickelodeon,2491
6,Cartoon Network,2418
7,BBC One,2342
8,Disney Channel,1893
9,HBO,1560


In [71]:
final_df =  pd.merge(us_tv_shows, us_network, how="inner", left_on="network", right_on='network')
final_df

KeyError: 'network'

In [58]:
abc = mood_filter_df['show_id'].value_counts(ascending=True)
abc = abc[abc > 2]
abc1 = abc.to_frame()

In [65]:
episode_df = pd.merge(mood_filter_df, abc1, how="inner", on="show_id")
episode_df

Unnamed: 0,show_id,episode_id,time,episode,times_watched,mood-good,mood-fun,mood-wow,mood-sad,mood-so-so,mood-bad,name,followers,runtime,number_of_seasons,network,mean_rate,poster_image,timeslot,year,_merge


## Top 15 Data

In [None]:
# Inner Merge on Merged Dataframe and Top 15 American Network Data
top_us_data = pd.merge(merged_df, top_us, how="inner", left_on="network", right_on="USA Networks")
top_us_data

In [None]:
top_us_data['network'].value_counts()

In [None]:
# Dropping columns that are not required
us_df1 = top_us_data.drop(columns=['id', 'USA Networks', 'Count of title', 'description'])
us_df1

In [None]:
del us_df1['time']
del us_df1['rating']
del us_df1['poster_image']
del us_df1['seasons']

In [None]:
us_df1

In [None]:
us_df1['network'].value_counts()

In [None]:
us_df1.to_csv("US_Data/top15networks.csv")

In [None]:
us_df2 = us_df1.loc[(us_df1['runtime'] >= 15) & (us_df1['runtime'] <= 90)]
us_df2

In [None]:
us_df2.isnull().sum(axis=0)

In [None]:
us_df2['network'].value_counts()

In [None]:
# pd.set_option('display.max_rows', None)
us_df2.loc[us_df2['runtime'] == 120].sort_values(by="runtime")

In [None]:
runtime_df.loc[runtime_df['name'].str.contains('Christmas', regex=False)]

In [None]:
mask1 = runtime_df['name'].str.contains('Awards', case=False, regex=False)
mask2 = runtime_df['name'].str.contains('FIFA', case=False, regex=False)
mask3 = runtime_df['name'].str.contains('WWE', case=False, regex=False)
mask4 = runtime_df['name'].str.contains('Late Night', case=False, regex=False)
mask5 = runtime_df['name'].str.contains('Thanksgiving', case=False, regex=False)
# mask6 = runtime_df['name'].str.contains('CMA', case=False, regex=False)
mask7 = runtime_df['name'].str.contains('Boxing', case=False, regex=False)
mask8 = runtime_df['name'].str.contains('Miss Universe', case=False, regex=False)
# mask9 = runtime_df['name'].str.contains('', case=False, regex=False)

In [None]:
xyz = runtime_df[mask1 | mask2 | mask3 | mask4 | mask5 | mask7 | mask8]
xyz

### Supervised Machine Learning - Data Pre Processing

In [None]:
ml_df = us_df1[["show_id","times_watched", 
                       "mood-good", "mood-fun", "mood-wow", "mood-sad", "mood-so-so", "mood-bad", 
                       "name", "followers", "runtime", "network"]]
ml_df

In [None]:
ml_grouped = ml_df.groupby(by=["show_id"]).agg({'times_watched':'mean',"mood-good":'mean', "mood-fun":'mean', 
                                                   "mood-wow":'mean', "mood-sad":'mean', "mood-so-so":'mean', 
                                                   "mood-bad":'mean', "name":'first', "followers":'mean', 
                                                   "runtime":'mean', "network": 'first'})
ml_grouped

In [None]:
X = ml_grouped.drop(columns=["name","network", "times_watched", "followers", "runtime"])
X

In [None]:
X.loc[(X!=0).any(axis=1)]

In [None]:
y = ml_grouped[["network"]]
y

In [None]:
y_label = LabelEncoder().fit_transform(y['network'])
y_label

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, random_state=1)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

In [None]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

In [None]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled

In [None]:
print(X_test_scaled.shape, y_test.shape, X_train_scaled.shape, y_train.shape)

### Logistic Regression

In [None]:
classifier = LogisticRegression(max_iter = 2000)
classifier

In [None]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

In [None]:
# Display Training & Testing Score
print(f"Training Data Score (Logistic Regression Model): {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score (Logistic Regression Model): {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Predict Y values using the model
y_pred_lr = classifier.predict(X_test_scaled)
y_pred_lr

In [None]:
y_pred_lr2 = classifier.predict([[30, 40, 50, 12, 5, 78]])

In [None]:
print(classification_report(y_test, y_pred_lr))

### Random Forests

In [None]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train_scaled, y_train)