In [None]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Reading TV Time CSV Data
tvshows_df = pd.read_csv("Data/tvtimeshows.csv")
tvshows_df.head()

In [None]:
# Reading All Episodes CSV Data
episode_df = pd.read_csv("Data/all_episodes.csv")
episode_df.head()

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Merging both dataframe on Show Id
merged_df = pd.merge(episode_df, tvshows_df, how="inner", left_on="show_id", right_on="id")
merged_df.head()

In [None]:
merged_df["network"].value_counts().head(20)

In [None]:
# Load the American network names and order them by 'Count of title', largest first
us_network = (
    pd.read_csv("Data/network_names.csv")
    .sort_values(by='Count of title', ascending=False, ignore_index=True)
)
us_network.head(n=20)

In [None]:
# Inner-join the merged episode/show data with the American network list
# (network == 'USA Networks'), then drop the listed columns from the result
merged_us_data = merged_df.merge(
    us_network,
    how="inner",
    left_on="network",
    right_on="USA Networks",
).drop(
    columns=[
        'id', 'USA Networks', 'Count of title',
        'description', 'seasons', 'nb_rates',
        'rating', 'title',
    ]
)

merged_us_data

In [None]:
merged_us_data.dtypes

In [None]:
# Converting Object datatype of Time column to String
merged_us_data['time'] = merged_us_data['time'].astype('string')

In [None]:
# Parsing timeslot from the time data using string split function
merged_us_data['timeslot'] = merged_us_data['time'].str.split(pat='T').str[1]

In [None]:
# Parsing year from the time data using string split function
year = merged_us_data['time'].str.split(pat='T').str[0]
merged_us_data['year'] = year.str.split(pat='-').str[0]

In [None]:
merged_us_data

In [None]:
# Keep only rows whose runtime lies in the inclusive 15-90 range (minutes, per the original note)
runtime_df = merged_us_data.loc[merged_us_data['runtime'].between(15, 90)]
runtime_df

In [None]:
runtime_df['mood-good'].value_counts().sort_index(ascending=True)

In [None]:
runtime_df['mood-fun'].value_counts().sort_index(ascending=True)

In [None]:
runtime_df['mood-wow'].value_counts().sort_index(ascending=True)

In [None]:
runtime_df['mood-sad'].value_counts().sort_index(ascending=True)

In [None]:
runtime_df['mood-so-so'].value_counts().sort_index(ascending=True)

In [None]:
runtime_df['mood-bad'].value_counts().sort_index(ascending=True)

In [None]:
# Collect every episode row in which all six mood reaction columns equal zero
mood_df = runtime_df.loc[
    runtime_df[['mood-good', 'mood-fun', 'mood-wow',
                'mood-sad', 'mood-so-so', 'mood-bad']].eq(0).all(axis=1)
]

mood_df

In [None]:
# Anti-join: left-merge runtime_df with mood_df on all shared columns and keep only the
# rows flagged 'left_only', i.e. the rows of runtime_df that have no match in mood_df
mood_filter_df = pd.merge(runtime_df, mood_df, how="left", indicator=True)
mood_filter_df = mood_filter_df.loc[mood_filter_df['_merge'] == 'left_only']
mood_filter_df

In [None]:
# Remove the merge-indicator helper column.
# Rebinding via drop(..., errors='ignore') keeps this cell safe to re-run: a second
# execution is a no-op instead of raising KeyError, and the filtered frame is not
# mutated in place.
mood_filter_df = mood_filter_df.drop(columns=['_merge'], errors='ignore')

In [None]:
mood_filter_df['network'].value_counts().head(20)

In [None]:
mood_filter_df['runtime'].value_counts().sort_index(ascending=True)

In [None]:
mood_filter_df.loc[mood_filter_df['runtime'] == 20]

In [None]:
mood_filter_df.loc[mood_filter_df['name'].str.contains('Christmas', case=False, regex=False)].head(40)

In [None]:
# Boolean masks flagging entries that are not TV shows: one case-insensitive,
# literal (non-regex) substring match on the name column per keyword
mask1, mask2, mask3, mask4, mask5, mask6, mask7, mask8 = (
    mood_filter_df['name'].str.contains(keyword, case=False, regex=False)
    for keyword in (
        'Christmas',
        'Awards',
        'WWE',
        'Presidential',
        'Thanksgiving',
        'World cup',
        'Boxing',
        'Miss Universe',
    )
)

In [None]:
# Collect all non-TV-show entries into one dataframe: a row is unwanted when any of the
# keyword masks matches. mask6 ('World cup') is included here as well, so that every
# mask defined for this purpose actually takes part in the filter.
unwanted_df = mood_filter_df[mask1 | mask2 | mask3 | mask4 | mask5 | mask6 | mask7 | mask8]
unwanted_df

In [None]:
# Anti-join: left-merge mood_filter_df with unwanted_df on all shared columns and keep the
# rows flagged 'left_only', i.e. the rows with no match among the unwanted entries
us_tv_shows = pd.merge(mood_filter_df, unwanted_df, how="left", indicator=True)
us_tv_shows = us_tv_shows.loc[us_tv_shows['_merge'] == 'left_only']
us_tv_shows

In [None]:
# Keep only shows that have more than 3 episode rows (i.e. drop shows with fewer than 4).
# A dedicated name is used for the counts so the raw `episode_df` DataFrame loaded earlier
# is not overwritten, which keeps the earlier merge cell re-runnable.
episodes_per_show = us_tv_shows['show_id'].value_counts(ascending=True)

# rename_axis + reset_index(name=...) names both output columns explicitly, so the result
# does not depend on how the installed pandas version labels value_counts() output.
filtered_episode_df = (
    episodes_per_show[episodes_per_show > 3]
    .rename_axis('show_id')
    .reset_index(name='episode_count')
)
filtered_episode_df

In [None]:
# All US episodes data
us_episodes_df = pd.merge(us_tv_shows, 
                          filtered_episode_df, 
                          how="inner", 
                          on="show_id").drop(columns={'_merge', 'episode_count'})
us_episodes_df

In [None]:
us_episodes_df.to_csv("US_Data/us_episodes.csv")

In [None]:
# Take the 10 most frequent networks and then remove Netflix, which leaves 9 networks
# when Netflix is among those 10.
# rename_axis + reset_index(name=...) names the two columns explicitly, so the result does
# not depend on how the installed pandas version labels value_counts() output.
top_us_networks = (
    us_episodes_df['network']
    .value_counts()
    .head(10)
    .rename_axis('network')
    .reset_index(name='count')
)
top_us_networks = top_us_networks.loc[top_us_networks['network'] != 'Netflix']
top_us_networks

In [None]:
# Final clean Dataframe
final_df =  pd.merge(us_episodes_df, 
                     top_us_networks, 
                     how="inner", 
                     on='network').drop(columns=['count', 'time', 'number_of_seasons'])
final_df

In [None]:
final_df['network'].value_counts()

### Supervised Machine Learning - Data Preprocessing

In [None]:
# Creating Machine Learning dataframe by groupby on Show Ids
ml_df = final_df.groupby(by=["show_id"]).agg({'times_watched':'mean',"mood-good":'mean', "mood-fun":'mean', 
                                              "mood-wow":'mean', "mood-sad":'mean', "mood-so-so":'mean', 
                                              "mood-bad":'mean', "name":'first', "followers":'mean', 
                                              "runtime":'mean', "network": 'first', "mean_rate":'mean', 
                                              "poster_image":'first'})
ml_df

In [None]:
ml_df.to_csv("US_Data/model_data.csv")

In [None]:
# X dataset with all Moods as feature
X = ml_df.drop(columns=["name","network", "times_watched", "followers", "runtime", "mean_rate", "poster_image"])
X

In [None]:
# y dataset with Networks as labels
y = ml_df[["network"]]
y

In [None]:
# Label Encoding
le = LabelEncoder()
y_label = le.fit_transform(y['network'])
y_label

In [None]:
# Unique Label Encoding values
np.unique(y_label)

In [None]:
# Split dataset into Training & Testing Data
X_train, X_test, y_train, y_test = train_test_split(X, y_label, random_state=1)

In [None]:
# Using MinMaxScaler to scale feature values between 1 and 100
scaler = MinMaxScaler(feature_range=(1,100)).fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

In [None]:
# Scaling Test data values
X_test_scaled = scaler.transform(X_test)
X_test_scaled

In [None]:
print(X_test_scaled.shape, y_test.shape, X_train_scaled.shape, y_train.shape)

### Logistic Regression

In [None]:
classifier = LogisticRegression(max_iter = 2000)
classifier

In [None]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

In [None]:
# Display Training & Testing Score
print(f"Training Data Score (Logistic Regression Model): {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score (Logistic Regression Model): {classifier.score(X_test_scaled, y_test)}")

In [None]:
# Predict Y values using the model
y_pred_lr = classifier.predict(X_test_scaled)
y_pred_lr

In [None]:
print(classification_report(y_test, y_pred_lr))

### Random Forest Classifier

In [None]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train_scaled, y_train)

In [None]:
# Display Training & Testing Score
print(f'Training Data Score (Random Forests Classifier): {clf.score(X_train_scaled, y_train)}')
print(f'Testing Data Score (Random Forests Classifier): {clf.score(X_test_scaled, y_test)}')

In [None]:
y_pred_rf = classifier.predict(X_test_scaled)
y_pred_rf

In [None]:
print(classification_report(y_test, y_pred_rf))

In [None]:
# Extra Trees
clf_extra = ExtraTreesClassifier(random_state=1, n_estimators=100).fit(X_train_scaled, y_train)
print(f'Training Data Score (Extra Trees Classifier): {clf_extra.score(X_train_scaled, y_train)}')
print(f'Testing Data Score (Extra Trees Classifier): {clf_extra.score(X_test_scaled, y_test)}')

In [None]:
# Ad a Boost
clf_adaboost = AdaBoostClassifier(random_state=1, n_estimators=100, base_estimator=DecisionTreeClassifier(max_depth=2)).fit(X_train_scaled, y_train)
print(f'Training Data Score (Ad a Boost): {clf_adaboost.score(X_train_scaled, y_train)}')
print(f'Testing Data Score (Ad a Boost): {clf_adaboost.score(X_test_scaled, y_test)}')

### Random Forests (Using Hypertuned Model)

In [None]:
# Base Random Forest estimator for the randomized hyperparameter search.
# A fixed random_state makes the tuned model reproducible across re-runs and matches the
# other estimators in this notebook, which all use random_state=1.
random_model_hp = RandomForestClassifier(random_state=1)
random_model_hp

In [None]:
# Candidate values for n_estimators: 50, 100, ..., 500.
# The range starts at 50 rather than 0 because a forest needs at least one tree;
# n_estimators=0 is not a valid setting, so that candidate could never be fitted.
random_param_grid = {'n_estimators': np.arange(50, 550, 50)}
random_param_grid

In [None]:
model = RandomizedSearchCV(random_model_hp, random_param_grid, random_state=1, verbose=3)
model

In [None]:
# Fit the model by using the randomized search estimator
model.fit(X_train_scaled, y_train)

In [None]:
print(model.best_params_)

In [None]:
print(model.best_score_)

In [None]:
model.score(X_test_scaled, y_test)

In [None]:
y_pred_rf_hp = model.predict(X_test_scaled)
y_pred_rf_hp

### Extra Trees (Using Hypertuned Model)

In [None]:
# Base Extra Trees estimator for the randomized hyperparameter search.
# A fixed random_state makes the tuned model reproducible across re-runs and matches the
# other estimators in this notebook, which all use random_state=1.
extra_model_hp = ExtraTreesClassifier(random_state=1)
extra_model_hp

In [None]:
# Candidate values for n_estimators: 50, 100, ..., 500.
# The range starts at 50 rather than 0 because an ensemble needs at least one tree;
# n_estimators=0 is not a valid setting, so that candidate could never be fitted.
extra_param_grid = {'n_estimators': np.arange(50, 550, 50)}
extra_param_grid

In [None]:
model_et = RandomizedSearchCV(extra_model_hp, extra_param_grid, random_state=1, verbose=3)
model_et

In [None]:
model_et.fit(X_train_scaled, y_train)

In [None]:
print(model_et.best_params_)

In [None]:
print(model_et.best_score_)

In [None]:
model_et.score(X_test_scaled, y_test)

In [None]:
y_pred_et_hp = model_et.predict(X_test_scaled)
y_pred_et_hp

### KNN

In [None]:
# Fit a KNN classifier for every odd k from 1 to 29 and record its train/test scores,
# to see which k gives the highest test score.
# Note: only odd k values are tried, to reduce the chance of tied votes.
k_values = range(1, 30, 2)  # defined once so the loop and both curves share the same x-axis
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")


# Both the training and the testing curve are drawn, so each gets a legend entry and the
# y-axis carries a neutral title instead of naming only the testing score.
plt.plot(k_values, train_scores, marker='o', label="Training score")
plt.plot(k_values, test_scores, marker="x", label="Testing score")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_scaled, y_train)
print('k=15 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

In [None]:
knn.score(X_train_scaled, y_train)

In [None]:
# Predicting show using KNN and user input
user_entry = [[8, 45, 35, 30, 60, 9]]
predicted_network_knn = knn.predict(user_entry)
print(le.inverse_transform(predicted_network_knn))

In [None]:
# Predicting show using Random Forest Hypertuned Model
user_entry = [[8, 45, 35, 30, 60, 9]]
predicted_network = model.predict(user_entry)
print(le.inverse_transform(predicted_network))

In [None]:
# Predict the network for a hand-entered row of six feature values using the tuned
# Extra Trees search model, and decode this model's own prediction
# (`predicted_network_et`) back to a network name.
# NOTE(review): the model was fit on MinMax-scaled features, while this entry is passed
# as-is — presumably it is already expressed on the scaled 1-100 range; confirm.
user_entry = [[8, 45, 35, 30, 60, 9]]
predicted_network_et = model_et.predict(user_entry)
print(le.inverse_transform(predicted_network_et))

In [None]:
# Persist the tuned Extra Trees search object as the network predictor model
# NOTE(review): the file is written by joblib.dump even though it carries an ".h5"
# extension — confirm that consumers load it with joblib.load.
import joblib
joblib.dump(value=model_et, filename="network_predictor.h5")