In [157]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

### Read input and perform label encode for director name, actor name, movie name

In [185]:
train_df = pd.read_csv('project_data/train_dataset.csv')
X_test = pd.read_csv('project_data/test_dataset.csv')

# use the whole dataset to fit to the label_encoder 
# make sure there no unseen data in the test set 
df = pd.concat([train_df, X_test])
label_encoder = LabelEncoder()
features = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'movie_title']
for feature in features:
    label_encoder.fit(df[feature])
    train_df[feature] = label_encoder.transform(train_df[feature])
    X_test[feature] = label_encoder.transform(X_test[feature])

train_df.head()

Unnamed: 0,id,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,title_year,actor_2_facebook_likes,movie_facebook_likes,title_embedding,average_degree_centrality,imdb_score_binned
0,1,1356,186,73,28,847,1533,2000,422783777,Adventure|Animation|Drama|Family|Musical,...,656,English,USA,G,1994,886,17000,[-4.1984697e-03 4.2941985e-03 -1.1961063e-03 ...,0.001576,4
1,2,511,252,97,0,233,1891,654,20433940,Horror,...,662,English,Canada,R,2005,529,0,[-4.7586653e-03 2.6511205e-03 -3.7954253e-04 ...,0.000675,2
2,3,336,232,117,234,221,2078,12000,371897,Drama,...,118,English,USA,R,2013,1000,11000,[ 2.78131524e-03 -3.15494463e-03 -6.38332494e-...,0.003002,2
3,4,739,297,109,0,145,1731,957,13782838,Horror|Mystery|Sci-Fi,...,911,English,USA,R,1982,163,23000,[-5.32674184e-03 3.60742491e-03 7.91795843e-...,0.001726,4
4,5,1229,297,171,0,857,1592,16000,313837577,Action|Adventure|Drama|Fantasy,...,5060,English,New Zealand,PG-13,2001,5000,21000,[-4.2586620e-03 3.6257182e-03 -1.5326265e-03 ...,0.001876,4


### Perform one hot encoding for genres, content rating, language, and country 

In [186]:
train_df = pd.get_dummies(train_df, columns=['content_rating', 'language', 'country'])
train_df = train_df.join(train_df['genres'].str.get_dummies(sep="|")).drop('genres', axis=1)
X_test = pd.get_dummies(X_test, columns=['content_rating', 'language', 'country'])
X_test = X_test.join(X_test['genres'].str.get_dummies(sep="|")).drop('genres', axis=1)
train_df.head()

Unnamed: 0,id,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,1356,186,73,28,847,1533,2000,422783777,917,...,0,0,1,0,0,0,0,0,0,0
1,2,511,252,97,0,233,1891,654,20433940,1352,...,1,0,0,0,0,0,0,0,0,0
2,3,336,232,117,234,221,2078,12000,371897,1021,...,0,0,0,0,0,0,0,0,0,0
3,4,739,297,109,0,145,1731,957,13782838,1395,...,1,0,0,1,0,1,0,0,0,0
4,5,1229,297,171,0,857,1592,16000,313837577,260,...,0,0,0,0,0,0,0,0,0,0


### Perform Count Vectorizer for plot keywords

In [188]:
vectorizer = CountVectorizer()
train_count_matrix = vectorizer.fit_transform(train_df['plot_keywords'])
train_count_df = pd.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())
test_count_matrix = vectorizer.fit_transform(X_test['plot_keywords'])
test_count_df = pd.DataFrame(test_count_matrix.toarray(), columns=vectorizer.get_feature_names_out())
train_count_df.head()

Unnamed: 0,007,10,1000000,11,1190s,12,13,130,13th,14,...,zeppelin,zero,zeus,zodiac,zoloft,zombie,zone,zoo,zoologist,zorro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Use 5-fold cross validation for find the suitable n_components for PCA

In [189]:
n_coms = [2, 5, 10, 20, 50, 100]
cv_score = []
for n in n_coms: 
    pca = PCA(n_components=n)
    count_matrix_reduced = pca.fit_transform(count_matrix.toarray())
    
    # add prefix to every column name 
    count_df_reduced = pd.DataFrame(count_matrix_reduced, 
                                    columns=[f'plot_{i}' for i in range(count_matrix_reduced.shape[1])])
    
    # test the effect of vectorized plot keywords on training dataset using random forest 
    
    df_plot = pd.concat([train_df, count_df_reduced], axis=1)
    df_plot.drop(columns=['plot_keywords'], inplace=True)
    X = df_plot.drop(columns=['imdb_score_binned']).select_dtypes(['number'])
    y = train_df['imdb_score_binned']

    rf = RandomForestClassifier(n_estimators=200, random_state=10)
    # perform k-fold cross validation to find the suitable n
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    scores_with_keywords = cross_val_score(rf, X, y, cv=cv, scoring='accuracy')
    cv_score.append(scores_with_keywords)
cv_score

[array([0.72212978, 0.71214642, 0.74708819, 0.72212978, 0.71833333]),
 array([0.72878536, 0.73377704, 0.75207987, 0.72545757, 0.71333333]),
 array([0.71381032, 0.70881864, 0.75041597, 0.73044925, 0.70666667]),
 array([0.71214642, 0.71381032, 0.7437604 , 0.72046589, 0.70166667]),
 array([0.69883527, 0.6921797 , 0.72212978, 0.68552413, 0.68      ]),
 array([0.67886855, 0.68219634, 0.70382696, 0.69051581, 0.66833333])]

Apply PCA to 'plot_keywords'

In [190]:
pca = PCA(n_components=2)
train_count_matrix_reduced = pca.fit_transform(train_count_matrix.toarray())

# add prefix to every column name for train_df
train_count_df_reduced = pd.DataFrame(train_count_matrix_reduced, 
                                columns=[f'plot_{i}' for i in range(train_count_matrix_reduced.shape[1])])
train_df = pd.concat([train_df, train_count_df_reduced], axis=1)
test_count_matrix_reduced = pca.fit_transform(test_count_matrix.toarray())

# add prefix to every column name for X_test
test_count_df_reduced = pd.DataFrame(test_count_matrix_reduced, 
                                columns=[f'plot_{i}' for i in range(test_count_matrix_reduced.shape[1])])
X_test = pd.concat([X_test, test_count_df_reduced], axis=1)

In [191]:
# match columns between test and train
X_test = X_test.reindex(columns=train_df.drop(['imdb_score_binned'], axis=1).columns, fill_value=0)

In [192]:
def evaluate(clf, X, y, fold):
    kf = StratifiedKFold(n_splits=fold)
    
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    
    # train model using k-fold validation and compute accuracy, precision, recall, f1
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=None, zero_division=0)
        recall = recall_score(y_test, y_pred, average=None, zero_division=0)
        f1 = f1_score(y_test, y_pred, average=None, zero_division=0)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
    mean_accuracy = np.mean(accuracy_scores)
    mean_precision = np.mean(precision_scores, axis=0)
    mean_recall = np.mean(recall_scores, axis=0)
    mean_f1 = np.mean(f1_scores, axis=0)
    
    # store data to dataframe
    scores_df = pd.DataFrame({
        'Class': [0,1,2,3,4],
        'Precision': mean_precision,
        'Recall': mean_recall,
        'F1 Score': mean_f1
    })
    
    return (mean_accuracy, scores_df)

In [193]:
rf = RandomForestClassifier(n_estimators=50, random_state=10)

X = train_df.drop(columns='imdb_score_binned').select_dtypes(['number'])
y = train_df['imdb_score_binned']

evaluate(rf, X, y, 10)

(0.7260332225913622,
    Class  Precision    Recall  F1 Score
 0      0   0.000000  0.000000  0.000000
 1      1   0.116667  0.012500  0.022507
 2      2   0.731990  0.943457  0.824273
 3      3   0.692459  0.505678  0.583213
 4      4   0.919560  0.387179  0.523979)

In [194]:
base_clf = DecisionTreeClassifier()
bd = BaggingClassifier(estimator=base_clf, n_estimators=100, random_state=10)
evaluate(bd, X, y, 10)

(0.7223632336655592,
    Class  Precision    Recall  F1 Score
 0      0   0.000000  0.000000  0.000000
 1      1   0.520660  0.197826  0.224094
 2      2   0.757448  0.858105  0.800804
 3      3   0.672250  0.600932  0.630002
 4      4   0.861731  0.603846  0.701676)

In [195]:
rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=rf_grid, cv=10)
grid_search.fit(X, y)
grid_search.best_params_

{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 200}

In [196]:
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_leaf=1, min_samples_split=5, random_state=10)
evaluate(rf, X, y, 10)

(0.7293565891472868,
    Class  Precision    Recall  F1 Score
 0      0   0.000000  0.000000  0.000000
 1      1   0.200000  0.008333  0.016000
 2      2   0.735558  0.947262  0.828013
 3      3   0.689806  0.510889  0.586170
 4      4   0.904242  0.385897  0.515834)

In [197]:
base_clf = DecisionTreeClassifier(min_samples_split=5)
bd = BaggingClassifier(estimator=base_clf, n_estimators=200, random_state=10)
evaluate(bd, X, y, 10)

(0.7273455149501662,
    Class  Precision    Recall  F1 Score
 0      0   0.000000  0.000000  0.000000
 1      1   0.538788  0.176812  0.222630
 2      2   0.760080  0.868434  0.807581
 3      3   0.668581  0.607393  0.632190
 4      4   0.863889  0.573077  0.677540)

### Synthetic Minority Over-sampling Technique

In [198]:
smote = SMOTE(random_state=10)
X_resampled, y_resampled = smote.fit_resample(X, y)
evaluate(rf, X_resampled, y_resampled, fold=3)

(0.8525285481239805,
    Class  Precision    Recall  F1 Score
 0      0   0.990867  0.999456  0.995134
 1      1   0.839962  0.937466  0.885041
 2      2   0.795016  0.543774  0.606164
 3      3   0.728489  0.795541  0.752380
 4      4   0.960484  0.986406  0.973192)

In [199]:
evaluate(bd, X_resampled, y_resampled, fold=3)

(0.7720500271886895,
    Class  Precision    Recall  F1 Score
 0      0   0.966422  0.993475  0.979664
 1      1   0.746974  0.903752  0.817656
 2      2   0.449995  0.202284  0.269772
 3      3   0.591150  0.785209  0.672609
 4      4   0.963097  0.975530  0.969203)

In [200]:
rf.fit(X, y)
rf.predict(X_test.select_dtypes(['number']))

array([2, 2, 2, 3, 2, 3, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 2, 4, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3, 3,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 3, 2, 3, 2, 2, 3, 2, 3, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2,
       2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 2,
       3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3,

In [201]:
bd.fit(X, y)
bd.predict(X_test.select_dtypes(['number']))

array([2, 2, 2, 3, 2, 3, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, 3, 2, 3, 2, 2, 4, 2, 2, 2,
       2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 2, 1, 2, 2, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 3, 3, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3,
       2, 2, 3, 2, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3, 4,
       2, 3, 2, 2, 2, 2, 2, 4, 2, 4, 2, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 4, 2, 2, 2, 4, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 3, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       3, 2, 3, 1, 2, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2,
       2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 3, 3, 1, 3, 2, 3, 2, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3,