###### ### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2024 Semester 1

## Assignment 2 IMDB Movie Rating Prediction


## 1. Data loading

In [1]:
## python3.12.0 
##run on HPC, 24 CPUs; GPU NVIDIA A30
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [2]:
## load the data and split them into attributes and labels 
data_train = pd.read_csv('project_data/train_dataset.csv')
data_test = pd.read_csv('project_data/test_dataset.csv')
X_train = data_train.iloc[:,1:-1] ## the id column is removed 
y_train = data_train.iloc[:,-1]
X_test = data_test.iloc[:,1:] ## the id column is removed
print(np.unique(y_train, return_counts=True))

(array([0, 1, 2, 3, 4]), array([  24,  235, 1839,  777,  129]))


In [3]:
## load the preprocessed text feature
train_title_embedding = np.load('project_data/features_fasttext/train_fasttext_title_embeddings.npy')
test_title_embedding = np.load('project_data/features_fasttext/test_fasttext_title_embeddings.npy')

train_genres = np.load('project_data/features_doc2vec/train_doc2vec_features_genre.npy')
test_genres = np.load('project_data/features_doc2vec/test_doc2vec_features_genre.npy')

train_plot_keywords = np.load('project_data/features_doc2vec/train_doc2vec_features_plot_keywords.npy')
test_plot_keywords = np.load('project_data/features_doc2vec/test_doc2vec_features_plot_keywords.npy')

train_director_name = np.load('project_data/features_countvec/train_countvec_features_director_name.npy')
test_director_name = np.load('project_data/features_countvec/test_countvec_features_director_name.npy')

train_actor_1_name = np.load('project_data/features_countvec/train_countvec_features_actor_1_name.npy')
test_actor_1_name = np.load('project_data/features_countvec/test_countvec_features_actor_1_name.npy')

train_actor_2_name = np.load('project_data/features_countvec/train_countvec_features_actor_2_name.npy')
test_actor_2_name = np.load('project_data/features_countvec/test_countvec_features_actor_2_name.npy')

## 2. Preprocessing text features

In [4]:
## the proprecessed text feature actor_3_name is not provided. 
## Use CountVectorizer, the same method applied to the other actor names to convert them to numeric 
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
actor_3_name = vectorizer.fit_transform(pd.concat([X_train['actor_3_name'],X_test['actor_3_name']],ignore_index=True)).toarray()

train_actor_3_name = actor_3_name[0:(len(X_train['actor_3_name']))]
test_actor_3_name = actor_3_name[(len(X_train['actor_3_name'])):]

actor_3_name

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
## the proprecessed text feature language, country, and content_rating are not provided
## As they are categorial features, one hot encoding is used
def one_hot_encode(text):
    vector = np.array([1 if (word == text) else 0 for word in vocabulary])
    return vector

In [6]:
vocabulary = set(pd.concat([X_train['language'],X_test['language']],ignore_index=True))
train_language = X_train['language'].apply(one_hot_encode)
test_language = X_test['language'].apply(one_hot_encode)

train_language = np.vstack(train_language).reshape(-1, len(train_language[0]))
test_language = np.vstack(test_language).reshape(-1, len(test_language[0]))

train_language

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
vocabulary = set(pd.concat([X_train['country'],X_test['country']],ignore_index=True))
train_country = X_train['country'].apply(one_hot_encode)
test_country = X_test['country'].apply(one_hot_encode)

train_country = np.vstack(train_country).reshape(-1, len(train_country[0]))
test_country = np.vstack(test_country).reshape(-1, len(test_country[0]))

train_country

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
vocabulary = set(pd.concat([X_train['content_rating'],X_test['content_rating']],ignore_index=True))
train_content_rating = X_train['content_rating'].apply(one_hot_encode)
test_content_rating = X_test['content_rating'].apply(one_hot_encode)

train_content_rating = np.vstack(train_content_rating).reshape(-1, len(train_content_rating[0]))
test_content_rating = np.vstack(test_content_rating).reshape(-1, len(test_content_rating[0]))

train_content_rating

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

## 3. Round 1, only including numeric data without standardization

In [9]:
## there are two kinds of features, numeric features and text features 
numeric_features = ['num_critic_for_reviews','duration','director_facebook_likes','actor_3_facebook_likes','actor_1_facebook_likes','gross','num_voted_users','cast_total_facebook_likes','facenumber_in_poster','num_user_for_reviews','title_year','actor_2_facebook_likes','movie_facebook_likes','average_degree_centrality']
text_features = ['title_embedding','genres','plot_keywords','director_name','actor_1_name','actor_2_name','actor_3_name','language','country','content_rating']

In [10]:
### the movie title_embedding is included, dropping the movie title column 
X_train.drop(columns=['movie_title'], inplace=True)
X_test.drop(columns=['movie_title'], inplace=True)
print(X_train.columns)
print(X_test.columns)

Index(['director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'num_user_for_reviews',
       'language', 'country', 'content_rating', 'title_year',
       'actor_2_facebook_likes', 'movie_facebook_likes', 'title_embedding',
       'average_degree_centrality'],
      dtype='object')
Index(['director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'num_user_for_reviews',
       'language', 'country', 'content_rating', 'title_year',
       'actor_2_facebook_likes', 'movie_fac

In [11]:
### get backup for training and test sets
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

In [12]:
### since the preprocessed text features are of high dimensions, consider the numeric features only first 
X_train = X_train[numeric_features]
X_test = X_test[numeric_features]

In [13]:
X_train

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,title_year,actor_2_facebook_likes,movie_facebook_likes,average_degree_centrality
0,186,73,28,847,2000,422783777,644348,6458,0,656,1994,886,17000,0.001576
1,252,97,0,233,654,20433940,78883,1876,8,662,2005,529,0,0.000675
2,232,117,234,221,12000,371897,36494,13607,2,118,2013,1000,11000,0.003002
3,297,109,0,145,957,13782838,258078,1757,0,911,1982,163,23000,0.001726
4,297,171,0,857,16000,313837577,1238746,22342,2,5060,2001,5000,21000,0.001876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999,161,129,42,49,97,93952276,132048,318,7,203,2009,50,12000,0.000750
3000,393,123,2000,471,26000,26903709,312629,37206,0,475,2013,10000,83000,0.003302
3001,216,118,473,963,18000,73343413,217480,22517,0,429,2009,1000,21000,0.003302
3002,109,95,0,0,227,1060591,9750,231,0,66,2007,4,0,0.000300


In [14]:
X_test

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,title_year,actor_2_facebook_likes,movie_facebook_likes,average_degree_centrality
0,27,118,14,400,2000,2246000,2302,3384,4,20,2015,769,0,0.000375
1,339,141,0,404,749,47307550,104301,1948,4,269,2012,463,28000,0.002176
2,78,95,89,388,963,37606,31836,2658,0,90,2009,654,0,0.000900
3,226,117,0,818,15000,104054514,200359,16828,0,1009,2002,1000,0,0.003452
4,97,104,38,690,801,3447339,29517,2667,7,79,2013,727,0,0.000450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,179,93,0,766,13000,17096053,134458,15716,2,640,1998,933,5000,0.002777
748,393,105,335,911,3000,37516013,128629,8281,0,348,2012,3000,98000,0.001801
749,55,117,133,249,687,20966644,29610,1665,0,94,1985,443,0,0.001126
750,85,72,0,384,3000,47887943,11634,4480,0,58,2003,455,227,0.000825


In [15]:
### train a 0-R model first to set the based line 
from sklearn.dummy import DummyClassifier
R_0 = DummyClassifier(strategy='most_frequent')
R_0.fit(X_train,y_train)
y_pred_R_0 = R_0.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_R_0}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_OR.csv', index=False)

In [16]:
## using GNB to predict 
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score


gnb = GaussianNB()
## perform cross validation
cross_val_predict_gnb = cross_val_predict(gnb, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_gnb) == y_train.values)/len(y_train.values)) 

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_gnb}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_GNB.csv', index=False)

cross-val acc: 0.29527296937416775


In [17]:
## using KNN5
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier()

cross_val_predict_KNN = cross_val_predict(KNN, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_KNN) == y_train.values)/len(y_train.values)) 

KNN.fit(X_train, y_train)
y_pred_KNN = KNN.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_KNN}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_KNN5.csv', index=False)

cross-val acc: 0.5818908122503329


In [18]:
## using random forest 
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=888,n_jobs=-1)

cross_val_predict_rf = cross_val_predict(rf_regressor, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_rf) == y_train.values)/len(y_train.values)) 
rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)
y_pred_rf = np.round(y_pred_rf) ## avoid some predictions not being integer 
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_rf}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_rf.csv', index=False)

cross-val acc: 0.7213715046604527


In [19]:
df_importance = pd.DataFrame({'features': X_train[numeric_features].columns,
        'importance': rf_regressor.feature_importances_})
df_importance.sort_values(by='importance', ascending=False, inplace=True)
df_importance

Unnamed: 0,features,importance
6,num_voted_users,0.358911
5,gross,0.10124
1,duration,0.079245
10,title_year,0.065952
9,num_user_for_reviews,0.062958
3,actor_3_facebook_likes,0.055618
0,num_critic_for_reviews,0.04451
2,director_facebook_likes,0.042663
7,cast_total_facebook_likes,0.042459
11,actor_2_facebook_likes,0.037495


In [20]:
## using Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Concatenate, Input, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical

2024-05-15 18:50:00.014421: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 18:50:00.080336: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
### cross validation for Neuron network 

from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=666)

accuracy = []
# Iterate over K folds
for train_index, test_index in kf.split(X_train):
    # Split data into training and test sets
    #all_fold_indices.append(test_index) 
    X_train_temp, X_test_temp = X_train.values[train_index], X_train.values[test_index]
    y_train_temp, y_test_temp = y_train.values[train_index], y_train.values[test_index]
    y_train_categorical_temp = to_categorical(y_train_temp)
    
    ## set the model
    
    np.random.seed(888)
    tf.random.set_seed(888)
    random.seed(888)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],))) # input layer
    model.add(Dense(512, activation='relu')) # hiden layer with 512 neurons
    model.add(Dropout(0.2)) # drop out some neurons avoiding overfitting
    model.add(Dense(1, activation='relu')) # pseudo regression-like output layer
    model.add(Dense(5, activation='softmax'))  # output layer with 5 neurons for 5 classes using softmax activation
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    
    # Train the model
    model.fit(X_train_temp, y_train_categorical_temp, epochs=50, verbose=0)

    # Generate predictions on the "test set"
    predictions = np.argmax(model.predict(X_test_temp, verbose=0),axis=1)
    
    # calculate the accuracy
    accuracy = accuracy + [sum(predictions == y_test_temp)/len(y_test_temp)]
    
print('cross-val acc:',np.mean(accuracy)) 

2024-05-15 18:50:04.578061: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22467 MB memory:  -> device: 0, name: NVIDIA A30, pci bus id: 0000:65:00.0, compute capability: 8.0
I0000 00:00:1715763005.870627   60820 service.cc:145] XLA service 0x2b3574014230 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1715763005.870672   60820 service.cc:153]   StreamExecutor device (0): NVIDIA A30, Compute Capability 8.0
2024-05-15 18:50:05.899405: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-15 18:50:06.022717: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1715763007.464618   60820 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


cross-val acc: 0.6121661129568107


In [22]:
y_train_categorical = to_categorical(y_train.values)
np.random.seed(888)
tf.random.set_seed(888)
random.seed(888)
model = Sequential()
model.add(Input(shape=(X_train.values.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='relu'))
model.add(Dense(5, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train.values, y_train_categorical, epochs=50)
#model_nn = model
y_pred_nn = model.predict(X_test.values)
y_pred_nn = np.argmax(y_pred_nn, axis=1)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_nn}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_NN.csv', index=False)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.6067 - loss: 5804.0991
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 906us/step - accuracy: 0.6236 - loss: 1.5093
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.6236 - loss: 1.4399
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step - accuracy: 0.6236 - loss: 1.3792
Epoch 5/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step - accuracy: 0.6236 - loss: 1.3262
Epoch 6/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step - accuracy: 0.6236 - loss: 1.2800
Epoch 7/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 844us/step - accuracy: 0.6236 - loss: 1.2401
Epoch 8/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 831us/step - accuracy: 0.6236 - loss: 1.2059
Epoch 9/50
[1m94/94[0m [32m━━━━━━━━

## 4. Round 2, only including numeric data with standardization

In [23]:
## standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
X_train

array([[ 0.16819334, -1.69245592, -0.25111333, ..., -0.23801553,
         0.40441232, -0.13749659],
       [ 0.71259343, -0.59691369, -0.26047711, ..., -0.31478479,
        -0.45198568, -0.8986283 ],
       [ 0.5476237 ,  0.31603816, -0.18222263, ..., -0.21350097,
         0.10215421,  1.06762748],
       ...,
       [ 0.41564792,  0.36168575, -0.10229605, ..., -0.21350097,
         0.60591774,  1.32133777],
       [-0.46694011, -0.68820888, -0.26047711, ..., -0.42768078,
        -0.45198568, -1.21576594],
       [-1.02783718,  0.22474297, -0.12704319, ..., -0.40940238,
         0.25328326, -1.02548302]])

In [25]:
X_test

array([[-1.14331598,  0.36168575, -0.25579522, ..., -0.2631752 ,
        -0.45198568, -1.15233858],
       [ 1.43021174,  1.41158038, -0.26047711, ..., -0.32897743,
         0.95855221,  0.36992398],
       [-0.72264318, -0.68820888, -0.23071365, ..., -0.2879048 ,
        -0.45198568, -0.70834537],
       ...,
       [-0.91235837,  0.31603816, -0.21599914, ..., -0.33327823,
        -0.45198568, -0.51806244],
       [-0.66490378, -1.73810351, -0.26047711, ..., -0.33069775,
        -0.44055025, -0.77177273],
       [-1.07732809, -1.05338962, -0.25913943, ..., -0.21350097,
        -0.30085662,  0.56020691]])

In [26]:
## using GNB to predict 
gnb = GaussianNB()

cross_val_predict_gnb = cross_val_predict(gnb, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_gnb) == y_train.values)/len(y_train.values)) 

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_gnb}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_GNB_scale.csv', index=False)

cross-val acc: 0.2789613848202397


In [27]:
## using KNN5
KNN = KNeighborsClassifier()

cross_val_predict_KNN = cross_val_predict(KNN, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_KNN) == y_train.values)/len(y_train.values)) 

KNN.fit(X_train, y_train)
y_pred_KNN = KNN.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_KNN}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_KNN5_scale.csv', index=False)

cross-val acc: 0.6264980026631158


In [28]:
## using random forest 
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=888,n_jobs=-1)

cross_val_predict_rf = cross_val_predict(rf_regressor, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_rf) == y_train.values)/len(y_train.values)) 

rf_regressor.fit(X_train, y_train)
y_pred_rf = rf_regressor.predict(X_test)
y_pred_rf = np.round(y_pred_rf) ## avoid some predictions not being integer 
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_rf}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_rf_scale.csv', index=False)

cross-val acc: 0.7217043941411452


In [29]:
df_importance = pd.DataFrame({'features': X_train_copy[numeric_features].columns,
        'importance': rf_regressor.feature_importances_})
df_importance.sort_values(by='importance', ascending=False, inplace=True)
df_importance

Unnamed: 0,features,importance
6,num_voted_users,0.358911
5,gross,0.10124
1,duration,0.079245
10,title_year,0.065952
9,num_user_for_reviews,0.062958
3,actor_3_facebook_likes,0.055618
0,num_critic_for_reviews,0.04451
2,director_facebook_likes,0.042663
7,cast_total_facebook_likes,0.042459
11,actor_2_facebook_likes,0.037495


In [30]:
### cross validation

kf = KFold(n_splits=10, shuffle=True, random_state=666)

accuracy = []
for train_index, test_index in kf.split(X_train):
    
    ## split the training set 
    
    X_train_temp, X_test_temp = X_train[train_index], X_train[test_index]
    y_train_temp, y_test_temp = y_train[train_index], y_train[test_index]
    y_train_categorical_temp = to_categorical(y_train_temp)
    
    ## set the model
    
    np.random.seed(888)
    tf.random.set_seed(888)
    random.seed(888)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],))) # input layer
    model.add(Dense(512, activation='relu')) # hiden layer with 512 neurons
    model.add(Dropout(0.2)) # drop out some neurons avoiding overfitting
    model.add(Dense(1, activation='relu')) # pseudo regression-like output layer
    model.add(Dense(5, activation='softmax'))  # output layer with 5 neurons for 5 classes using softmax activation
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    
    # Train the model
    model.fit(X_train_temp, y_train_categorical_temp, epochs=50, verbose=0)

    # Generate predictions on the "test set"
    predictions = np.argmax(model.predict(X_test_temp, verbose=0),axis=1)
    
    # calculate the accuracy
    accuracy = accuracy + [sum(predictions == y_test_temp)/len(y_test_temp)]
    
print('cross-val acc:',np.mean(accuracy)) 

cross-val acc: 0.6937264673311184


In [31]:
y_train_categorical = to_categorical(y_train)
np.random.seed(888)
tf.random.set_seed(888)
random.seed(888)
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='relu'))
model.add(Dense(5, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_categorical, epochs=50)
y_pred_nn = model.predict(X_test)
y_pred_nn = np.argmax(y_pred_nn, axis=1)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_nn}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_NN_scale.csv', index=False)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6238 - loss: 1.3931
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 843us/step - accuracy: 0.6582 - loss: 1.1573
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step - accuracy: 0.6631 - loss: 1.0703
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 836us/step - accuracy: 0.6709 - loss: 1.0160
Epoch 5/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 822us/step - accuracy: 0.6760 - loss: 0.9754
Epoch 6/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 841us/step - accuracy: 0.6750 - loss: 0.9390
Epoch 7/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 842us/step - accuracy: 0.6845 - loss: 0.9221
Epoch 8/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - accuracy: 0.6833 - loss: 0.9032
Epoch 9/50
[1m94/94[0m [32m━━━━━━━━━━━

In [32]:
X_train = X_train_copy.copy()
X_test = X_test_copy.copy()

## 5. Round 3, including text features and with standardization

In [33]:
from sklearn.decomposition import PCA

In [34]:
### introduce PCA component to training set
nrow_train=X_train.shape[0]
for i in text_features:
    print(i)
    df_temp = pd.concat([pd.DataFrame(globals()['train_'+i]),pd.DataFrame(globals()['test_'+i])], axis=0, ignore_index=True)
    pca = PCA(n_components=2)  
    pca_temp = pca.fit_transform(df_temp)
    df_pca = pd.DataFrame(pca_temp)
    X_train.drop(columns=[i], inplace=True)
    X_train[i+'_PC1'] = df_pca.iloc[0:nrow_train,0]
    X_train[i+'_PC2'] = df_pca.iloc[0:nrow_train,1]
    print('training set',' PC1:','min:',np.min(X_train[i+'_PC1']),'max',np.max(X_train[i+'_PC1']))
    print('training set',' PC2:','min:',np.min(X_train[i+'_PC2']),'max',np.max(X_train[i+'_PC2']))
    X_test.drop(columns=[i], inplace=True)
    X_test[i+'_PC1'] = df_pca.iloc[nrow_train:,0].values
    X_test[i+'_PC2'] = df_pca.iloc[nrow_train:,1].values
    print('test set',' PC1:','min:',np.min(X_test[i+'_PC1']),'max',np.max(X_test[i+'_PC1']))
    print('test set',' PC2:','min:',np.min(X_test[i+'_PC2']),'max',np.max(X_test[i+'_PC2']))

title_embedding
training set  PC1: min: -0.019224618 max 0.034412663
training set  PC2: min: -0.0094109345 max 0.009616705
test set  PC1: min: -0.017691333 max 0.026750533
test set  PC2: min: -0.008997158 max 0.008628372
genres
training set  PC1: min: -0.018505985 max 0.019414203
training set  PC2: min: -0.016693272 max 0.017568637
test set  PC1: min: -0.018449714 max 0.019391654
test set  PC2: min: -0.016694069 max 0.017549878
plot_keywords
training set  PC1: min: -0.010268084 max 0.011913228
training set  PC2: min: -0.011101094 max 0.012587468
test set  PC1: min: -0.009904355 max 0.010138965
test set  PC2: min: -0.011012451 max 0.011121323
director_name
training set  PC1: min: -0.16650595824865771 max 1.033929846009026
training set  PC2: min: -0.21443343010341392 max 1.0445152899239785
test set  PC1: min: -0.1665059582486577 max 1.033929846009026
test set  PC2: min: -0.1804064436199437 max 1.0445152899239785
actor_1_name
training set  PC1: min: -0.07722631356924756 max 1.579161203577

In [35]:
X_train

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,actor_2_name_PC1,actor_2_name_PC2,actor_3_name_PC1,actor_3_name_PC2,language_PC1,language_PC2,country_PC1,country_PC2,content_rating_PC1,content_rating_PC2
0,186,73,28,847,2000,422783777,644348,6458,0,656,...,-0.018911,-0.009801,-0.016433,-0.004900,-0.043992,0.000986,-0.224117,-0.022061,0.097988,0.245086
1,252,97,0,233,654,20433940,78883,1876,8,662,...,-0.019053,-0.011324,-0.017776,-0.006196,-0.043992,0.000986,0.704948,0.473141,-0.647047,-0.158164
2,232,117,234,221,12000,371897,36494,13607,2,118,...,-0.017017,-0.008776,-0.016929,-0.005098,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
3,297,109,0,145,957,13782838,258078,1757,0,911,...,-0.021173,-0.012834,-0.052814,0.900699,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
4,297,171,0,857,16000,313837577,1238746,22342,2,5060,...,-0.018607,-0.010668,-0.020062,-0.006440,-0.043992,0.000986,0.660089,0.345869,0.763211,-0.263544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999,161,129,42,49,97,93952276,132048,318,7,203,...,-0.015630,-0.007890,-0.016433,-0.004900,-0.043992,0.000986,-0.224117,-0.022061,0.763211,-0.263544
3000,393,123,2000,471,26000,26903709,312629,37206,0,475,...,-0.021192,-0.016594,-0.019386,-0.006127,-0.043992,0.000986,1.113117,-0.480086,-0.647047,-0.158164
3001,216,118,473,963,18000,73343413,217480,22517,0,429,...,-0.017388,-0.009394,0.998885,0.131660,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
3002,109,95,0,0,227,1060591,9750,231,0,66,...,-0.015405,-0.007712,-0.017476,-0.005328,0.911420,-0.124156,0.749516,0.697826,0.148904,1.009111


In [36]:
X_test

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,...,actor_2_name_PC1,actor_2_name_PC2,actor_3_name_PC1,actor_3_name_PC2,language_PC1,language_PC2,country_PC1,country_PC2,content_rating_PC1,content_rating_PC2
0,27,118,14,400,2000,2246000,2302,3384,4,20,...,-0.015405,-0.007712,-0.018475,0.011968,-0.043992,0.000986,-0.224117,-0.022061,0.148904,1.009111
1,339,141,0,404,749,47307550,104301,1948,4,269,...,-0.016344,-0.008476,-0.020046,-0.006035,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
2,78,95,89,388,963,37606,31836,2658,0,90,...,-0.021758,-0.024972,-0.018002,-0.005545,-0.043992,0.000986,0.704948,0.473141,-0.647047,-0.158164
3,226,117,0,818,15000,104054514,200359,16828,0,1009,...,-0.054480,0.602579,-0.016433,-0.004900,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
4,97,104,38,690,801,3447339,29517,2667,7,79,...,-0.015405,-0.007712,-0.031841,-0.012266,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,179,93,0,766,13000,17096053,134458,15716,2,640,...,-0.016344,-0.008476,-0.005318,-0.013488,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
748,393,105,335,911,3000,37516013,128629,8281,0,348,...,-0.018085,-0.010169,-0.017500,-0.005353,-0.043992,0.000986,-0.224117,-0.022061,-0.647047,-0.158164
749,55,117,133,249,687,20966644,29610,1665,0,94,...,-0.017016,-0.008721,-0.017698,-0.005206,-0.043992,0.000986,1.113117,-0.480086,0.093643,0.222252
750,85,72,0,384,3000,47887943,11634,4480,0,58,...,-0.017606,-0.018814,-0.023203,-0.007817,-0.043992,0.000986,-0.224117,-0.022061,0.097988,0.245086


In [37]:
columns = X_train.columns
columns

Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross',
       'num_voted_users', 'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'title_year', 'actor_2_facebook_likes',
       'movie_facebook_likes', 'average_degree_centrality',
       'title_embedding_PC1', 'title_embedding_PC2', 'genres_PC1',
       'genres_PC2', 'plot_keywords_PC1', 'plot_keywords_PC2',
       'director_name_PC1', 'director_name_PC2', 'actor_1_name_PC1',
       'actor_1_name_PC2', 'actor_2_name_PC1', 'actor_2_name_PC2',
       'actor_3_name_PC1', 'actor_3_name_PC2', 'language_PC1', 'language_PC2',
       'country_PC1', 'country_PC2', 'content_rating_PC1',
       'content_rating_PC2'],
      dtype='object')

In [38]:
### standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
X_train

array([[ 0.16819334, -1.69245592, -0.25111333, ..., -0.10215625,
         0.15918878,  0.5524476 ],
       [ 0.71259343, -0.59691369, -0.26047711, ...,  2.08719009,
        -1.02608494, -0.36488128],
       [ 0.5476237 ,  0.31603816, -0.18222263, ..., -0.10215625,
        -1.02608494, -0.36488128],
       ...,
       [ 0.41564792,  0.36168575, -0.10229605, ..., -0.10215625,
        -1.02608494, -0.36488128],
       [-0.46694011, -0.68820888, -0.26047711, ...,  3.08054829,
         0.24018953,  2.29048112],
       [-1.02783718,  0.22474297, -0.12704319, ...,  1.48282979,
         1.21748826, -0.60460542]])

In [40]:
X_test

array([[-1.14331598,  0.36168575, -0.25579522, ..., -0.10215625,
         0.24018953,  2.29048112],
       [ 1.43021174,  1.41158038, -0.26047711, ..., -0.10215625,
        -1.02608494, -0.36488128],
       [-0.72264318, -0.68820888, -0.23071365, ...,  2.08719009,
        -1.02608494, -0.36488128],
       ...,
       [-0.91235837,  0.31603816, -0.21599914, ..., -2.12713304,
         0.15227622,  0.50050279],
       [-0.66490378, -1.73810351, -0.26047711, ..., -0.10215625,
         0.15918878,  0.5524476 ],
       [-1.07732809, -1.05338962, -0.25913943, ..., -0.10215625,
        -1.02608494, -0.36488128]])

In [41]:
gnb = GaussianNB()

cross_val_predict_gnb = cross_val_predict(gnb, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_gnb) == y_train.values)/len(y_train.values)) 

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_gnb}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_GNB_text.csv', index=False)

cross-val acc: 0.2300266311584554


In [42]:
KNN = KNeighborsClassifier()

cross_val_predict_KNN = cross_val_predict(KNN, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_KNN) == y_train.values)/len(y_train.values)) 

KNN.fit(X_train, y_train)
y_pred_KNN = KNN.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_KNN}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_KNN5_text.csv', index=False)

cross-val acc: 0.6155126498002663


In [43]:
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=888,n_jobs=-1)

cross_val_predict_rf = cross_val_predict(rf_regressor, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_rf) == y_train.values)/len(y_train.values)) 
rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)
y_pred_rf = np.round(y_pred_rf) ## avoid some predictions not being integer 
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_rf}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_rf_text.csv', index=False)

cross-val acc: 0.7230359520639148


In [44]:
df_importance = pd.DataFrame({'Features': columns,
        'importance': rf_regressor.feature_importances_})
df_importance.sort_values(by='importance', ascending=False, inplace=True)
df_importance

Unnamed: 0,Features,importance
6,num_voted_users,0.330611
5,gross,0.060634
1,duration,0.053451
10,title_year,0.045186
9,num_user_for_reviews,0.03901
28,language_PC1,0.033647
17,genres_PC2,0.032953
16,genres_PC1,0.029415
3,actor_3_facebook_likes,0.026691
15,title_embedding_PC2,0.021777


In [45]:
kf = KFold(n_splits=10, shuffle=True, random_state=666)

accuracy = []
# Iterate over K folds
for train_index, test_index in kf.split(X_train):
    # Split data into training and test sets
    #all_fold_indices.append(test_index) 
    X_train_temp, X_test_temp = X_train[train_index], X_train[test_index]
    y_train_temp, y_test_temp = y_train.values[train_index], y_train.values[test_index]
    y_train_categorical_temp = to_categorical(y_train_temp)
    
    ## set the model
    
    np.random.seed(888)
    tf.random.set_seed(888)
    random.seed(888)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],))) # input layer
    model.add(Dense(512, activation='relu')) # hiden layer with 512 neurons
    model.add(Dropout(0.2)) # drop out some neurons avoiding overfitting
    model.add(Dense(1, activation='relu')) # pseudo regression-like output layer
    model.add(Dense(5, activation='softmax'))  # output layer with 5 neurons for 5 classes using softmax activation
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    
    # Train the model
    model.fit(X_train_temp, y_train_categorical_temp, epochs=50, verbose=0)

    # Generate predictions on the "test set"
    predictions = np.argmax(model.predict(X_test_temp, verbose=0),axis=1)
    
    # calculate the accuracy
    accuracy = accuracy + [sum(predictions == y_test_temp)/len(y_test_temp)]
    
print('cross-val acc:',np.mean(accuracy)) 

cross-val acc: 0.6827596899224806


In [46]:
y_train_categorical = to_categorical(y_train.values)
np.random.seed(888)
tf.random.set_seed(888)
random.seed(888)
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='relu'))
model.add(Dense(5, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_categorical, epochs=50)
#model_nn = model
y_pred_nn = model.predict(X_test)
y_pred_nn = np.argmax(y_pred_nn, axis=1)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_nn}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_NN_text.csv', index=False)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6293 - loss: 1.3582
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - accuracy: 0.6791 - loss: 1.1138
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 855us/step - accuracy: 0.6869 - loss: 1.0339
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 845us/step - accuracy: 0.6874 - loss: 0.9807
Epoch 5/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - accuracy: 0.6867 - loss: 0.9359
Epoch 6/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step - accuracy: 0.6999 - loss: 0.8985
Epoch 7/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838us/step - accuracy: 0.7040 - loss: 0.8790
Epoch 8/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - accuracy: 0.7056 - loss: 0.8532
Epoch 9/50
[1m94/94[0m [32m━━━━━━━━━━━

## 6. Round 4, feature selection based on MI

In [47]:
from sklearn.feature_selection import mutual_info_regression
np.random.seed(888)
random.seed(888)

mi_scores = mutual_info_regression(X_train, y_train)  # For regression tasks

# Create a DataFrame to store feature names and their MI scores
mi_scores_df = pd.DataFrame({'Feature': columns, 'MI Score': mi_scores})
mi_scores_df.sort_values(by='MI Score', ascending=False, inplace=True)

In [48]:
mi_scores_df

Unnamed: 0,Feature,MI Score
6,num_voted_users,0.174967
12,movie_facebook_likes,0.093768
9,num_user_for_reviews,0.088804
1,duration,0.081324
0,num_critic_for_reviews,0.067453
21,director_name_PC2,0.0664
2,director_facebook_likes,0.064057
20,director_name_PC1,0.05002
7,cast_total_facebook_likes,0.045413
16,genres_PC1,0.043558


In [49]:
X_train_copy2 = X_train.copy() ## backup of unselected scaled training set including text features
X_test_copy2 = X_test.copy() ## backup of unselected scaled test set including text features
X_train = X_train[:,mi_scores_df.index[0:20]]
X_test = X_test[:,mi_scores_df.index[0:20]]
print(mi_scores_df['Feature'][0:20])

6               num_voted_users
12         movie_facebook_likes
9          num_user_for_reviews
1                      duration
0        num_critic_for_reviews
21            director_name_PC2
2       director_facebook_likes
20            director_name_PC1
7     cast_total_facebook_likes
16                   genres_PC1
4        actor_1_facebook_likes
22             actor_1_name_PC1
11       actor_2_facebook_likes
33           content_rating_PC2
23             actor_1_name_PC2
32           content_rating_PC1
17                   genres_PC2
24             actor_2_name_PC1
13    average_degree_centrality
31                  country_PC2
Name: Feature, dtype: object


In [50]:
print(X_train.shape)
X_test.shape

(3004, 20)


(752, 20)

In [51]:
gnb = GaussianNB()

## perform cross validation
cross_val_predict_gnb = cross_val_predict(gnb, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_gnb) == y_train.values)/len(y_train.values))

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_gnb}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_GNB_text_select_1.csv', index=False)

cross-val acc: 0.2796271637816245


In [52]:
KNN = KNeighborsClassifier()

cross_val_predict_KNN = cross_val_predict(KNN, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_KNN) == y_train.values)/len(y_train.values))

KNN.fit(X_train, y_train)
y_pred_KNN = KNN.predict(X_test)

data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_KNN}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_KNN5_text_select_1.csv', index=False)

cross-val acc: 0.6324900133155792


In [54]:
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=888,n_jobs=-1)

cross_val_predict_rf = cross_val_predict(rf_regressor, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_rf) == y_train.values)/len(y_train.values))
rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)
y_pred_rf = np.round(y_pred_rf) ## avoid some predictions not being integer 

data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_rf}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_rf_text_select_1.csv', index=False)

cross-val acc: 0.7010652463382158


In [54]:
kf = KFold(n_splits=10, shuffle=True, random_state=666)

accuracy = []
# Iterate over K folds
for train_index, test_index in kf.split(X_train):
    # Split data into training and test sets
    #all_fold_indices.append(test_index) 
    X_train_temp, X_test_temp = X_train[train_index], X_train[test_index]
    y_train_temp, y_test_temp = y_train.values[train_index], y_train.values[test_index]
    y_train_categorical_temp = to_categorical(y_train_temp)
    
    ## set the model
    
    np.random.seed(888)
    tf.random.set_seed(888)
    random.seed(888)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],))) # input layer
    model.add(Dense(512, activation='relu')) # hiden layer with 512 neurons
    model.add(Dropout(0.2)) # drop out some neurons avoiding overfitting
    model.add(Dense(1, activation='relu')) # pseudo regression-like output layer
    model.add(Dense(5, activation='softmax'))  # output layer with 5 neurons for 5 classes using softmax activation
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    
    # Train the model
    model.fit(X_train_temp, y_train_categorical_temp, epochs=50, verbose=0)

    # Generate predictions on the "test set"
    predictions = np.argmax(model.predict(X_test_temp, verbose=0),axis=1)
    
    # calculate the accuracy
    accuracy = accuracy + [sum(predictions == y_test_temp)/len(y_test_temp)]
    
print('cross-val acc:',np.mean(accuracy))

cross-val acc: 0.66678073089701


In [55]:
y_train_categorical = to_categorical(y_train.values)
np.random.seed(888)
tf.random.set_seed(888)
random.seed(888)
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='relu'))
model.add(Dense(5, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_categorical, epochs=50)
#model_nn = model
y_pred_nn = model.predict(X_test)
y_pred_nn = np.argmax(y_pred_nn, axis=1)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_nn}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_NN_text_select_1.csv', index=False)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6243 - loss: 1.3739
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 857us/step - accuracy: 0.6563 - loss: 1.1551
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step - accuracy: 0.6573 - loss: 1.0744
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 831us/step - accuracy: 0.6618 - loss: 1.0159
Epoch 5/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 832us/step - accuracy: 0.6614 - loss: 0.9752
Epoch 6/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 853us/step - accuracy: 0.6608 - loss: 0.9419
Epoch 7/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 829us/step - accuracy: 0.6660 - loss: 0.9175
Epoch 8/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 860us/step - accuracy: 0.6712 - loss: 0.8972
Epoch 9/50
[1m94/94[0m [32m━━━━━━━━━━━

## 7. Round 5, feature selection based on RF

In [55]:
### renew the data with new features from selection method of RF
X_train = X_train_copy2.copy()[:,df_importance.index[0:20]]
X_test = X_test_copy2.copy()[:,df_importance.index[0:20]]
df_importance['Features'][0:20]

6               num_voted_users
5                         gross
1                      duration
10                   title_year
9          num_user_for_reviews
28                 language_PC1
17                   genres_PC2
16                   genres_PC1
3        actor_3_facebook_likes
15          title_embedding_PC2
0        num_critic_for_reviews
2       director_facebook_likes
7     cast_total_facebook_likes
18            plot_keywords_PC1
12         movie_facebook_likes
11       actor_2_facebook_likes
14          title_embedding_PC1
19            plot_keywords_PC2
24             actor_2_name_PC1
27             actor_3_name_PC2
Name: Features, dtype: object

In [56]:
print(X_train.shape)
X_test.shape

(3004, 20)


(752, 20)

In [57]:
gnb = GaussianNB()

## perform cross validation
cross_val_predict_gnb = cross_val_predict(gnb, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_gnb) == y_train.values)/len(y_train.values)) 

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_gnb}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_GNB_text_select.csv', index=False)

cross-val acc: 0.2496671105193076


In [58]:
KNN = KNeighborsClassifier()

cross_val_predict_KNN = cross_val_predict(KNN, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_KNN) == y_train.values)/len(y_train.values)) 

KNN.fit(X_train, y_train)
y_pred_KNN = KNN.predict(X_test)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_KNN}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_KNN5_text_select.csv', index=False)

cross-val acc: 0.631491344873502


In [59]:
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=888,n_jobs=-1)

cross_val_predict_rf = cross_val_predict(rf_regressor, X_train, y_train, cv=10)
print('cross-val acc:', sum(np.round(cross_val_predict_rf) == y_train.values)/len(y_train.values)) 
rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)
y_pred_rf = np.round(y_pred_rf) ## avoid some predictions not being integer 
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_rf}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_rf_text_select.csv', index=False)

cross-val acc: 0.7250332889480693


In [60]:
df_importance1 = pd.DataFrame({'Features': columns[df_importance.index[0:20]],
        'importance': rf_regressor.feature_importances_})
df_importance1.sort_values(by='importance', ascending=False, inplace=True)
df_importance1

Unnamed: 0,Features,importance
0,num_voted_users,0.339882
1,gross,0.073225
2,duration,0.059659
3,title_year,0.053283
4,num_user_for_reviews,0.047008
6,genres_PC2,0.040915
7,genres_PC1,0.037109
5,language_PC1,0.036236
8,actor_3_facebook_likes,0.032698
12,cast_total_facebook_likes,0.030414


In [61]:
kf = KFold(n_splits=10, shuffle=True, random_state=666)

accuracy = []
# Iterate over K folds
for train_index, test_index in kf.split(X_train):
    # Split data into training and test sets
    #all_fold_indices.append(test_index) 
    X_train_temp, X_test_temp = X_train[train_index], X_train[test_index]
    y_train_temp, y_test_temp = y_train.values[train_index], y_train.values[test_index]
    y_train_categorical_temp = to_categorical(y_train_temp)
    
    ## set the model
    
    np.random.seed(888)
    tf.random.set_seed(888)
    random.seed(888)
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],))) # input layer
    model.add(Dense(512, activation='relu')) # hiden layer with 512 neurons
    model.add(Dropout(0.2)) # drop out some neurons avoiding overfitting
    model.add(Dense(1, activation='relu')) # pseudo regression-like output layer
    model.add(Dense(5, activation='softmax'))  # output layer with 5 neurons for 5 classes using softmax activation
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    
    # Train the model
    model.fit(X_train_temp, y_train_categorical_temp, epochs=50, verbose=0)

    # Generate predictions on the "test set"
    predictions = np.argmax(model.predict(X_test_temp, verbose=0),axis=1)
    
    # calculate the accuracy
    accuracy = accuracy + [sum(predictions == y_test_temp)/len(y_test_temp)]
    
print('cross-val acc:',np.mean(accuracy)) 

cross-val acc: 0.6900675526024364


In [62]:
y_train_categorical = to_categorical(y_train.values)
np.random.seed(888)
tf.random.set_seed(888)
random.seed(888)
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='relu'))
model.add(Dense(5, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_categorical, epochs=50)
#model_nn = model
y_pred_nn = model.predict(X_test)
y_pred_nn = np.argmax(y_pred_nn, axis=1)
data_temp = {'id': data_test['id'],
        'imdb_score_binned': y_pred_nn}
df_temp = pd.DataFrame(data_temp)
df_temp.to_csv('output_NN_text_select.csv', index=False)

Epoch 1/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.6283 - loss: 1.3636
Epoch 2/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 883us/step - accuracy: 0.6698 - loss: 1.1309
Epoch 3/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - accuracy: 0.6782 - loss: 1.0531
Epoch 4/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 866us/step - accuracy: 0.6801 - loss: 0.9953
Epoch 5/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step - accuracy: 0.6848 - loss: 0.9564
Epoch 6/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 870us/step - accuracy: 0.6923 - loss: 0.9208
Epoch 7/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 855us/step - accuracy: 0.6976 - loss: 0.8970
Epoch 8/50
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 860us/step - accuracy: 0.6980 - loss: 0.8815
Epoch 9/50
[1m94/94[0m [32m━━━━━━━━━━━