In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import patsy
import scipy.stats as stats
import statsmodels.api as sm
from sklearn import linear_model, model_selection, metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression ,Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
# Accessing cleaned df
df = pd.read_pickle('./data/cleaned_df.pickle')
df.sample(5)

Unnamed: 0,TITLE,YEAR,EBERT_RATING,MPAA,RUNTIME,GENRE,SUB_GENRE,MOVIELENS_RATING,IMDB_RATING,BUDGET,OPENING_GROSS,DOMESTIC_GROSS,WORLDWIDE_GROSS
739,Two Brothers,2004,2.5,PG,109,Foreign,Fantasy,3.354167,7.1,59660000,6144160,19176754,62174008
2018,Club Paradise,1986,2.0,PG-13,95,Romance,Drama,2.486111,5.2,15000000,4152296,12308521,12308521
562,16 Blocks,2006,3.0,PG-13,105,Thriller,Crime,3.307337,6.6,52000000,11855260,36895141,65664721
1975,Adventures in Babysitting,1987,2.5,PG-13,102,Family,Comedy,3.227057,7.0,7000000,2901297,34368475,34368475
1978,Spaceballs,1987,2.5,PG,96,Science Fiction,Comedy,3.401829,7.1,22700000,6613837,38119483,38119483


In [3]:
genre_dummies = pd.get_dummies(df['GENRE'], prefix='GENRE', drop_first=False)
mpaa_dummies = pd.get_dummies(df['MPAA'], prefix='MPAA', drop_first=False)
subgenre_dummies = pd.get_dummies(df['SUB_GENRE'], prefix='SUB_GENRE', drop_first=False)

df = df.join(genre_dummies).join(mpaa_dummies).join(subgenre_dummies)
df.drop(['TITLE', 'GENRE', 'SUB_GENRE', 'MPAA'], inplace=True, axis=1)
df.head()

Unnamed: 0,YEAR,EBERT_RATING,RUNTIME,MOVIELENS_RATING,IMDB_RATING,BUDGET,OPENING_GROSS,DOMESTIC_GROSS,WORLDWIDE_GROSS,GENRE_Action,...,SUB_GENRE_Mystery,SUB_GENRE_N,SUB_GENRE_Neo-noir,SUB_GENRE_Road Movie,SUB_GENRE_Romance,SUB_GENRE_Science Fiction,SUB_GENRE_Sport,SUB_GENRE_Sports Film,SUB_GENRE_Suspense,SUB_GENRE_Thriller
0,2013,3.5,106,3.64273,7.1,30000000,9303145,32172757,63414135,0,...,0,0,0,0,0,0,0,0,0,0
1,2012,2.5,98,3.451807,6.8,11000000,47122,18390117,59520298,0,...,0,0,0,0,0,0,0,0,0,0
2,2012,2.0,106,2.939024,5.7,35000000,5750288,13103272,30962335,0,...,0,0,0,0,0,0,0,0,0,0
3,1982,4.0,105,3.825658,7.8,2600000,11623,8144,8144,0,...,0,0,0,0,0,0,0,0,0,0
4,2012,2.0,104,3.225352,6.2,15000000,6812900,15026056,37930465,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X, y = df.drop('EBERT_RATING', axis=1), df.EBERT_RATING
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
X, y = np.array(X), np.array(y)

cv_rf_r2s, cv_rf_r2s_tr = [], []
cv_lasso_r2s, cv_lasso_r2s_tr = [], []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
   # Lasso model
    lasso = Lasso(alpha=0.006)
    lasso.fit(X_train_scaled, y_train)
    cv_lasso_r2s.append(lasso.score(X_val_scaled, y_val))
    cv_lasso_r2s_tr.append(lasso.score(X_train_scaled, y_train))
    
    lasso_train_preds = lasso.predict(X_train_scaled)
    lasso_train_rmse = mean_squared_error(y_train, lasso_train_preds, squared=False)
    lasso_train_mae = mean_absolute_error(y_train, lasso_train_preds)
    
    lasso_val_preds = lasso.predict(X_val_scaled)
    lasso_val_rmse = mean_squared_error(y_val, lasso_val_preds, squared=False)
    lasso_val_mae = mean_absolute_error(y_val, lasso_val_preds)
 #-------------------------------------------------------------------------------------#    
    
    # Create a Random Forest regressor object from Random Forest REgressor class
    rf = RandomForestRegressor(n_estimators=3000, max_features = 10, max_depth = 5, random_state=0)
    
    # Fit the random forest regressor with tarining data represented by X_train and y_train
    rf.fit(X_train, y_train)
    cv_rf_r2s.append(rf.score(X_val, y_val))
    cv_rf_r2s_tr.append(rf.score(X_train, y_train))

    rf_train_pred = rf.predict(X_train)
    rf_train_rmse = mean_squared_error(y_train, rf_train_pred, squared=False)
    rf_train_mae = mean_absolute_error(y_train, rf_train_pred)
    
    rf_val_pred = rf.predict(X_val)
    rf_val_rmse = mean_squared_error(y_val, rf_val_pred, squared=False)
    rf_val_mae = mean_absolute_error(y_val, rf_val_pred)

print('Lasso validation scores: ', cv_lasso_r2s)
print(f'Lasso mean cv r^2: {np.mean(cv_lasso_r2s):.3f} +- {np.std(cv_lasso_r2s):.3f}')
print("Lasso RMSE: ", lasso_val_rmse)
print("Lasso MAE: ", lasso_val_mae, "\n")
    
print('Lasso training scores: ', cv_lasso_r2s_tr)
print(f'Lasso mean cv r^2: {np.mean(cv_lasso_r2s_tr):.3f} +- {np.std(cv_lasso_r2s_tr):.3f}')
print("Lasso RMSE: ", lasso_train_rmse)
print("Lasso MAE: ", lasso_train_mae, "\n")    
    
print('Random Forest Regressor')
print("Validation set scores - ")
print("R2: ", cv_rf_r2s)
print(f"mean R2:  {np.mean(cv_rf_r2s):.3f} +- {np.std(cv_rf_r2s):.3f}")
print("RMSE: ", rf_val_rmse)
print("MAE: ", rf_val_mae)

print("Training set scores - ")
print("R2: ", cv_rf_r2s_tr)
print(f"mean R2:  {np.mean(cv_rf_r2s_tr):.3f} +- {np.std(cv_rf_r2s_tr):.3f}")
print("RMSE: ", rf_train_rmse)
print("MAE: ", rf_train_mae)

Lasso validation scores:  [0.3856128676663836, 0.3378177988794032, 0.31655080966985694, 0.3705325110998453, 0.3660368038241626]
Lasso mean cv r^2: 0.355 +- 0.025
Lasso RMSE:  0.7212055407460362
Lasso MAE:  0.5682185787745628 

Lasso training scores:  [0.41724399939010914, 0.4239831073046503, 0.4299196164242318, 0.41786188840723415, 0.4246078363544331]
Lasso mean cv r^2: 0.423 +- 0.005
Lasso RMSE:  0.6690423195800024
Lasso MAE:  0.5362326009676057 

Random Forest Regressor
Validation set scores - 
R2:  [0.3518106956168966, 0.3707955970362963, 0.32662280343490235, 0.35647971054919037, 0.36187704180473257]
mean R2:  0.354 +- 0.015
RMSE:  0.7235677743134193
MAE:  0.5751880887587383
Training set scores - 
R2:  [0.4404658065894864, 0.4413485209835012, 0.4527874143134166, 0.4330103279946892, 0.43232546832985064]
mean R2:  0.440 +- 0.007
RMSE:  0.664540300054735
MAE:  0.5403181972498129


In [31]:
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
    
# Predict rating from test dataset with Lasso Regression
lasso.fit(X_scaled, y)
y_predict_lasso = lasso.predict((X_test_scaled))

r_square_lasso = metrics.r2_score(y_test, y_predict_lasso)
print('R-Square Error associated with Lasso Regression is: ', r_square_lasso)

# Predict rating from test dataset with Random Forest Regression
rf.fit(X, y)
y_predict_rf = rf.predict((X_test))

r_square = metrics.r2_score(y_test, y_predict_rf)
print('R-Square Error associated with Random Forest Regression is: ', r_square)

R-Square Error associated with Lasso Regression is:  0.3270133434501441
R-Square Error associated with Random Forest Regression is:  0.3394055278861985


In [32]:
lasso_mae = mean_absolute_error(y_test, y_predict_lasso)
ridge_mae = mean_absolute_error(y_test, y_predict_rf)
print(lasso_mae, ridge_mae)

0.5838267972621387 0.5857998382116659
