## IMDB Random Forest Regression

In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from IPython.display import display
from scipy import stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import ProbPlot
sns.set_style("whitegrid")
sns.set_palette("Set2")
%pylab inline


from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

  from pandas.core import datetools


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the cleaned IMDB dataset: a tab-separated file whose first column
# is used as the row index.
imdb_df_clean = pd.read_csv(
    "imdb_df_clean.csv",
    sep='\t',
    index_col=0,
)

In [3]:
# Remove the 'Unnamed: 0.1' column; it is not among the modelling columns
# listed further down.
imdb_df_clean = imdb_df_clean.drop(labels=['Unnamed: 0.1'], axis=1)

In [4]:
# Preview the first two rows to sanity-check the loaded columns.
imdb_df_clean.head(n=2)

Unnamed: 0,genre,runtime,budget,mpaa_rating,user_rating,total_votes,critic_score,us_boxoffice_gross
0,"Action, Adventure, Sci-Fi",131.0,190000000,PG-13,7.0,409200,64,101800000
1,"Crime, Drama, Mystery",153.0,46000000,R,8.1,453418,74,61000000


### Dropping Outliers

In [5]:
# Trim outliers in two passes (all bounds are exclusive):
#   1. keep rows with 11,000 < us_boxoffice_gross < 400,000,000
#   2. of those, keep rows with 49,999 < budget < 250,000,000
imdb_df_clean = (
    imdb_df_clean
    .loc[lambda df: (df['us_boxoffice_gross'] > 11000)
                    & (df['us_boxoffice_gross'] < 400000000)]
    .loc[lambda df: (df['budget'] > 49999)
                    & (df['budget'] < 250000000)]
)

In [6]:
imdb_df_clean.shape

(2195, 8)

In [7]:
imdb_df_clean.columns

Index(['genre', 'runtime', 'budget', 'mpaa_rating', 'user_rating',
       'total_votes', 'critic_score', 'us_boxoffice_gross'],
      dtype='object')

### Log Transformation

In [8]:
# Replace us_boxoffice_gross with its natural log; the stated motivation is
# heteroscedasticity despite normal residuals.
# Only the target is transformed: the budget transform below is commented
# out, so budget keeps its original scale.
# NOTE: re-running this cell applies the log a second time.
imdb_df_clean = imdb_df_clean.assign(
    us_boxoffice_gross=np.log(imdb_df_clean['us_boxoffice_gross'])
)
#imdb_df_clean['budget'] = np.log(imdb_df_clean['budget'])

### One Hot Encoding

In [9]:
# One-hot encode mpaa_rating as k-1 indicator columns (drop_first=True),
# each prefixed with 'MPAA'.
mpaa_dummies_df = pd.get_dummies(
    imdb_df_clean['mpaa_rating'],
    prefix='MPAA',
    drop_first=True,
)
# Attach the indicator columns, then discard the original categorical column.
imdb_df_clean = imdb_df_clean.join(mpaa_dummies_df).drop('mpaa_rating', axis=1)
print("Added %d columns for mpaa_rating" % len(mpaa_dummies_df.columns))

Added 3 columns for mpaa_rating


In [10]:
# A film can list several genres separated by ', '; build one 0/1 indicator
# column per distinct genre.
genre_dummies_df = imdb_df_clean['genre'].str.get_dummies(sep=', ')
# Attach the indicator columns, then discard the original genre string column.
imdb_df_clean = imdb_df_clean.join(genre_dummies_df).drop('genre', axis=1)
print("Added %d columns for genre" % len(genre_dummies_df.columns))

Added 20 columns for genre


## US Box Office Gross Prediction

#### Create random training and testing sets of features and labels

In [11]:
# Every column except the target (us_boxoffice_gross) is used as a feature.
feature_cols = [
    col for col in imdb_df_clean.columns
    if col != 'us_boxoffice_gross'
]
# Feature matrix: the selected columns of the main DataFrame.
X = imdb_df_clean[feature_cols]
# Confirm that X is a DataFrame and report its (rows, columns) shape.
print(type(X))
print(X.shape)

<class 'pandas.core.frame.DataFrame'>
(2195, 28)


In [12]:
# Target vector: us_boxoffice_gross, which the log-transformation cell has
# already replaced with its natural log.
y = imdb_df_clean['us_boxoffice_gross']
# The former bare `y.head()` was removed: as a non-final expression it was
# never displayed, so its "print the first 5 values" comment was false.
# Confirm that y is a Series and report its length.
print(type(y))
print(y.shape)

<class 'pandas.core.series.Series'>
(2195,)


### Splitting X and y into training and testing sets

In [13]:
# `sklearn.cross_validation` is deprecated and absent from current
# scikit-learn releases; `train_test_split` lives in `sklearn.model_selection`,
# the module this notebook's grid-search cell already refers to.
from sklearn.model_selection import train_test_split

# No test_size is given, so the default split applies (75% train / 25% test).
# random_state=1 keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report the shapes of the four resulting pieces.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1646, 28)
(1646,)
(549, 28)
(549,)




## Random Forest

#### Feature Scaling

In [None]:
# from sklearn.preprocessing import StandardScaler

# sc = StandardScaler()  
# X_train = sc.fit_transform(X_train)  
# X_test = sc.transform(X_test)  

#### Hyperparameter Selection

In [None]:
# Beware - reduce the list values, else it will take too long to run 
# Keep tweaking the regressor name as you try new combinations of hyperparameters

# from sklearn.model_selection import GridSearchCV

# param_grid = {"n_estimators": [5, 10, 30, 50, 100, 200, 500, 800, 1000],
#     "max_depth": [3, 5, 7, 10],
#     "max_features": [1, 3, 5, 7],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 3, 10],
#     "bootstrap": [True, False]}

# model = RandomForestRegressor(random_state=1)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# grid.fit(X_train_filter1, y_train)

# print(grid.best_score_)
# print(grid.best_params_)

#### Random Forest Regression Model 1: based on hyperparameters picked from GridSearchCV

In [14]:
# Model 1: random forest trained on all features in X_train.
# NOTE(review): max_features=10 and max_depth=None are not among the values
# in the commented-out param_grid of the hyperparameter-selection cell --
# confirm which search produced these settings.
regressor1 = RandomForestRegressor(
    n_estimators=800,
    max_features=10,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=1,
)
regressor1.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=800, n_jobs=1, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [15]:
# Evaluate Model 1. The target is log(us_boxoffice_gross), so RMSE and MSE
# are expressed in log units rather than dollars.
y_pred = regressor1.predict(X_test)

# np.sqrt is called explicitly: the bare `sqrt` used before was only in scope
# because `%pylab inline` star-imports numpy into the namespace.
# Train RMSE, for comparison against the test RMSE.
print("RMSE Train: %f" % np.sqrt(mean_squared_error(y_train, regressor1.predict(X_train))))

# Held-out test metrics.
print("RMSE Test: %f" % np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('rsquared:', r2_score(y_test, y_pred))

RMSE Train: 0.482175
RMSE Test: 1.308381
Mean Squared Error:  1.71186175422
rsquared: 0.759625534216


In [16]:
# Rank ALL features by their importance in Model 1, highest first.
# (The list was never cut to five: the old `[:]` slice copied everything,
# and the ranking was needlessly sorted a second time for display.)
importances = list(regressor1.feature_importances_)
features_imp = sorted(
    zip(feature_cols, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
# Last expression of the cell: display the (feature, importance) pairs.
features_imp

[('total_votes', 0.36106961921670483),
 ('budget', 0.26369214788665285),
 ('critic_score', 0.12402080531360005),
 ('MPAA_PG-13', 0.042736795650131762),
 ('runtime', 0.040441007204809025),
 ('user_rating', 0.03981903522212573),
 ('MPAA_R', 0.022436627947707528),
 ('MPAA_Not Rated', 0.018405729950610215),
 ('Drama', 0.014727681339739869),
 ('Adventure', 0.01034186957062655),
 ('Comedy', 0.0082166948083663132),
 ('Horror', 0.006831223392516985),
 ('Crime', 0.0063832360957973263),
 ('Thriller', 0.0057021098797065328),
 ('Action', 0.0053587794784584275),
 ('Romance', 0.0046537569021941365),
 ('Mystery', 0.0040854627681456987),
 ('Animation', 0.0039239682729779448),
 ('Fantasy', 0.0036716312605064756),
 ('Family', 0.0036047018677288502),
 ('Sci-Fi', 0.0023776128949942266),
 ('Biography', 0.0022458531302513162),
 ('Sport', 0.0016482238190823476),
 ('Music', 0.0010656948889906372),
 ('War', 0.00094148132042671325),
 ('History', 0.00089367588100471755),
 ('Western', 0.00036095364257470364),
 ('Musical', 0.000344)]

#### Model 2  - based on feature importance (retained top 7 out of 28)

In [17]:
# Model 2 inputs: the seven highest-ranked features from Model 1, applied
# identically to the train and test frames.
model2_features = [
    'total_votes',
    'budget',
    'critic_score',
    'MPAA_PG-13',
    'user_rating',
    'runtime',
    'MPAA_R',
]
X_train_filter1 = X_train.filter(items=model2_features, axis=1)
X_test_filter1 = X_test.filter(items=model2_features, axis=1)

In [18]:
# Model 2: random forest on the reduced feature set X_train_filter1.
# max_features is not passed, so the library default is used.
regressor2 = RandomForestRegressor(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=1,
)
regressor2.fit(X_train_filter1, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [19]:
# predicting us_box_office_gross
y_pred = regressor2.predict(X_test_filter1)

#RMSE - train
print("RMSE Train: %f" % (sqrt(mean_squared_error(y_train, regressor2.predict(X_train_filter1)))))

#errors = abs(y_test - y_pred)

#print('Mean Absolute Error:', round(np.mean(errors), 2), 'dollars')
print("RMSE Test: %f" % (sqrt(mean_squared_error(y_test, y_pred))))

#from sklearn.metrics import mean_squared_log_error
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))

print('rsquared:',r2_score(y_test, y_pred))

RMSE Train: 0.513964
RMSE Test: 1.386805
Mean Squared Error:  1.92322682933
rsquared: 0.729946287694


#### Model 3 - based on feature importance (retained top 5 out of 7)

In [21]:
# Model 3 inputs: five of the seven Model 2 features, applied identically to
# the train and test frames.
# NOTE(review): in Model 1's importance ranking 'runtime' sits just above
# 'user_rating', yet 'user_rating' is the one kept here -- confirm intended.
model3_features = [
    'total_votes',
    'budget',
    'critic_score',
    'MPAA_PG-13',
    'user_rating',
]
X_train_filter2 = X_train.filter(items=model3_features, axis=1)
X_test_filter2 = X_test.filter(items=model3_features, axis=1)

In [22]:
# Model 3: random forest on the five-feature set X_train_filter2, with the
# largest ensemble in this notebook (5000 trees).
# max_features is not passed, so the library default is used.
regressor3 = RandomForestRegressor(
    n_estimators=5000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=1,
)
regressor3.fit(X_train_filter2, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [23]:
# predicting us_box_office_gross
y_pred = regressor3.predict(X_test_filter2)

#RMSE - train
print("RMSE Train: %f" % (sqrt(mean_squared_error(y_train, regressor3.predict(X_train_filter2)))))

#errors = abs(y_test - y_pred)

#print('Mean Absolute Error:', round(np.mean(errors), 2), 'dollars')
print("RMSE Test: %f" % (sqrt(mean_squared_error(y_test, y_pred))))

#from sklearn.metrics import mean_squared_log_error
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))

print('rsquared:',r2_score(y_test, y_pred))

RMSE Train: 0.518236
RMSE Test: 1.394336
Mean Squared Error:  1.94417297235
rsquared: 0.72700509345


#### Model 4 - based on feature importance

In [24]:
# Model 4 inputs: the sixteen highest-ranked features from Model 1, applied
# identically to the train and test frames.
model4_features = [
    'total_votes',
    'budget',
    'critic_score',
    'MPAA_PG-13',
    'user_rating',
    'runtime',
    'MPAA_R',
    'MPAA_Not Rated',
    'Drama',
    'Adventure',
    'Comedy',
    'Horror',
    'Crime',
    'Thriller',
    'Action',
    'Romance',
]
X_train_filter3 = X_train.filter(items=model4_features, axis=1)
X_test_filter3 = X_test.filter(items=model4_features, axis=1)

In [25]:
# Model 4: random forest on the sixteen-feature set X_train_filter3.
# max_features is not passed, so the library default is used.
regressor4 = RandomForestRegressor(
    n_estimators=1500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=1,
)
regressor4.fit(X_train_filter3, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [26]:
# predicting us_box_office_gross
y_pred = regressor4.predict(X_test_filter3)

#RMSE - train
print("RMSE Train: %f" % (sqrt(mean_squared_error(y_train, regressor4.predict(X_train_filter3)))))

#errors = abs(y_test - y_pred)

#print('Mean Absolute Error:', round(np.mean(errors), 2), 'dollars')
print("RMSE Test: %f" % (sqrt(mean_squared_error(y_test, y_pred))))

#from sklearn.metrics import mean_squared_log_error
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))

print('rsquared:',r2_score(y_test, y_pred))

RMSE Train: 0.488439
RMSE Test: 1.355590
Mean Squared Error:  1.83762383174
rsquared: 0.741966402499


In [27]:
# #Get the top 5 important features from RandomForest
# importances = list(rf.feature_importances_)
# features_imp = list(zip(feature_cols, importances))
# features_imp = sorted(features_imp, reverse=True, key = lambda x: x[1])[:5]
# sorted(features_imp, reverse=True, key = lambda x: x[1])[:5]

In [None]:
# # Beware - takes time to run: shows how # of trees/estimators relate to RMSE
# plot_df=pd.DataFrame()
# rmse_lst = []
# plot_df['# Estimators'] = list(range(50, 5000, 50))
# for i in range(50, 5000, 50):
#     regressor3 = RandomForestRegressor(random_state=1, n_estimators=i, max_depth=None, 
#                                   min_samples_leaf=1, min_samples_split=2, bootstrap=True)
#     regressor3.fit(X_train_filter2, y_train)
#     # predicting us_box_office_gross
#     y_pred = regressor3.predict(X_test_filter2)
#     # print("RMSE Test: %f n_estimators=%d" % (sqrt(mean_squared_error(y_test, y_pred)), i))
#     print('.')
#     rmse_lst.append(sqrt(mean_squared_error(y_test, y_pred)))
# plot_df['RMSE'] = rmse_lst
# ax = sns.lineplot(x='# Estimators', y='RMSE', data=plot_df)

In [None]:
# # Import tools needed for visualizing trees using pydot - generates a dot file
# from sklearn.tree import export_graphviz
# import pydot

# # Pull out one tree from the forest
# tree = rf.estimators_[5]

# # Export the image to a dot file
# export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_cols, rounded = True, precision = 1)

In [None]:
# # Use dot file to create a graph - exports to png
# (graph, ) = pydot.graph_from_dot_file('tree.dot')
# # Write graph to a png file
# graph.write_png('tree.png')

#### Feature Importance for best model (Model Interpretation)

In [29]:
# Table of Model 1 feature importances: one row per feature in X_train,
# ordered from most to least important.
feature_import = (
    pd.DataFrame(
        data=regressor1.feature_importances_,
        index=X_train.columns.values,
        columns=['values'],
    )
    .sort_values(by='values', ascending=False)
)
# Show it transposed (features as columns) so it renders as a single wide row.
feature_import.T

Unnamed: 0,total_votes,budget,critic_score,MPAA_PG-13,runtime,user_rating,MPAA_R,MPAA_Not Rated,Drama,Adventure,...,Fantasy,Family,Sci-Fi,Biography,Sport,Music,War,History,Western,Musical
values,0.36107,0.263692,0.124021,0.042737,0.040441,0.039819,0.022437,0.018406,0.014728,0.010342,...,0.003672,0.003605,0.002378,0.002246,0.001648,0.001066,0.000941,0.000894,0.000361,0.000344


In [None]:
# # plot feature importance
# feature_import.reset_index(level=0, inplace=True)
# sns.barplot(x='index', y='values', data=feature_import, palette='deep')
# plt.show()

# # plot feature importance
# # feature_import.reset_index(level=0, inplace=True)
# fig, ax = plt.subplots()
# # the size of A4 paper
# fig.set_size_inches(11.7, 8.27)
# sns.barplot(x='values', y='index', data=feature_import, palette='deep')
# plt.show()