# IMDB Random Forests Regression
![ImdbIcon](../images/imdbheader.jpg)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

In [3]:
final_df = pd.read_csv('../data/final_df.csv')

In [28]:
final_df.columns

Index(['movie_title', 'year', 'tagline', 'plot', 'cast', 'duration', 'Action',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western', 'avg_vote',
       'total_votes', 'us_voters_votes', 'votes', 'votes_1', 'votes_2',
       'votes_3', 'votes_4', 'votes_5', 'votes_6', 'votes_7', 'votes_8',
       'votes_9', 'votes_10', 'popularity', 'director_score', 'actor_score',
       'actress_score', 'tagline_sentiment', 'plot_sentiment', 'total_score',
       'budget', 'revenue', 'budget_adj', 'revenue_adj'],
      dtype='object')

## Random Forests Regression

In [29]:
features = ['duration', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 
            'Crime', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 
            'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western', 'avg_vote',
            'total_votes', 'us_voters_votes', 'votes', 'votes_1', 'votes_2', 'votes_3', 'votes_4', 
            'votes_5', 'votes_6', 'votes_7', 'votes_8', 'votes_9', 'votes_10', 'popularity', 
            'director_score', 'actor_score', 'actress_score', 'tagline_sentiment', 'plot_sentiment', 
            'total_score', 'budget']

In [30]:
X = final_df[features]
y = final_df['revenue']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

In [36]:
rf = RandomForestRegressor()

In [37]:
print("Cross Val Score:", cross_val_score(rf, X_train, y_train, cv = 5).mean())

Cross Val Score: 0.5734664748455603


In [9]:
model_params = {}
count = 0

In [17]:
rf_params = {
    'max_depth': [10, 11, 12, 13],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [8, 9, 10, 11, 12],
    'n_estimators': [100, 200, 300, 400],
}

gs = GridSearchCV(rf, param_grid=rf_params, cv = 5, n_jobs = 12)

gs.fit(X_train, y_train)

count += 1

gs.best_params_['best_score'] = gs.best_score_

gs.best_params_['training_score'] = gs.score(X_train, y_train)

gs.best_params_['testing_score'] = gs.score(X_test, y_test)

model_params[f'model_{count}'] = gs.best_params_

model_df = pd.DataFrame.from_dict(model_params, orient='index')

model_df

Unnamed: 0,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,best_score,training_score,testing_score
model_1,2,auto,2,2,100,0.467486,0.611122,0.594263
model_2,3,auto,3,3,100,0.54056,0.705825,0.653183
model_3,5,auto,4,5,150,0.577564,0.760334,0.665808
model_4,7,auto,4,6,150,0.580588,0.793257,0.673786
model_5,9,auto,3,8,175,0.586189,0.822773,0.674695
model_6,9,auto,3,10,150,0.584735,0.810816,0.676197
model_7,10,auto,4,11,125,0.586391,0.796233,0.675232
model_8,10,auto,4,9,100,0.586863,0.804966,0.675185


## AdaBoostRegressor

In [19]:
model_params_two = {}
count_two = 0

In [25]:
ada = AdaBoostRegressor(base_estimator=RandomForestRegressor())

ada_params = {
    'base_estimator__max_depth': [6, 7, 8],
    'learning_rate': [.85, .90, .95, 1.],
    'n_estimators': [13, 14, 15, 16, 17],
}

gs_two = GridSearchCV(ada, param_grid=ada_params, cv=3, n_jobs = 12)

gs_two.fit(X_train, y_train)

count_two += 1

gs_two.best_params_['best_score'] = gs_two.best_score_

gs_two.best_params_['training_score'] = gs_two.score(X_train, y_train)

gs_two.best_params_['testing_score'] = gs_two.score(X_test, y_test)

model_params_two[f'model_{count_two}'] = gs_two.best_params_

model_df_two = pd.DataFrame.from_dict(model_params_two, orient='index')

model_df_two

Unnamed: 0,base_estimator__max_depth,learning_rate,n_estimators,best_score,training_score,testing_score
model_1,3,0.9,20,0.456183,0.734131,0.562423
model_2,5,1.0,20,0.491898,0.860568,0.643951
model_3,6,1.0,15,0.519746,0.893979,0.656251
model_4,7,0.95,15,0.545434,0.931081,0.686592
model_5,8,0.85,10,0.551103,0.945679,0.660796
model_6,8,1.0,16,0.552201,0.948873,0.66188
