# IMDB Random Forests Regression
![ImdbIcon](../images/imdbheader.jpg)

In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset CSV (path is relative to this notebook) into a DataFrame.
final_df = pd.read_csv(filepath_or_buffer='../data/final_df.csv')

In [3]:
# Show the column index so the available fields can be checked before picking features.
final_df.columns

Index(['movie_title', 'year', 'tagline', 'plot', 'cast', 'duration', 'Action',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western', 'avg_vote',
       'total_votes', 'us_voters_votes', 'votes', 'votes_1', 'votes_2',
       'votes_3', 'votes_4', 'votes_5', 'votes_6', 'votes_7', 'votes_8',
       'votes_9', 'votes_10', 'popularity', 'director_score', 'actor_score',
       'actress_score', 'tagline_sentiment', 'plot_sentiment', 'total_score',
       'profitable', 'budget', 'revenue', 'budget_adj', 'revenue_adj'],
      dtype='object')

## Extra Trees Regression

In [276]:
# Model inputs, assembled from three groups of columns (order is preserved).

# Genre indicator columns.
genre_cols = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
]

# Vote-count columns votes_4 through votes_10.
vote_cols = [f'votes_{rating}' for rating in range(4, 11)]

# Popularity, people scores and tagline sentiment.
score_cols = [
    'popularity', 'director_score', 'actor_score', 'actress_score',
    'tagline_sentiment',
]

features = genre_cols + vote_cols + score_cols

In [277]:
# Target: the total_score column. Design matrix: the selected feature columns.
y = final_df.loc[:, 'total_score']
X = final_df.loc[:, features]

In [278]:
# Hold out half of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size = 0.50, random_state = 42)
X_train, X_test, y_train, y_test = split_parts

In [279]:
# Learn the scaling statistics from the training rows only, then apply that
# same transform to both splits so no test-set information reaches the scaler.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [280]:
# Seed the estimator: Extra Trees picks split thresholds at random, so without
# a fixed random_state every score computed from it changes from run to run.
et = ExtraTreesRegressor(random_state = 42)

In [281]:
# Fit the Extra Trees regressor on the training split.
et.fit(X_train, y_train)

ExtraTreesRegressor()

In [282]:
# Report the estimator's score (R^2 for regressors) on each split.
score_inputs = {
    "Training Score:": (X_train, y_train),
    "Testing Score:": (X_test, y_test),
}
for label, (inputs, targets) in score_inputs.items():
    print(label, et.score(inputs, targets))

Training Score: 1.0
Testing Score: 0.870727334915645


In [283]:
# 5-fold cross-validation on the training split; report the mean fold score.
cv_scores = cross_val_score(et, X_train, y_train, cv = 5)
print("Cross Val Score:", cv_scores.mean())

Cross Val Score: 0.8926182303268041


In [284]:
# Bookkeeping for the grid-search runs: a run counter and a mapping that
# collects one summary entry per run.
count = 0
model_params = dict()

In [295]:
# Hyper-parameter grid for the Extra Trees regressor.
# An integer min_samples_leaf must be >= 1, so 0 is not a usable candidate:
# every fit that used it failed parameter validation and only wasted search time.
et_params = {
    'max_depth': [1, 2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 4, 5],
    'n_estimators': [1000, 1100, 1200],
}

# Exhaustive 5-fold search over the grid, run on 12 parallel workers.
gs = GridSearchCV(et, param_grid=et_params, cv = 5, n_jobs = 12)

gs.fit(X_train, y_train)

# Each execution of this cell records one more summary row.
count += 1

# Record the results in a copy: writing extra keys into gs.best_params_ itself
# would leave the fitted search holding entries that are not estimator
# parameters (e.g. they could no longer be passed to set_params).
run_summary = dict(gs.best_params_)

# Mean cross-validated score of the best parameter combination.
run_summary['best_score'] = gs.best_score_

# Scores of the refit best estimator on the training and held-out splits.
run_summary['training_score'] = gs.score(X_train, y_train)

run_summary['testing_score'] = gs.score(X_test, y_test)

model_params[f'model_{count}'] = run_summary

# One row per recorded run, indexed by the model_<n> key.
model_df = pd.DataFrame.from_dict(model_params, orient='index')

model_df

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,n_estimators,best_score,training_score,testing_score
model_1,2,2,2,200,0.628299,0.643966,0.619353
model_2,4,3,3,400,0.774612,0.804742,0.735617
model_3,6,2,3,500,0.837067,0.895206,0.801275
model_4,8,1,4,700,0.87001,0.958352,0.841572
model_5,10,1,4,900,0.884754,0.986403,0.860922
model_6,12,1,3,1100,0.891246,0.997108,0.869297
model_7,12,1,3,100,0.891624,0.996882,0.864472
model_8,3,2,4,900,0.721663,0.744577,0.692321
model_9,5,2,5,1100,0.810669,0.854884,0.773122
model_10,5,2,3,1300,0.810731,0.854807,0.772684
