# IMDB Random Forests Regression
![ImdbIcon](../images/imdbheader.jpg)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

C:\Users\nolan_fur2pfn\.conda\envs\dsi\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\nolan_fur2pfn\.conda\envs\dsi\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
# Read the per-movie table from the project's data folder (path is relative
# to the notebook's working directory).
totalscore_df = pd.read_csv(
    filepath_or_buffer='../data/totalscore_df.csv',
)

In [3]:
# Display the column names of the loaded table.
totalscore_df.columns

Index(['movie_title', 'year', 'actors', 'plot', 'duration', 'Action',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music',
       'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
       'Sport', 'Thriller', 'War', 'Western', 'avg_vote', 'votes',
       'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
       'votes_1', 'votes_2', 'votes_3', 'votes_4', 'votes_5', 'votes_6',
       'votes_7', 'votes_8', 'votes_9', 'votes_10', 'us_voters_rating',
       'us_voters_votes', 'plot_sentiment', 'director_score', 'actor_score',
       'actress_score', 'total_score'],
      dtype='object')

In [4]:
# Display the table's (row count, column count).
totalscore_df.shape

(12939, 52)

## Random Forests Regression

In [5]:
# Columns fed to the regressors, in the order the models will see them.
features = [
    # running time
    'duration',
    # genre columns (presumably 0/1 indicators -- confirm against the data prep)
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
    'History', 'Horror', 'Mystery',
    # a subset of the per-rating vote-count columns
    'votes_3', 'votes_4', 'votes_7', 'votes_9',
    # cast scores
    'actor_score', 'actress_score',
]

In [6]:
# Target: the `total_score` column; predictors: the columns named in `features`.
y = totalscore_df.loc[:, 'total_score']
X = totalscore_df.loc[:, features]

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [8]:
# Baseline forest with default hyper-parameters.
# The seed matches the one used for train_test_split so that the scores below,
# the cross-validation, and every GridSearchCV clone of `rf` are repeatable
# after Restart & Run All (an unseeded forest gives slightly different numbers
# on every run).
rf = RandomForestRegressor(random_state=42)

In [9]:
# Fit the baseline forest on the training split only.
rf.fit(X=X_train, y=y_train)

RandomForestRegressor()

In [10]:
# R^2 on the rows the forest was fit on, then on the held-out rows; a large
# gap between the two indicates over-fitting.
train_r2 = rf.score(X_train, y_train)
print("Training Score:", train_r2)
print(" ")
test_r2 = rf.score(X_test, y_test)
print("Testing Score:", test_r2)

Training Score: 0.9883320309142927
 
Testing Score: 0.9210592807702264


In [11]:
# 5-fold cross-validation on the training split; report the mean fold score.
fold_scores = cross_val_score(rf, X_train, y_train, cv = 5)
print("Cross Val Score:", fold_scores.mean())

Cross Val Score: 0.9153991028976863


In [12]:
# Running log of grid-search results (one entry per search run) and the run
# counter used to label each entry. Re-running this cell resets both.
model_params, count = {}, 0

In [20]:
# Grid-search the forest's hyper-parameters with 5-fold CV, then append this
# run's best setting and scores to the running results table.
#
# This cell is meant to be re-run with an edited grid: `count` and
# `model_params` persist across runs, so every execution adds one more
# `model_<n>` row to `model_df`.
rf_params = {
    'max_depth': [14, 16, 18],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 4, 5],
    'n_estimators': [300, 400, 500],
}

# 3 * 3 * 3 * 3 = 81 candidates x 5 folds, plus the final refit on all of
# X_train (GridSearchCV refits the best candidate by default).
gs = GridSearchCV(rf, param_grid=rf_params, cv = 5, n_jobs = 12)

gs.fit(X_train, y_train)

count += 1

# Build the summary row from a *copy* of best_params_. Writing the score keys
# into gs.best_params_ itself would corrupt the fitted search object: the
# attribute would no longer be a valid set of estimator parameters, so e.g.
# RandomForestRegressor(**gs.best_params_) would raise TypeError.
run_summary = {
    **gs.best_params_,
    'best_score': gs.best_score_,                   # mean CV score of the best candidate
    'training_score': gs.score(X_train, y_train),   # refit best estimator on the training split
    'testing_score': gs.score(X_test, y_test),      # refit best estimator on the held-out split
}

model_params[f'model_{count}'] = run_summary

model_df = pd.DataFrame.from_dict(model_params, orient='index')

model_df

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,n_estimators,best_score,training_score,testing_score
model_1,2,2,2,100,0.641929,0.645092,0.619828
model_2,4,3,3,300,0.737987,0.748128,0.727219
model_3,6,3,3,300,0.817438,0.838434,0.813714
model_4,8,3,3,200,0.876282,0.908809,0.876309
model_5,10,2,4,200,0.904823,0.952522,0.907917
model_6,14,2,5,300,0.916968,0.976488,0.921554
model_7,16,2,4,400,0.917995,0.980968,0.921211
model_8,18,2,3,300,0.917616,0.981644,0.921872


## AdaBoostRegressor

In [20]:
# Running log of AdaBoost grid-search results and the run counter used to
# label each entry. Re-running this cell resets both.
model_params_two, count_two = {}, 0

In [27]:
# Grid-search an AdaBoost ensemble whose weak learner is a random forest, then
# append this run's best setting and scores to the AdaBoost results table.
#
# This cell is meant to be re-run with an edited grid: `count_two` and
# `model_params_two` persist across runs, so every execution adds one more
# `model_<n>` row to `model_df_two`.
#
# random_state is set on the booster so the search is repeatable; AdaBoost
# uses it both for its own resampling and to seed each base estimator it
# creates, so the inner forest does not need its own seed.
#
# NOTE: scikit-learn 1.2 renamed `base_estimator` to `estimator` (old name
# removed in 1.4). The old name is kept here because this notebook's imports
# (plot_confusion_matrix) only exist on scikit-learn < 1.2. When upgrading,
# rename both the keyword and the 'base_estimator__max_depth' grid key.
ada = AdaBoostRegressor(base_estimator=RandomForestRegressor(), random_state=42)

ada_params = {
    'base_estimator__max_depth': [11, 12, 13],
    'learning_rate': [.95, 1.],
    'n_estimators': [70, 75, 80],
}

# 3 * 2 * 3 = 18 candidates x 3 folds, plus the final refit on all of X_train.
gs_two = GridSearchCV(ada, param_grid=ada_params, cv=3, n_jobs = 12)

gs_two.fit(X_train, y_train)

count_two += 1

# Build the summary row from a *copy* of best_params_. Writing the score keys
# into gs_two.best_params_ itself would corrupt the fitted search object: the
# attribute would no longer be a valid set of estimator parameters, so
# ada.set_params(**gs_two.best_params_) would raise.
run_summary_two = {
    **gs_two.best_params_,
    'best_score': gs_two.best_score_,                   # mean CV score of the best candidate
    'training_score': gs_two.score(X_train, y_train),   # refit best estimator on the training split
    'testing_score': gs_two.score(X_test, y_test),      # refit best estimator on the held-out split
}

model_params_two[f'model_{count_two}'] = run_summary_two

model_df_two = pd.DataFrame.from_dict(model_params_two, orient='index')

model_df_two

Unnamed: 0,base_estimator__max_depth,learning_rate,n_estimators,best_score,training_score,testing_score
model_1,3,0.95,20,0.815696,0.861969,0.77293
model_2,5,0.95,30,0.87996,0.953392,0.850893
model_3,7,0.95,40,0.89637,0.983651,0.878017
model_4,9,1.0,45,0.899599,0.992798,0.884188
model_5,11,0.9,55,0.901346,0.995733,0.885692
model_6,12,1.0,70,0.901715,0.996314,0.887713
model_7,13,0.95,70,0.901044,0.996506,0.886476
