# IMDB XGBoost Total Score
![ImdbIcon](../images/imdbheader.jpg)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# Importing so that I can use LinearRegression and OLS Models.
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics

In [7]:
final_df = pd.read_csv('../data/final_df.csv')

In [8]:
final_df.columns

Index(['movie_title', 'year', 'tagline', 'plot', 'cast', 'duration', 'Action',
       'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western', 'avg_vote',
       'total_votes', 'us_voters_votes', 'votes', 'votes_1', 'votes_2',
       'votes_3', 'votes_4', 'votes_5', 'votes_6', 'votes_7', 'votes_8',
       'votes_9', 'votes_10', 'popularity', 'director_score', 'actor_score',
       'actress_score', 'tagline_sentiment', 'plot_sentiment', 'total_score',
       'profitable', 'budget', 'revenue', 'budget_adj', 'revenue_adj'],
      dtype='object')

### XGBoost

In [9]:
from xgboost import XGBRegressor, XGBClassifier

In [10]:
xgb = XGBRegressor()

In [11]:
features = ['History', 'Music', 'Musical', 'Mystery', 'Romance', 'Sport', 'Thriller', 'War', 'Western', 
            'us_voters_votes', 'total_votes', 'votes_4', 'votes_5', 'votes_6', 'votes_7', 'votes_8','votes_9', 'votes_10', 
            'popularity', 'director_score', 'actor_score', 'actress_score', 'tagline_sentiment', 'profitable', 'budget', 
            'revenue']

In [12]:
X = final_df[features]
y = final_df['total_score']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [14]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=24, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
print("Training Score:", xgb.score(X_train, y_train))
print(" ")
print("Testing Score:", xgb.score(X_test, y_test))

Training Score: 0.9997778088958538
 
Testing Score: 0.9014472024187578


In [16]:
predictions = xgb.predict(X_train)

residuals = y_train - predictions

rss = (residuals ** 2).sum()

mse = metrics.mean_squared_error(y_train, predictions)

rmse = np.sqrt(metrics.mean_squared_error(y_train, predictions))

In [17]:
null_predictions = np.zeros_like(y_test)

null_pred = null_predictions + y_test.mean()

baseline = metrics.mean_squared_error(y_test, null_pred, squared = False)

In [18]:
print("Baseline RMSE:", baseline)
print(" ")
print("RMSE:", rmse)
print("________________________________")
print(" ")
print("Difference:", baseline - rmse)

Baseline RMSE: 0.9077799746640476
 
RMSE: 0.014936013954005982
________________________________
 
Difference: 0.8928439607100416


In [19]:
print("RMSE:", rmse)
print(" ")
print("Training Score:", xgb.score(X_train, y_train))
print(" ")
print("Testing Score:", xgb.score(X_test, y_test))

RMSE: 0.014936013954005982
 
Training Score: 0.9997778088958538
 
Testing Score: 0.9014472024187578


In [20]:
xgb = XGBRegressor()

In [22]:
model_params = {}
count = 0

In [29]:
xgb_params = {
    'base_score' : [0.13, 0.15, 0.17],
    'booster' : ['gbtree', 'gblinear', 'dart'],
    'learning_rate': [.08, .10, .12],
    'max_depth' : [2, 3, 4],
    'n_estimators': [875, 900, 925],
    'subsample' : [1],
}

gs = GridSearchCV(xgb, param_grid = xgb_params, cv = 5, n_jobs = 12)

gs.fit(X_train, y_train)

count += 1

gs.best_params_['best_score'] = gs.best_score_

gs.best_params_['training_score'] = gs.score(X_train, y_train)

gs.best_params_['testing_score'] = gs.score(X_test, y_test)

model_params[f'model_{count}'] = gs.best_params_

model_df = pd.DataFrame.from_dict(model_params, orient = 'index')

model_df

Unnamed: 0,base_score,booster,learning_rate,max_depth,n_estimators,subsample,best_score,training_score,testing_score
model_1,0.1,dart,0.15,3,300,1,0.916494,0.993688,0.891684
model_2,0.15,gbtree,0.15,3,500,1,0.919239,0.997559,0.892852
model_3,0.15,gbtree,0.15,3,700,1,0.919254,0.998937,0.892718
model_4,0.15,gbtree,0.1,3,900,1,0.919548,0.99836,0.900094
model_5,0.15,gbtree,0.1,3,1100,1,0.91976,0.999018,0.899305
model_6,0.15,gbtree,0.1,3,1150,1,0.919797,0.999159,0.899362
model_7,0.15,gbtree,0.1,3,925,1,0.919606,0.998461,0.899903
