# Louis George
## Making the final models

In [23]:
import numpy as np
import pandas as pd

import os
import spacy
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio
import chart_studio.plotly as py
chart_studio.tools.set_credentials_file(username='lougeo', api_key=os.environ.get("PLOTLY_API"))
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Reading in the data

In [24]:
# Feature table (its 'scripts' text column is vectorized further down) and the table of targets
X = pd.read_csv('../../data/X_plus.csv')
y = pd.read_csv('../../data/y.csv')

In [25]:
# One single-column DataFrame per prediction target, taken from the table loaded from y.csv
y_imdb, y_rt, y_profit = (
    y[[target_col]] for target_col in ('IMDb_score', 'RT_score', 'Per_Profit')
)

In [26]:
# NOTE(review): this rebinds `y` to a different file than the one the y_imdb / y_rt / y_profit
# frames were cut from, so re-running the cell above after this one would split y_wt.csv instead.
# From here on `y` is only read by the histograms in the plotting cells; presumably y_wt.csv
# holds the un-binarized scores - confirm.
y = pd.read_csv("../../data/y_wt.csv")

### Vectorizing the full dataset

In [5]:
def my_preprocessor(string):
    """Strip every digit character from ``string`` and lower-case what remains."""
    kept_chars = (ch for ch in string if not ch.isdigit())
    return ''.join(kept_chars).lower()

In [6]:
def my_tokenizer(string):
    """Tokenize ``string`` with spaCy and return the lemmas of the tokens that are kept.

    A token is kept when it is not a stop word, not punctuation, not
    whitespace, and its text contains no newline character.

    The spaCy pipeline is loaded on the first call and cached on the function
    object. The vectorizer invokes the tokenizer once per document, so loading
    ``en_core_web_sm`` inside every call repeats an expensive load each time.
    """
    nlp = getattr(my_tokenizer, '_nlp', None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        my_tokenizer._nlp = nlp

    doc = nlp(string)
    return [token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
            and '\n' not in str(token)]

**Note:** fitting the vectorizer on the full set of scripts takes a long time.

In [None]:
# Fitting the vectorizer on the raw script text, using the custom preprocessor/tokenizer
# and 1- to 3-grams, with the document-frequency bounds given by min_df / max_df
tfidf = TfidfVectorizer(min_df=0.2, 
                        max_df=0.9, 
                        preprocessor=my_preprocessor, 
                        tokenizer=my_tokenizer, 
                        ngram_range=(1,3), 
                        stop_words=None).fit(X['scripts'])

# stop_words_ is only kept for introspection and can make the pickle very large,
# so it is cleared *before* the vectorizer is written to disk
tfidf.stop_words_ = None

# Exporting the fit vectorizer
joblib.dump(tfidf, '../models/full_tfidf_ns.pkl')

In [None]:
# Clears the stop_words_ attribute on the in-memory vectorizer only.
# NOTE(review): the next cell replaces `tfidf` with the object loaded from disk,
# so this assignment does not carry over to the reloaded vectorizer.
tfidf.stop_words_ = None

In [None]:
# Reload the fitted vectorizer from disk so the slow fitting cell does not have to be re-run.
# NOTE(review): the vectorizer was built with my_preprocessor / my_tokenizer, so those
# functions presumably need to be defined in the session before loading - confirm.
tfidf = joblib.load('../models/full_tfidf_ns.pkl')

#### Merging the transformed dataset with the other features

In [None]:
# Transforming all of the scripts (will take a while)
X_transformed = tfidf.transform(X['scripts'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise
get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names

# Turning it into a dataframe (dense: one column per vocabulary term)
X_vecs = pd.DataFrame(columns=get_names(), data=X_transformed.toarray())

# Merging all of the features: the non-text columns of X side by side with the tf-idf
# columns; the index is reset so the two frames line up row by row
X_merged = pd.concat([X.drop('scripts', axis=1).reset_index(drop=True), X_vecs], axis=1)

In [None]:
# Persist the merged feature table (all columns, no index column) for the reload below
X_merged.to_csv('../../X_merged.csv', index=False)

#### Reloading this csv, and scaling it

In [27]:
# Read back the merged feature table written by the to_csv cell, so the slow
# transform / merge cells can be skipped on later runs
X_merged = pd.read_csv('../../X_merged.csv')

In [28]:
# Standardize every column of the merged feature table; the result is a NumPy array,
# so column names are taken from X_merged wherever they are needed later
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_merged)

### Modeling the full dataset

In [30]:
# Inspecting all of the optimized models for hyper parameter selection.
# Printed grouped by algorithm, and within each algorithm in IMDb / Rotten / Profit order.
for algo in ('logreg', 'xgbc', 'rf', 'ada'):
    for target in ('IMDb', 'Rotten', 'Profit'):
        search = joblib.load('../models/{}_{}_ref.pkl'.format(target, algo))
        print(search.best_params_)

{'model': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'model__C': 0.1, 'model__penalty': 'l2'}
{'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), 'model__C': 1.0, 'model__penalty': 'l1'}
{'model': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   rando

### IMDb Score Models       

**NOTE**     
Some minor touch-ups are done in the Plotly Chart Studio editor:
 - Fixing the axis on the right horizontal bar chart
 - Increasing the font size

In [32]:
# Logistic regression for the IMDb target, with the hyper parameters written out explicitly
logreg_imdb_params = {'C': 0.1, 'penalty': 'l2'}
logreg_imdb = LogisticRegression(**logreg_imdb_params).fit(X_scaled, y_imdb)

In [33]:
# XGBoost classifier for the IMDb target.
# The keyword for the number of boosting rounds is `n_estimators` (plural), as used by the
# other XGBClassifier cells; a misspelled keyword would leave the default in effect.
xgbc_imdb = XGBClassifier(max_depth=7, 
                          learning_rate=0.01, 
                          n_estimators=200).fit(X_scaled, y_imdb)

In [34]:
# Random forest for the IMDb target.
# random_state is fixed so the forest, and the feature importances plotted from it,
# come out the same on every run of the notebook.
rf_imdb = RandomForestClassifier(max_depth=5,
                                 n_estimators=100,
                                 random_state=42).fit(X_scaled, y_imdb)

In [36]:
# AdaBoost for the IMDb target, with the hyper parameters written out explicitly
ada_imdb_params = {'learning_rate': 0.1, 'n_estimators': 200}
ada_imdb = AdaBoostClassifier(**ada_imdb_params).fit(X_scaled, y_imdb)

In [87]:
# One table per IMDb model pairing each feature name with its coefficient / importance,
# plus the ten rows with the largest values (despite the `_t50` suffix) for plotting.
feature_names = X_merged.columns

lr_coefs = pd.DataFrame({'Coef': feature_names, 'Value': logreg_imdb.coef_[0]})
lr_t50 = lr_coefs.sort_values(by='Value', ascending=False).iloc[:10]

xg_coefs = pd.DataFrame({'Coef': feature_names, 'Value': xgbc_imdb.feature_importances_})
xg_t50 = xg_coefs.sort_values(by='Value', ascending=False).iloc[:10]

rf_coefs = pd.DataFrame({'Coef': feature_names, 'Value': rf_imdb.feature_importances_})
rf_t50 = rf_coefs.sort_values(by='Value', ascending=False).iloc[:10]

ada_coefs = pd.DataFrame({'Coef': feature_names, 'Value': ada_imdb.feature_importances_})
ada_t50 = ada_coefs.sort_values(by='Value', ascending=False).iloc[:10]

In [89]:
# Create the subplots: a full-width histogram in row 1, then a 2x2 grid of bar charts
fig = make_subplots(rows=3, 
                    cols=2,
                    row_heights=[0.2, 0.4, 0.4],
                    specs=[[{"type": "xy", "colspan": 2}, None],
                           [{"type": "xy"}, {"type": "xy"}], 
                           [{"type": "xy"}, {"type": "xy"}]],
                    subplot_titles=['Distribution of Ratings', 
                                    "Logistic Regression <br> Most Important Features", 
                                    "XG Boost <br> Most Important Features", 
                                    "Random Forest <br> Most Important Features", 
                                    "ADA Boost <br> Most Important Features"], 
                    horizontal_spacing=0.05, 
                    vertical_spacing=0.1)

# Horizontal bars of the top features per model. Only the first two bar traces appear in
# the legend, where their names act as the colour key (green vs. orange).
LR_CD = go.Bar(y=lr_t50['Coef'], 
               x=lr_t50['Value'], 
               name="Most Accurate Model", 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#2CA02C')
XG_CD = go.Bar(y=xg_t50['Coef'], 
               x=xg_t50['Value'], 
               name="Other Models", 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#FF7F0E')
RF_CD = go.Bar(y=rf_t50['Coef'], 
               x=rf_t50['Value'], 
               name='Feature Importance', 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#FF7F0E', 
               showlegend=False)
ADA_CD = go.Bar(y=ada_t50['Coef'], 
                x=ada_t50['Value'], 
                name='Feature Importance', 
                hovertemplate='%{y}<extra></extra>', 
                orientation='h',  
                marker_color='#FF7F0E',
                showlegend=False)
# Histogram of the IMDb scores read from y_wt.csv
imdb_CD = go.Histogram(x=y['IMDb_score'], 
                       name='Count', 
                       hovertemplate='%{y}<extra></extra>', 
                       marker_color='#1F77B4')

# Add the graph objects (add_trace with row/col replaces the deprecated append_trace)
fig.add_trace(imdb_CD, row=1, col=1)
# Vertical reference line at a rating of 0.7 (presumably the class cut-off - confirm)
fig.add_shape(go.layout.Shape(type='line', xref='x', yref='y',
                        x0=0.7, y0=0, x1=0.7, y1=100),
                        row=1, col=1)
fig.add_trace(LR_CD, row=2, col=1)
fig.add_trace(XG_CD, row=2, col=2)
fig.add_trace(RF_CD, row=3, col=1)
fig.add_trace(ADA_CD, row=3, col=2)

# Axis titles. xaxis1 is the histogram, xaxis2/xaxis3 are row 2 and xaxis4/xaxis5 are row 3;
# the right-hand charts (xaxis3, xaxis5) have their value axis reversed.
fig['layout']['xaxis1'].update(title='Rating')
fig['layout']['xaxis2'].update(title='Coefficient')
fig['layout']['xaxis3'].update(title='Gain', autorange='reversed')
fig['layout']['xaxis4'].update(title='Feature Importance')
fig['layout']['xaxis5'].update(title='Feature Importance', autorange='reversed')

fig['layout']['yaxis1'].update(title='Count')

# Update the layout and show
fig.update_layout(height=1200, 
                  width=800, 
                  title_text="IMDb Results")
fig.show()

# Loading onto cloud
#py.iplot(fig, filename="IMDb")

### RT Score Models

In [54]:
# L1-penalised logistic regression for the Rotten Tomatoes target.
# The solver is named explicitly because penalty='l1' is only accepted by some solvers:
# liblinear supports it, while the lbfgs default of scikit-learn >= 0.22 raises an error.
logreg_rt = LogisticRegression(C=1.0, 
                               penalty='l1',
                               solver='liblinear').fit(X_scaled, y_rt)

In [55]:
# XGBoost classifier for the Rotten Tomatoes target, with the hyper parameters written out explicitly
xgbc_rt_params = {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}
xgbc_rt = XGBClassifier(**xgbc_rt_params).fit(X_scaled, y_rt)

In [56]:
# Random forest for the Rotten Tomatoes target (unlimited depth, 500 trees).
# random_state is fixed so the forest, and the feature importances plotted from it,
# come out the same on every run of the notebook.
rf_rt = RandomForestClassifier(max_depth=None,
                               n_estimators=500,
                               random_state=42).fit(X_scaled, y_rt)

In [57]:
# AdaBoost for the Rotten Tomatoes target, with the hyper parameters written out explicitly
ada_rt_params = {'learning_rate': 1, 'n_estimators': 200}
ada_rt = AdaBoostClassifier(**ada_rt_params).fit(X_scaled, y_rt)

In [90]:
# Per-model ranking tables for the Rotten Tomatoes models: every feature name with its
# coefficient / importance, and the ten rows with the largest values kept for plotting.
rt_feature_names = X_merged.columns

lr_coefs = pd.DataFrame({'Coef': rt_feature_names, 'Value': logreg_rt.coef_[0]})
xg_coefs = pd.DataFrame({'Coef': rt_feature_names, 'Value': xgbc_rt.feature_importances_})
rf_coefs = pd.DataFrame({'Coef': rt_feature_names, 'Value': rf_rt.feature_importances_})
ada_coefs = pd.DataFrame({'Coef': rt_feature_names, 'Value': ada_rt.feature_importances_})

lr_t10 = lr_coefs.sort_values(by='Value', ascending=False).head(10)
xg_t10 = xg_coefs.sort_values(by='Value', ascending=False).head(10)
rf_t10 = rf_coefs.sort_values(by='Value', ascending=False).head(10)
ada_t10 = ada_coefs.sort_values(by='Value', ascending=False).head(10)

In [92]:
# Create the subplots: a full-width histogram in row 1, then a 2x2 grid of bar charts
fig = make_subplots(rows=3, 
                    cols=2,
                    row_heights=[0.2, 0.4, 0.4],
                    specs=[[{"type": "xy", "colspan": 2}, None],
                           [{"type": "xy"}, {"type": "xy"}],
                           [{"type": "xy"}, {"type": "xy"}]],
                    subplot_titles=['Distribution of Ratings', 
                                    "Logistic Regression <br> Most Important Features", 
                                    "XG Boost <br> Most Important Features", 
                                    "Random Forest <br> Most Important Features", 
                                    "ADA Boost <br> Most Important Features"], 
                    horizontal_spacing=0.05, 
                    vertical_spacing=0.1)

# Horizontal bars of the top features per model. Only the random forest and AdaBoost traces
# appear in the legend, where their names act as the colour key (green vs. orange).
LR_CD = go.Bar(y=lr_t10['Coef'], 
               x=lr_t10['Value'], 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h',
               marker_color='#FF7F0E', 
               showlegend=False)
XG_CD = go.Bar(y=xg_t10['Coef'], 
               x=xg_t10['Value'], 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#FF7F0E',
               showlegend=False)
RF_CD = go.Bar(y=rf_t10['Coef'], 
               x=rf_t10['Value'], 
               name='Most Accurate Model', 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#2CA02C')
ADA_CD = go.Bar(y=ada_t10['Coef'], 
                x=ada_t10['Value'], 
                name='Other Models', 
                hovertemplate='%{y}<extra></extra>',
                orientation='h', 
                marker_color='#FF7F0E')
# Histogram of the Rotten Tomatoes scores read from y_wt.csv
# (the variable keeps the `imdb_CD` name used by the IMDb figure)
imdb_CD = go.Histogram(x=y['RT_score'], 
                       name='Count', 
                       hovertemplate='%{y}<extra></extra>',
                       marker_color='#1F77B4')

# Add the graph objects (add_trace with row/col replaces the deprecated append_trace)
fig.add_trace(imdb_CD, row=1, col=1)
# Vertical reference line at a rating of 0.8 (presumably the class cut-off - confirm)
fig.add_shape(go.layout.Shape(type='line', xref='x', yref='y',
                        x0=0.8, y0=0, x1=0.8, y1=130),
                        row=1, col=1)
fig.add_trace(LR_CD, row=2, col=1)
fig.add_trace(XG_CD, row=2, col=2)
fig.add_trace(RF_CD, row=3, col=1)
fig.add_trace(ADA_CD, row=3, col=2)

# Axis titles. xaxis1 is the histogram, xaxis2/xaxis3 are row 2 and xaxis4/xaxis5 are row 3;
# the right-hand charts (xaxis3, xaxis5) have their value axis reversed.
fig['layout']['xaxis1'].update(title='Rating')
fig['layout']['xaxis2'].update(title='Coefficient')
fig['layout']['xaxis3'].update(title='Gain', autorange='reversed')
fig['layout']['xaxis4'].update(title='Feature Importance')
fig['layout']['xaxis5'].update(title='Feature Importance', autorange='reversed')

fig['layout']['yaxis1'].update(title='Count')

# Update the layout and show
fig.update_layout(height=1200, 
                  width=800, 
                  title_text="Rotten Tomatoes Results")
fig.show()

# Loading onto cloud
#py.iplot(fig, filename="RT")

### Profit Models

In [66]:
# L1-penalised logistic regression for the profit target.
# The solver is named explicitly because penalty='l1' is only accepted by some solvers:
# liblinear supports it, while the lbfgs default of scikit-learn >= 0.22 raises an error.
logreg_profit = LogisticRegression(C=0.1, 
                                   penalty='l1',
                                   solver='liblinear').fit(X_scaled, y_profit)

In [67]:
# XGBoost classifier for the profit target, with the hyper parameters written out explicitly
xgbc_profit_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 80}
xgbc_profit = XGBClassifier(**xgbc_profit_params).fit(X_scaled, y_profit)

In [68]:
# Random forest for the profit target (unlimited depth, 500 trees).
# random_state is fixed so the forest, and the feature importances plotted from it,
# come out the same on every run of the notebook.
rf_profit = RandomForestClassifier(max_depth=None,
                                   n_estimators=500,
                                   random_state=42).fit(X_scaled, y_profit)

In [69]:
# AdaBoost for the profit target, with the hyper parameters written out explicitly
ada_profit_params = {'learning_rate': 0.1, 'n_estimators': 50}
ada_profit = AdaBoostClassifier(**ada_profit_params).fit(X_scaled, y_profit)

In [93]:
# Per-model ranking tables for the profit models: every feature name with its
# coefficient / importance, sorted from largest to smallest, with the first TOP_N rows
# kept for plotting.
TOP_N = 10
profit_feature_names = X_merged.columns

lr_coefs = pd.DataFrame({'Coef': profit_feature_names,
                         'Value': logreg_profit.coef_[0]})
lr_ranked = lr_coefs.sort_values(by='Value', ascending=False)
lr_t10 = lr_ranked.head(TOP_N)

xg_coefs = pd.DataFrame({'Coef': profit_feature_names,
                         'Value': xgbc_profit.feature_importances_})
xg_ranked = xg_coefs.sort_values(by='Value', ascending=False)
xg_t10 = xg_ranked.head(TOP_N)

rf_coefs = pd.DataFrame({'Coef': profit_feature_names,
                         'Value': rf_profit.feature_importances_})
rf_ranked = rf_coefs.sort_values(by='Value', ascending=False)
rf_t10 = rf_ranked.head(TOP_N)

ada_coefs = pd.DataFrame({'Coef': profit_feature_names,
                          'Value': ada_profit.feature_importances_})
ada_ranked = ada_coefs.sort_values(by='Value', ascending=False)
ada_t10 = ada_ranked.head(TOP_N)

In [95]:
# Create the subplots: a full-width histogram in row 1, then a 2x2 grid of bar charts
fig = make_subplots(rows=3, 
                    cols=2,
                    row_heights=[0.2, 0.4, 0.4],
                    specs=[[{"type": "xy", "colspan": 2}, None],
                           [{"type": "xy"}, {"type": "xy"}], 
                           [{"type": "xy"}, {"type": "xy"}]],
                    subplot_titles=['Distribution of Profit Margins', 
                                    "Logistic Regression <br> Most Important Features", 
                                    "XG Boost <br> Most Important Features", 
                                    "Random Forest <br> Most Important Features", 
                                    "ADA Boost <br> Most Important Features"], 
                    horizontal_spacing=0.05, 
                    vertical_spacing=0.1)

# Horizontal bars of the top features per model. Only the random forest and AdaBoost traces
# appear in the legend, where their names act as the colour key (green vs. orange).
LR_CD = go.Bar(y=lr_t10['Coef'], 
               x=lr_t10['Value'], 
               name="Coefficient", 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#FF7F0E', 
               showlegend=False)
XG_CD = go.Bar(y=xg_t10['Coef'], 
               x=xg_t10['Value'], 
               name='Gain', 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#FF7F0E', 
               showlegend=False)
RF_CD = go.Bar(y=rf_t10['Coef'], 
               x=rf_t10['Value'], 
               name='Most Accurate Model', 
               hovertemplate='%{y}<extra></extra>', 
               orientation='h', 
               marker_color='#2CA02C')
ADA_CD = go.Bar(y=ada_t10['Coef'], 
                x=ada_t10['Value'], 
                name='Other Models', 
                hovertemplate='%{y}<extra></extra>', 
                orientation='h',
                marker_color='#FF7F0E')
# Histogram of the Per_Profit column read from y_wt.csv
# (the variable keeps the `imdb_CD` name used by the IMDb figure)
imdb_CD = go.Histogram(x=y['Per_Profit'], 
                       name='Count', 
                       hovertemplate="%{y}<extra></extra>",
                       marker_color='#1F77B4')

# Add the graph objects, using add_trace throughout (append_trace is deprecated)
fig.add_trace(imdb_CD, row=1, col=1)
# Vertical reference line at a profit margin of 200 (presumably the class cut-off - confirm)
fig.add_shape(go.layout.Shape(type='line', xref='x', yref='y',
                        x0=200, y0=0, x1=200, y1=450),
                        row=1, col=1)
fig.add_trace(LR_CD, row=2, col=1)
fig.add_trace(XG_CD, row=2, col=2)
fig.add_trace(RF_CD, row=3, col=1)
fig.add_trace(ADA_CD, row=3, col=2)

# Axis titles. xaxis1 is the histogram, xaxis2/xaxis3 are row 2 and xaxis4/xaxis5 are row 3;
# the right-hand charts (xaxis3, xaxis5) have their value axis reversed.
fig['layout']['xaxis1'].update(title="Profit Margin (%)")
fig['layout']['xaxis2'].update(title='Coefficient')
fig['layout']['xaxis3'].update(title='Gain', autorange='reversed')
fig['layout']['xaxis4'].update(title='Feature Importance')
fig['layout']['xaxis5'].update(title='Feature Importance', autorange='reversed')

fig['layout']['yaxis1'].update(title='Count')


# Update the layout and show
fig.update_layout(height=1200, 
                  width=800,
                  title_text="Profit Margin Results")
fig.show()

# Loading onto cloud
#py.iplot(fig, filename="Profit")