Now that I have optimized my two models, I need to train a final, complete version of each one on the full dataset, using the parameters I found to be optimal.

This means that I'll have to re-vectorize the full dataset, save that fit vectorizer (to vectorize new inputs from the webapp), and then refit each model on the re-vectorized data.

Can I just build a pipeline that includes this? I think I could build and fit a pipeline with all the optimized settings, and then run predictions on new inputs, as long as I format them the same way as the training data.

# Louis George
## Making the final models

In [1]:
import numpy as np
import pandas as pd

import spacy
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline

### Reading in the data

In [2]:
# Load the feature table (it carries the raw 'scripts' text column) and the
# table of target columns, in that order.
X, y = map(pd.read_csv, ('../../data/X_plus.csv', '../../data/y.csv'))

In [3]:
# Pull out one target Series per prediction task from the target table.
y_imdb, y_rt, y_profit = (
    y[column] for column in ('IMDb_score', 'RT_score', 'Per_Profit')
)

### Vectorizing the full dataset

In [None]:
def my_preprocessor(string):
    """Return ``string`` with every digit character removed, lowercased.

    Passed to the TF-IDF vectorizer as its ``preprocessor`` callable.
    """
    # str.isdigit() also matches non-ASCII digits, so those are dropped too
    kept_chars = (ch for ch in string if not ch.isdigit())
    return ''.join(kept_chars).lower()

In [None]:
# spaCy pipeline shared by every call to my_tokenizer. It is loaded lazily on
# first use and then reused: spacy.load() is expensive, and the vectorizer
# invokes the tokenizer once per document.
_NLP = None


def my_tokenizer(string):
    """Tokenize ``string`` with spaCy and return the lemmas of the kept tokens.

    A token is kept when it is not a stop word, not punctuation, not
    whitespace, and its text contains no newline character.

    Parameters
    ----------
    string : str
        Document text to tokenize.

    Returns
    -------
    list of str
        The lemma of each accepted token, in document order.
    """
    global _NLP
    if _NLP is None:
        # Load the English model only once instead of on every document
        _NLP = spacy.load("en_core_web_sm")

    doc = _NLP(string)
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and '\n' not in str(token)
    ]

Fitting the vectorizer runs the spaCy tokenizer over every script, so this will take a long time.

In [None]:
# Configure the vectorizer: custom preprocessing/tokenizing, unigrams through
# trigrams, with document-frequency cut-offs of 0.1 (min) and 0.9 (max).
tfidf = TfidfVectorizer(
    preprocessor=my_preprocessor,
    tokenizer=my_tokenizer,
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.9,
)
# Fit on the scripts of the full dataset
tfidf.fit(X['scripts'])

# Save the fitted vectorizer to disk for later reuse
joblib.dump(tfidf, '../models/full_tfidf.pkl')

#### Merging the transformed dataset with the other features

In [None]:
# Transforming all of the scripts (will take a while)
X_transformed = tfidf.transform(X['scripts'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Turning the sparse TF-IDF matrix into a dense dataframe, one column per term
X_vecs = pd.DataFrame(data=X_transformed.toarray(), columns=feature_names)
# Merging all of the features: the non-text columns of X (index reset so the
# rows line up positionally with X_vecs) followed by the TF-IDF columns
X_merged = pd.concat([X.drop('scripts', axis=1).reset_index(drop=True), X_vecs], axis=1)

### Modeling the full dataset

In [None]:
# Print every previously tuned model so its hyperparameters can be read off:
# the three logistic regressions first, then the three XGBoost classifiers.
tuned_model_paths = (
    '../models/IMDb_logreg.pkl',
    '../models/Rotten_logreg.pkl',
    '../models/Profit_logreg.pkl',
    '../models/IMDb_xgbc.pkl',
    '../models/Rotten_xgbc.pkl',
    '../models/Profit_xgbc.pkl',
)
for model_path in tuned_model_paths:
    print(joblib.load(model_path))

### IMDb Score Models

In [None]:
# IMDb logistic regression: L2 penalty with C=0.1, fit on the merged features
logreg = LogisticRegression(penalty='l2', C=0.1)
logreg.fit(X_merged, y_imdb)
# Save the fitted model
joblib.dump(logreg, '../models/imdb_logreg_full.pkl')

In [None]:
# Pair each feature name with its coefficient from the first row of coef_
lr_coefs = pd.DataFrame({'Coef': X_merged.columns, 'Value': logreg.coef_[0]})
# Keep the ten rows with the largest coefficient values
lr_t10 = lr_coefs.sort_values(by='Value', ascending=False).iloc[:10]

# Horizontal bar chart of those ten coefficients (absolute values)
plt.figure(figsize=(12, 7))
plt.barh(y=lr_t10['Coef'], width=lr_t10['Value'].abs())
plt.title("Features with the Highest Coefficient \n Logistic Regression")
plt.xlabel("Coefficient Value")
plt.savefig("../plots/imdb_logreg_full.png");

In [None]:
# IMDb XGBoost classifier, fit on the full merged feature set.
# - Trains on X_merged / y_imdb, the full-dataset objects built earlier in
#   this notebook (no train split is defined here).
# - The keyword is `n_estimators`; the misspelled `n_estimator` is not the
#   parameter that sets the number of boosting rounds.
xgbc = XGBClassifier(max_depth=7,
                     learning_rate=0.01,
                     n_estimators=200).fit(X_merged, y_imdb)
# Save the XGBoost model itself (not the logistic regression) under the
# xgbc filename
joblib.dump(xgbc, '../models/imdb_xgbc_full.pkl')

In [None]:
# Pair each feature name with its XGBoost feature importance. The names come
# from X_merged, the full-dataset feature frame defined in this notebook.
xg_coefs = pd.DataFrame({'Coef': X_merged.columns,
                         'Value': xgbc.feature_importances_})
# Ten most important features
xg_t10 = xg_coefs.sort_values(by='Value', ascending=False).head(10)

# Horizontal bar chart of the top ten importances
plt.figure(figsize=(12, 7))
plt.barh(xg_t10['Coef'], abs(xg_t10['Value']))
plt.title("Features with the Highest Gain \n XG Boost")
plt.xlabel("Gain")
plt.savefig("../plots/imdb_xgbc_full.png");

### RT Score Models

In [None]:
# Rotten Tomatoes logistic regression: L1 penalty with C=1.0.
# The solver is pinned to 'liblinear' because the default solver in
# scikit-learn >= 0.22 ('lbfgs') does not support the L1 penalty and raises;
# 'liblinear' was the default in earlier releases, so results there are
# unchanged.
logreg = LogisticRegression(C=1.0,
                            penalty='l1',
                            solver='liblinear').fit(X_merged, y_rt)
# Save the fitted model
joblib.dump(logreg, '../models/rt_logreg_full.pkl')

In [None]:
# Pair each feature name with its coefficient from the first row of coef_
lr_coefs = pd.DataFrame({'Coef': X_merged.columns, 'Value': logreg.coef_[0]})
# Keep the ten rows with the largest coefficient values
lr_t10 = lr_coefs.sort_values(by='Value', ascending=False).iloc[:10]

# Horizontal bar chart of those ten coefficients (absolute values)
plt.figure(figsize=(12, 7))
plt.barh(y=lr_t10['Coef'], width=lr_t10['Value'].abs())
plt.title("Features with the Highest Coefficient \n Logistic Regression")
plt.xlabel("Coefficient Value")
plt.savefig("../plots/rt_logreg_full.png");

In [None]:
# Rotten Tomatoes XGBoost classifier, fit on the full merged feature set
xgbc = XGBClassifier(learning_rate=0.1,
                     max_depth=4,
                     n_estimators=100).fit(X_merged, y_rt)
# Save the XGBoost model itself; dumping `logreg` here would write the
# logistic regression under the xgbc filename
joblib.dump(xgbc, '../models/rt_xgbc_full.pkl')

In [None]:
# Pair each feature name with its XGBoost feature importance
xg_coefs = pd.DataFrame({'Coef': X_merged.columns, 'Value': xgbc.feature_importances_})
# Keep the ten rows with the largest importance values
xg_t10 = xg_coefs.sort_values(by='Value', ascending=False).iloc[:10]

# Horizontal bar chart of those ten importances
plt.figure(figsize=(12, 7))
plt.barh(y=xg_t10['Coef'], width=xg_t10['Value'].abs())
plt.title("Features with the Highest Gain \n XG Boost")
plt.xlabel("Gain")
plt.savefig("../plots/rt_xgbc_full.png");

### Profit Models

In [None]:
# Profit logistic regression: L1 penalty with C=0.1.
# The solver is pinned to 'liblinear' because the default solver in
# scikit-learn >= 0.22 ('lbfgs') does not support the L1 penalty and raises;
# 'liblinear' was the default in earlier releases, so results there are
# unchanged.
logreg = LogisticRegression(C=0.1,
                            penalty='l1',
                            solver='liblinear').fit(X_merged, y_profit)
# Save the fitted model
joblib.dump(logreg, '../models/profit_logreg_full.pkl')

In [None]:
# Pair each feature name with its coefficient from the first row of coef_
lr_coefs = pd.DataFrame({'Coef': X_merged.columns, 'Value': logreg.coef_[0]})
# Keep the ten rows with the largest coefficient values
lr_t10 = lr_coefs.sort_values(by='Value', ascending=False).iloc[:10]

# Horizontal bar chart of those ten coefficients (absolute values)
plt.figure(figsize=(12, 7))
plt.barh(y=lr_t10['Coef'], width=lr_t10['Value'].abs())
plt.title("Features with the Highest Coefficient \n Logistic Regression")
plt.xlabel("Coefficient Value")
plt.savefig("../plots/profit_logreg_full.png");

In [None]:
# Profit XGBoost classifier, fit on the full merged feature set
xgbc = XGBClassifier(learning_rate=0.1,
                     max_depth=5,
                     n_estimators=80).fit(X_merged, y_profit)
# Save the XGBoost model itself; dumping `logreg` here would write the
# logistic regression under the xgbc filename
joblib.dump(xgbc, '../models/profit_xgbc_full.pkl')

In [None]:
# Pair each feature name with its XGBoost feature importance
xg_coefs = pd.DataFrame({'Coef': X_merged.columns, 'Value': xgbc.feature_importances_})
# Keep the ten rows with the largest importance values
xg_t10 = xg_coefs.sort_values(by='Value', ascending=False).iloc[:10]

# Horizontal bar chart of those ten importances
plt.figure(figsize=(12, 7))
plt.barh(y=xg_t10['Coef'], width=xg_t10['Value'].abs())
plt.title("Features with the Highest Gain \n XG Boost")
plt.xlabel("Gain")
plt.savefig("../plots/profit_xgbc_full.png");