I based my initial selection for imports off the work we did in the NLP Practice breakfast hour challenge.

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from functions import pipe_grid, pipe_grid_njobs, lemmatize_text, stem_text
#import functions as fun
# from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
# from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction import text 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
# from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

import time

In [24]:
df = pd.read_csv('data/cleaned_all_text2022-06-27.csv')

df.head()

Unnamed: 0,subreddit_name,all_words
0,startrek,I’m beaming and I had to share - Sir Patrick ...
1,startrek,America and the Star Trek Universe. Roe Vs Wa...
2,startrek,Analysis: Star Trek: The Next Generation’ Gue...
3,startrek,One of the first occasions in which the word ...
4,startrek,Is A TOS Reboot Coming Soon?


# Baseline Accuracy

Our baseline accuracy is 57.2%

In [25]:
df.subreddit_name.value_counts(normalize = True)

starwars    0.572228
startrek    0.427772
Name: subreddit_name, dtype: float64

# Building Functions to Lemmatize and Stem Text

I'm using the workflow from the NLP Practice breakfast hour as a model.

In [11]:
def lemmatize_text(text):
    split_text = text.split()
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in split_text])

def stem_text(text):
    split_text = text.split()
    p_stemmer = PorterStemmer()
    return ' '.join([p_stemmer.stem(word) for word in split_text])

test_phrase1 = 'my computer computes computationally'
print(f'Test phrase: {test_phrase1}')
print(f'Test phrase lemmatized: {lemmatize_text(test_phrase1)}')
print(f'Test phrase stemmed: {stem_text(test_phrase1)}')
print('')

test_phrase2 = 'studies studying cries cry'
print(f'Test phrase: {test_phrase2}')
print(f'Test phrase lemmatized: {lemmatize_text(test_phrase2)}')
print(f'Test phrase stemmed: {stem_text(test_phrase2)}')

Test phrase: my computer computes computationally
Test phrase lemmatized: my computer computes computationally
Test phrase stemmed: my comput comput comput

Test phrase: studies studying cries cry
Test phrase lemmatized: study studying cry cry
Test phrase stemmed: studi studi cri cri


# Prepping Data for Modeling

Getting training and test data set up.

In [26]:
X = df['all_words']
y = df['subreddit_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
print('')
print('Dataframe shape:', df.shape)
print('')
print('y_train value counts:', y_train.value_counts(normalize = True))
print('y_test value counts:', y_test.value_counts(normalize = True))

X_train shape: (3753,)
y_train shape: (3753,)
X_test shape: (1252,)
y_test shape: (1252,)

Dataframe shape: (5005, 2)

y_train value counts: starwars    0.572342
startrek    0.427658
Name: subreddit_name, dtype: float64
y_test value counts: starwars    0.571885
startrek    0.428115
Name: subreddit_name, dtype: float64


# Modeling

I built a function to streamline the pipeline/gridsearch writing.\
I modeled with out adding the proper name stop words first.

I began by working on CountVectorizer and LogisticRegression, tweaking the parameters and running the model repeatedly. The score was very high to start, probably because of the proper names.

## Building a Gridsearch Function

In [27]:
def pipe_grid(pipe_params, grid_params):
    '''
    This function is designed to streamline gridsearching.
    It returns a gridsearch named 'gs'
    'pipe_params' should be a list of tuples consisting of a series of name/transform pairs followed by a name/model, 
            e.g. [('cvec', CountVectorizer()), ('log', LogisticRegression())]
    'grid_params' should be a series of parameters for those transforms and the model in the form of a dictionary,
            e.g. {'cvec__ngram_range': [(1,1), (1,2)]}
    Be sure the names for the 'pipe_params' and in the 'grid_params match'
    '''
    
    global gs
    
    pipe = Pipeline(pipe_params)
    
    gs = GridSearchCV(pipe, grid_params)
   
    return gs

In [28]:
def pipe_grid_njobs(pipe_params, grid_params):
    '''
    This function is designed to streamline gridsearching.
    It returns a gridsearch named 'gs'
    'pipe_params' should be a list of tuples consisting of a series of name/transform pairs followed by a name/model, 
            e.g. [('cvec', CountVectorizer()), ('log', LogisticRegression())]
    'grid_params' should be a series of parameters for those transforms and the model in the form of a dictionary,
            e.g. {'cvec__ngram_range': [(1,1), (1,2)]}
    Be sure the names for the 'pipe_params' and in the 'grid_params match'
    '''
    
    global gs
    
    pipe = Pipeline(pipe_params)
    
    gs = GridSearchCV(pipe, grid_params, n_jobs = -1)
    
    
    return gs

## NOTE:
I developed several models in which I didn't eliminate proper names, but as it seemed too easy, per Hank's suggestion, I eliminated the proper names and franchise specific references that appeared in the top 150 words from the overall list and from each show. The models below are tuned with that list. I saved the other models but didn't include them here in order to avoid clutter. Please let me know if you want to see them.

Because I did that work, I started my models below from what I found to be the best parameters without the proper name/franchise references removed and tuned from there.

I also worked with the proper_names list (found in ['02_EDA_and_Addl_Cleaning.ipynb'](02_EDA_and_Addl_Cleaning.ipynb)), but found that was a little easier than I'd like, too, so I created expanded_stop_words, which I work with here.

# Exploring Models with Proper Name Stop Words Removed

In [29]:
proper_names = ['snw', 'discovery', 'force', 'federation', 'darth', 'borg', 'wars', 'anakin',\
                'lightsaber', 'vader', 'picard', 'skywalker', 'sith', 'spock', 'starfleet', 'leia',\
                'star', 'tng', 'ds9', 'reva', 'strange', 'luke', 'series', 'obi', 'clone', 'trek',\
                'enterprise', 'disney', 'wan', 'voyager', 'jedi', 'kenobi', 'tos']

stop_words = text.ENGLISH_STOP_WORDS.union(proper_names)

In [30]:
expanded_proper_names = ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia',\
                         'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet',\
                         'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation',\
                         'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf',\
                         'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith',\
                         'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']


expanded_stop_words = text.ENGLISH_STOP_WORDS.union(expanded_proper_names)

# Summary of Models

I explored all four combinations of CountVectorizer, TfidfVectorizer, MultinomialNB, and LogisticRegression

As measured by accuracy on test data, the best model is Model 2, which uses CountVectorizer and MultinomialNB. It also had the best fit. The best parameters were:
* Cvec max_df:
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': expanded_stop_words, 'nb__alpha': 1.0} Training score: 0.9344524380495604 Test score: 0.8801916932907349

As measured by accuracy on test data

# Model 1, TfidfVectorizer, MultinomialNB
My best TfidfVectorizer, MultinomialNB model was my best model with proper names, so I started with this model here.

In [31]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [stop_words, proper_names],
    'tvec__max_df': [0.1, 0.40, 0.9],
    'tvec__max_features': [2_000, 3_000, 4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

43.651350021362305
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'enterprise', 'every', 'toward', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'series', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made', 'but', 'through', 'upon', 

In [156]:
#using expanded_stop_words makes it marginally weaker, but not much, really
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.1, 0.40, 0.9],
    'tvec__max_features': [2_000, 3_000, 4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
44.001856088638306
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [158]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.05, 0.1, 0.2],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
43.8257269859314
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

In [159]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.09, 0.1, 0.11],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
13.498803853988647
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [160]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.07, 0.08, 0.09],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
6.10121488571167
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager', 

In [163]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_800, 4_900, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.1546919345855713
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4800, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

In [161]:
# best without changing alpha
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900, 5_300, 5_700]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.814697265625e-06
2.118048906326294
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager', 

In [15]:
# BEST MODEL 1

pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

2.1023550033569336
Best parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him',

In [165]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.3, 0.5, 0.7]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.1075940132141113
Best parameters: {'nb__alpha': 0.3, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being'

## Model 1, TfidfVectorizer and MultinomialNB, Best Parameters

BBest parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': expanded_stop_words
Training score: 0.9536370903277378
Test score: 0.8769968051118211

# Model 2, CountVectorizer, MultinomialNB 
Without proper names removed, the second best model was the optimized CountVectorizer/MultinomialNB model, so I worked through this next.

In [168]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [2_500, 4_500, 6_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.245208740234375e-06
43.72111892700195
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 6500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.929656274980016
Test score: 0.8793929712460063


In [169]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [5_500, 6_500, 7_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-06
43.68476104736328
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

No improvement in test score, less strong fit

In [170]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [6_000, 6_500, 7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.9604644775390625e-06
43.8722620010376
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [171]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [6_900, 7_000, 7_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
43.96654176712036
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [172]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.24, 0.34, 0.44],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.9141387939453125e-06
14.783697843551636
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [173]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.2, 0.24, 0.28],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
2.161890983581543
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [173]:
#BEST VERSION WITHOUT CHANGING NB PARAMETERS
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
2.161890983581543
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [175]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000],
    'nb__alpha': [0.5, 1.0, 1.5]    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.142659902572632
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [176]:
#BEST VERSION, ALPHA 1.0

pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000],
    'nb__alpha': [0.9, 1.0, 1.1]    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.0531158447265625e-06
2.1283137798309326
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

## Best Version of Model 2.2, Cvec & NB

Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': expanded_stop_words, 'nb__alpha': 1.0}
Training score: 0.9344524380495604
Test score: 0.8801916932907349

# Model 3, CountVectorizer & LogisticRegression
Without proper names removed, the optimized CountVectorizer and LogisticRegression combination was the third best, so I worked with that 3rd.

In [177]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.01, 0.19, 0.37],
    'cvec__max_features': [1_500, 3_000, 4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.2159347534179688e-05
56.74627709388733
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [178]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.01, 0.19, 0.37],
    'cvec__max_features': [4_000, 4_500, 5_000],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.814697265625e-06
59.06657409667969
Best parameters: {'cvec__max_df': 0.37, 'cvec__max_features': 4000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.968558486544098
Test score: 0.8306709265175719


In [179]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.09, 0.19, 0.29],
    'cvec__max_features': [4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-06
20.858755111694336
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [181]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.04, 0.09, 0.14],
    'cvec__max_features': [4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
6.810006856918335
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [182]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.4066696166992188e-05
3.3841030597686768
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [183]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.5, 2.0, 2.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-07
3.591322183609009
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [184]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.3, 1.5, 1.7]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.9604644775390625e-06
3.9458119869232178
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [185]:
#BEST VERSION OF 3
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.5, 1.6]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.76837158203125e-06
2.444786787033081
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

## Best 1.2 Cvec and LogReg

Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': expanded_stop_words, 'log__C': 1.5}
Training score: 0.9746869171329603
Test score: 0.8442492012779552

# Model 4, TfidfVectorizer, LogisticRegression
Of the 4 combinations I tried without proper names/show references removed, this was the least effect, so I tried it last.

In [186]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.02, 0.22, 0.42],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
47.06992697715759
Best parameters: {'tvec__max_df': 0.22, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [187]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.15, 0.22, 0.29],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.198883056640625e-06
47.04106307029724
Best parameters: {'tvec__max_df': 0.29, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.9205968558486544
Test score: 0.8338658146964856


In [188]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.20, 0.22, 0.24],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
46.90589618682861
Best parameters: {'tvec__max_df': 0.2, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [189]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21, 0.22, 0.23],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
47.65503001213074
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [190]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [3_400, 4_900, 5_400],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
16.321935176849365
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [191]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_400, 4_900, 5_300],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
16.49087882041931
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [192]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2), (1,3)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_400, 4_900, 5_300],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
36.74275779724121
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [193]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_800, 4_900, 5_000],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.0067901611328125e-06
5.021714925765991
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [194]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
4.9872589111328125
Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', '

In [195]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.5, 2.0, 2.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 8.106231689453125e-06
5.038173198699951
Best parameters: {'log__C': 2.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 

In [196]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.4, 1.5, 1.6]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.059906005859375e-06
5.052214860916138
Best parameters: {'log__C': 1.6, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 

In [16]:
#PREFERRED VERSION OF MODEL 4
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

1.894463300704956
Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by

## Best Version of 4.2 Best Log Reg for Inference
Best parameters: {'log__C': 1.6, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': expanded_stop_words
Training score: 0.9547029043431922
Test score: 0.8618210862619808

# Model 5, CountVectorizer, RandomForest

In [17]:
pipe_params = [('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__max_df': [0.1, 0.5, 1.0],
    'cvec__max_features': [1_000, 2_500, 5_000],
    'rf__n_estimators': [50, 100, 150],
    'rf__max_depth': [3, 5, 7],
    'rf__min_samples_split': [3, 5, 7],
    'rf__min_samples_leaf': [3, 5, 7]
    
}

pipe_grid_njobs(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best parameters:', gs.best_params_)
print('Training score:', gs.score(X_train, y_train))
print('Test score:', gs.score(X_test, y_test))

Time to run function: 1.1920928955078125e-06
Best parameters: {'cvec__max_df': 1.0, 'cvec__max_features': 1000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos'], 'rf__max_depth': 7, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 3, 'rf__n_estimators': 100}
Training score: 0.7708499866773248
Test score: 0.713258785942492


In [23]:
pipe_params = [('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__max_df': [1.0],
    'cvec__max_features': [500, 1000],
    'rf__n_estimators': [75, 100, 125],
    'rf__max_depth': [7, 9],
    'rf__min_samples_split': [3, 4],
    'rf__min_samples_leaf': [3, 4]
    
}

pipe_grid_njobs(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best parameters:', gs.best_params_)
print('Training score:', gs.score(X_train, y_train))
print('Test score:', gs.score(X_test, y_test))

Time to run function: 0.007119894027709961
99.50046181678772
Best parameters: {'cvec__max_df': 1.0, 'cvec__max_features': 1000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos'], 'rf__max_depth': 9, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 4, 'rf__n_estimators': 125}
Training score: 0.7807087663202771
Test score: 0.7228434504792333


In [None]:
Pipeline()

In [17]:
pipe_params = [('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__max_df': [1.0],
    'cvec__max_features': [750, 1000],
    'rf__n_estimators': [75, 100, 125],
    'rf__max_depth': [7, 9],
    'rf__min_samples_split': [3, 4],
    'rf__min_samples_leaf': [3, 4]
    
}

pipe_grid_njobs(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best parameters:', gs.best_params_)
print('Training score:', gs.score(X_train, y_train))
print('Test score:', gs.score(X_test, y_test))

95.00438189506531
Best parameters: {'cvec__max_df': 1.0, 'cvec__max_features': 750, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos'], 'rf__max_depth': 9, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 3, 'rf__n_estimators': 100}
Training score: 0.7881694644284573
Test score: 0.731629392971246


# Model 6, TfidfVecorizer, RandomForestClassifier()

In [None]:
pipe_params = [('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'tvec__max_df': [0.1, 0.5, 1.0],
    'tvec__max_features': [1_000, 2_500, 5_000],
    'rf__n_estimators': [100], #50, 150
    'rf__max_depth': [3], #5, 7
    'rf__min_samples_split': [3], #, 5, 7
    'rf__min_samples_leaf': [3] #, 5, 7
    
}

pipe_grid_njobs(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best parameters:', gs.best_params_)
print('Training score:', gs.score(X_train, y_train))
print('Test score:', gs.score(X_test, y_test))

Time to run function: 0.00506281852722168
15.203682661056519
Best parameters: {'rf__max_depth': 3, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 3, 'rf__n_estimators': 100, 'tvec__max_df': 0.5, 'tvec__max_features': 1000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.697841726618705
Test score: 0.6661341853035144


# Model 7 VotingClassifier with AdaBoosting and GradientBoosting

In [35]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.5, 1.0],
    'cvec__max_features': [1_000, 2_000],
    'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [50, 75],
    'vote__gb__n_estimators': [50, 75]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

66.90030717849731
Best Parameters: {'cvec__max_df': 1.0, 'cvec__max_features': 2000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made'

In [36]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.9, 1.0],
    'cvec__max_features': [2_000, 3_000],
    'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [75, 100],
    'vote__gb__n_estimators': [75, 100]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

94.36913919448853
Best Parameters: {'cvec__max_df': 1.0, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made'

In [37]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.95, 1.0],
    'cvec__max_features': [3_000, 4_000],
    #'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [100, 150],
    'vote__gb__n_estimators': [100, 150]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

72.05036687850952
Best Parameters: {'cvec__max_df': 0.95, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made

In [38]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_proper_names, expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.95, .96],
    'cvec__max_features': [3_000, 3_500],
    #'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [150, 200],
    'vote__gb__n_estimators': [150, 200]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

91.90894794464111
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made

In [43]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.96],
    'cvec__max_features': [3_500],
    #'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [400, 500],
    'vote__gb__n_estimators': [300, 350]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

20.20399808883667
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'made

In [42]:
#above is better

vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_stop_words],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.96],
    'cvec__max_features': [3_500],
    #'vote__tree__max_depth': [None, 5],
    'vote__ada__n_estimators': [500, 600],
    'vote__gb__n_estimators': [350, 400]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

23.373717069625854
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'mad

In [44]:
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_stop_words],
    'cvec__ngram_range': [(1,2)],
    'cvec__max_df': [0.96],
    'cvec__max_features': [3_500],
    'vote__tree__max_depth': [5, 10, 15],
    'vote__ada__n_estimators': [400],
    'vote__gb__n_estimators': [350]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

10.939864158630371
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'mad

In [45]:
#best version of model 7, with cvec__max_features: [3500]
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_stop_words],
    'cvec__ngram_range': [(1,2)],
    'cvec__max_df': [0.96],
    'cvec__max_features': [3_000, 3_500, 4_000],
    'vote__tree__max_depth': [None],
    'vote__ada__n_estimators': [400],
    'vote__gb__n_estimators': [350]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

11.841071844100952
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'mad

In [46]:
#above is better
vote = VotingClassifier([
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier())
    
])

vote_params = {
    'cvec__stop_words': [expanded_stop_words],
    'cvec__ngram_range': [(1,2)],
    'cvec__max_df': [0.96],
    'cvec__max_features': [3_400, 3_500, 3_600],
    'vote__tree__max_depth': [None],
    'vote__ada__n_estimators': [400],
    'vote__gb__n_estimators': [350]
    
}

pipe = Pipeline([('cvec', CountVectorizer()),
                 ('vote', vote)])

gs = GridSearchCV(pipe, param_grid = vote_params, cv = 3, n_jobs = -1)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print('Best Parameters:',gs.best_params_)
print('Training Score:', gs.score(X_train, y_train))
print(('Test Score:'), gs.score(X_test, y_test))

11.784019947052002
Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3400, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': frozenset({'thereby', 'thence', 'mill', 'vader', 'even', 'empty', 'contact', 'enterprise', 'every', 'toward', 'pike', 'among', 'anyone', 'hereupon', 'hereafter', 'front', 'ten', 'few', 'throughout', 'seems', 'other', 'un', 'along', 'skywalker', 'full', 'another', 'seven', 'less', 'last', 'lightsaber', 'after', 'into', 'the', 'bill', 'latterly', 'detail', 'beside', 'interest', 'yet', 'strange', 'more', 'move', 'about', 'towards', 'top', 'may', 'than', 'cry', 'nowhere', 'although', 'formerly', 'most', 'thereupon', 'kenobi', 'within', 'been', 'afterwards', 'as', 'first', 'there', 'own', 'thought', 'done', 'also', 'fill', 'because', 'while', 'still', 'have', 'many', 'will', 'must', 'any', 'becomes', 'others', 'he', 'hers', 'sith', 'fire', 'spock', 'are', 'nobody', 'anyhow', 'twelve', 'hasnt', 'back', 'fifteen', 'someone', 'hereby', 'him', 'by', 'and', 'mad

## Model 7, VoteClassifier with AdaBoost, GradientBoost, and DecisionTree

Best Parameters: {'cvec__max_df': 0.96, 'cvec__max_features': 3500, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': expanded_stop_words 'vote__ada__n_estimators': 400, 'vote__gb__n_estimators': 350, 'vote__tree__max_depth': None}
Training Score: 0.957900346389555
Test Score: 0.84185303514377

This is overfit and not as strong as my other two models.

# Adding in Post Length, Post Word Count, Average Word Length