I based my initial selection for imports off the work we did in the NLP Practice breakfast hour challenge.

In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction import text 

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

import time

In [3]:
df = pd.read_csv('data/cleaned_all_text2022-06-27.csv')

df.head()

Unnamed: 0,subreddit_name,all_words
0,startrek,I’m beaming and I had to share - Sir Patrick ...
1,startrek,America and the Star Trek Universe. Roe Vs Wa...
2,startrek,Analysis: Star Trek: The Next Generation’ Gue...
3,startrek,One of the first occasions in which the word ...
4,startrek,Is A TOS Reboot Coming Soon?


# Baseline Accuracy

Our baseline accuracy is 57.2%

In [4]:
df.subreddit_name.value_counts(normalize = True)

starwars    0.572228
startrek    0.427772
Name: subreddit_name, dtype: float64

# Building Functions to Lemmatize and Stem Text

I'm using the workflow from the NLP Practice breakfast hour as a model.

In [7]:
def lemmatize_text(text):
    split_text = text.split()
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in split_text])

def stem_text(text):
    split_text = text.split()
    p_stemmer = PorterStemmer()
    return ' '.join([p_stemmer.stem(word) for word in split_text])

test_phrase1 = 'my computer computes computationally'
print(f'Test phrase: {test_phrase1}')
print(f'Test phrase lemmatized: {lemmatize_text(test_phrase1)}')
print(f'Test phrase stemmed: {stem_text(test_phrase1)}')
print('')

test_phrase2 = 'studies studying cries cry'
print(f'Test phrase: {test_phrase2}')
print(f'Test phrase lemmatized: {lemmatize_text(test_phrase2)}')
print(f'Test phrase stemmed: {stem_text(test_phrase2)}')

Test phrase: my computer computes computationally
Test phrase lemmatized: my computer computes computationally
Test phrase stemmed: my comput comput comput

Test phrase: studies studying cries cry
Test phrase lemmatized: study studying cry cry
Test phrase stemmed: studi studi cri cri


# Prepping Data for Modeling

Getting training and test data set up.

In [10]:
X = df['all_words']
y = df['subreddit_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
print('')
print('Dataframe shape:', df.shape)
print('')
print('y_train value counts:', y_train.value_counts(normalize = True))
print('y_test value counts:', y_test.value_counts(normalize = True))

X_train shape: (3753,)
y_train shape: (3753,)
X_test shape: (1252,)
y_test shape: (1252,)

Dataframe shape: (5005, 2)

y_train value counts: starwars    0.572342
startrek    0.427658
Name: subreddit_name, dtype: float64
y_test value counts: starwars    0.571885
startrek    0.428115
Name: subreddit_name, dtype: float64


# Modeling

I built a function to streamline the pipeline/gridsearch writing.\
I modeled with out adding the proper name stop words first.

I began by working on CountVectorizer and LogisticRegression, tweaking the parameters and running the model repeatedly. The score was very high to start, probably because of the proper names.

## Building a Gridsearch Function

In [45]:
def pipe_grid(pipe_params, grid_params):
    '''
    This function is designed to streamline gridsearching.
    It returns a gridsearch named 'gs'
    'pipe_params' should be a list of tuples consisting of a series of name/transform pairs followed by a name/model, 
            e.g. [('cvec', CountVectorizer()), ('log', LogisticRegression())]
    'grid_params' should be a series of parameters for those transforms and the model in the form of a dictionary,
            e.g. {'cvec__ngram_range': [(1,1), (1,2)]}
    Be sure the names for the 'pipe_params' and in the 'grid_params match'
    '''
    global gs
    
    pipe = Pipeline(pipe_params)
    
    gs = GridSearchCV(pipe, grid_params)
    
    t0 = time.time()
    return gs, print(f'Time to run function: {time.time() - t0}')
    

## Model 1 - CountVectorizer and LogististicRegression

In [None]:
Pipeline()
GridSearchCV()

In [26]:
CountVectorizer().get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [34]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [68]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.95, 0.98, 1.0],
    'cvec__max_features': [None, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-05
78.26734280586243
Best parameters: {'cvec__max_df': 0.95, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [56]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.9, 0.95],
    'cvec__max_features': [2_000, 3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
24.25687074661255
Best parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [57]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.85, 0.9],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.4066696166992188e-05
10.730860710144043
Best parameters: {'cvec__max_df': 0.85, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [61]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.75, 0.8, 0.85],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
8.003306865692139
Best parameters: {'cvec__max_df': 0.75, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [60]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.5, 0.6, 0.75],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-06
8.02409815788269
Best parameters: {'cvec__max_df': 0.5, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [62]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.3, 0.4, 0.5],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.3113021850585938e-05
8.132829189300537
Best parameters: {'cvec__max_df': 0.3, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9845456967759126
Test score: 0.9464856230031949


In [63]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.1, 0.2, 0.3],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
7.960031986236572
Best parameters: {'cvec__max_df': 0.2, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9840127897681854
Test score: 0.9464856230031949


In [65]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.1, 0.15, 0.2],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
8.004861831665039
Best parameters: {'cvec__max_df': 0.2, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9840127897681854
Test score: 0.9464856230031949


In [66]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19, 0.2, 0.21],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
8.008489847183228
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [67]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.17, 0.18, 0.19],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-05
5.394302845001221
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [69]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [2_000, 3_000, 4_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
5.5713582038879395
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [70]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [2_500, 3_000, 3_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
5.639925003051758
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [71]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [2_900, 3_000, 3_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
5.585331916809082
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [71]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [2_900, 3_000, 3_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
5.585331916809082
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [72]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': ['english'],
    'cvec__max_df': [0.18, 0.19, 0.2],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
74.4193480014801
Best parameters: {'cvec__max_df': 0.2, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__preprocessor': None}
Training score: 0.9869437783106848
Test score: 0.9400958466453674


In [73]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [2_900, 3_000, 3_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
12.486733198165894
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [77]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__min_df': [None, 0.05, 0.1],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.3113021850585938e-05


5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lorendunn/opt/anaconda3/envs/dsi/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lorendunn/opt/anaconda3/envs/dsi/lib/python3.9/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/lorendunn/opt/anaconda3/envs/dsi/lib/python3.9/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/lorendunn/opt/anaconda3/envs/dsi/lib/python3.9/s

3.8016412258148193
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__min_df': 0.05, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.8654409805488942
Test score: 0.8522364217252396


In [78]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__min_df': [0.04, 0.05, 0.06],
    'cvec__max_features': [3_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
5.262088775634766
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__min_df': 0.04, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training score: 0.8928856914468425
Test score: 0.8881789137380192


In [79]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [3_000],
    'log__C': [0.1, 0.5, 1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.198883056640625e-06
5.461315155029297
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'log__C': 1.0}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


In [80]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [3_000],
    'log__C': [0.9, 1.0, 1.1]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
5.550469160079956
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'log__C': 0.9}
Training score: 0.9829469757527312
Test score: 0.9472843450479234


In [82]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [3_000],
    'log__C': [0.9, 0.95, 1.]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
5.586487054824829
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'log__C': 0.9}
Training score: 0.9829469757527312
Test score: 0.9472843450479234


Weirdly, the above two gridsearches are saying log__C = 0.9 is the best, but I can see it's better with 1.0

In [84]:
# BEST VERSION
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,2)],
    'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19],
    'cvec__max_features': [3_000],
    'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.105170965194702
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'log__C': 1.0}
Training score: 0.9850786037836398
Test score: 0.9488817891373802


### Model 1: CountVectorizer and LogReg, Best Version

Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 3000, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'log__C': 1.0}
Training score: 0.9850786037836398
Test score: 0.9488817891373802

## Model 2: CountVectorizer and Naive Bayes

In [88]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': ['english'],
    'cvec__max_df': [0.19, 0.5, 0.9],
    'cvec__max_features': [2_000, 3_000, 5_000],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 0.00012183189392089844
393.21941208839417
Best parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 5000, 'cvec__ngram_range': (1, 1), 'cvec__preprocessor': None}
Training score: 0.968558486544098
Test score: 0.9600638977635783


In [90]:
#because best preprocessor was 'none' I'm going to check these params with stop words
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.19, 0.5, 0.9],
    'cvec__max_features': [2_000, 3_000, 5_000],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 0.0
43.8954119682312
Best parameters: {'cvec__max_df': 0.5, 'cvec__max_features': 5000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}
Training score: 0.9720223820943246
Test score: 0.9584664536741214


The training score was improved, but the test score was lower.

In [91]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.19, 0.5, 0.9],
    'cvec__max_features': [2_000, 3_000, 5_000],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 8.106231689453125e-06
24.8409903049469
Best parameters: {'cvec__max_df': 0.9, 'cvec__max_features': 5000, 'cvec__ngram_range': (1, 1)}
Training score: 0.968558486544098
Test score: 0.9600638977635783


In [92]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.85, 0.9, 0.95],
    'cvec__max_features': [4_500, 5_000, 5_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.09808349609375e-05
25.130499124526978
Best parameters: {'cvec__max_df': 0.85, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1)}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


Narrowing to: ngram_range = (1,1) since that's consistently been best so far

In [93]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.75, 0.8, 0.85],
    'cvec__max_features': [3_500, 4_000, 4_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0013580322265625e-05
6.702489137649536
Best parameters: {'cvec__max_df': 0.75, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1)}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [94]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.70, 0.75, 0.77],
    'cvec__max_features': [4_400, 4_500, 4_600],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
6.700904130935669
Best parameters: {'cvec__max_df': 0.7, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1)}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [95]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.65, 0.70, 0.71],
    'cvec__max_features': [4_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-05
2.349112033843994
Best parameters: {'cvec__max_df': 0.65, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1)}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [96]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    #'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.55, 0.60, 0.65],
    'cvec__max_features': [4_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.0067901611328125e-06
2.3762898445129395
Best parameters: {'cvec__max_df': 0.65, 'cvec__max_features': 4500}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [97]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    #'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.64, 0.65, 0.66],
    'cvec__max_features': [4_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.3555970191955566
Best parameters: {'cvec__max_df': 0.64, 'cvec__max_features': 4500}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [98]:
#BEST CVEC PARAMS
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    #'cvec__ngram_range': [(1,1)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.62, 0.63, 0.64],
    'cvec__max_features': [4_500],
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.363154888153076
Best parameters: {'cvec__max_df': 0.64, 'cvec__max_features': 4500}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [100]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.64],
    'cvec__max_features': [4_500],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-05
8.400366067886353
Best parameters: {'cvec__max_df': 0.64, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'nb__alpha': 1.0}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


In [101]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    #'cvec__preprocessor': [None, lemmatize_text, stem_text],
    #'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [0.64],
    'cvec__max_features': [4_500],
    'nb__alpha': [0.9, 1.0, 1.1]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.9141387939453125e-06
8.448508739471436
Best parameters: {'cvec__max_df': 0.64, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'nb__alpha': 1.0}
Training score: 0.9672262190247801
Test score: 0.9616613418530351


## Best Model 2, CountVectorizer and Naive Bayes

**this model is superior to model 1**

Best parameters: {'cvec__max_df': 0.64, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'nb__alpha': 1.0}
Training score: 0.9672262190247801
Test score: 0.9616613418530351

# Model 3, TfidfVectorizer and Naive Bayes

In [103]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__preprocessor': [None, lemmatize_text, stem_text],
    #'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.1, 0.5, 0.9],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.7220458984375e-06
402.31639099121094
Best parameters: {'tvec__max_df': 0.5, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__preprocessor': <function stem_text at 0x7f9568451d30>}
Training score: 0.9752198241406874
Test score: 0.9440894568690096


In [105]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.1, 0.5, 0.9],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 0.0
45.24504804611206
Best parameters: {'tvec__max_df': 0.5, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9664536741214057


In [106]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.4, 0.5, 0.6],
    'tvec__max_features': [4_000, 5_000, 6_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
45.448556900024414
Best parameters: {'tvec__max_df': 0.4, 'tvec__max_features': 4000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9832134292565947
Test score: 0.9640575079872205


In [107]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.5],
    'tvec__max_features': [4_000, 5_000, 6_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
15.6185941696167
Best parameters: {'tvec__max_df': 0.5, 'tvec__max_features': 4000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9832134292565947
Test score: 0.9640575079872205


In [108]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.4, 0.5, 0.6],
    'tvec__max_features': [5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
15.379570960998535
Best parameters: {'tvec__max_df': 0.4, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9664536741214057


In [109]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.35, 0.4, 0.45],
    'tvec__max_features': [4_500, 5_000, 5_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
45.27949118614197
Best parameters: {'tvec__max_df': 0.35, 'tvec__max_features': 4500, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9656549520766773


In [110]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.35, 0.4, 0.45],
    'tvec__max_features': [5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
4.940686941146851
Best parameters: {'tvec__max_df': 0.35, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9664536741214057


In [111]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.35, 0.4, 0.45],
    'tvec__max_features': [4_900, 5_000, 5_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
14.061980247497559
Best parameters: {'tvec__max_df': 0.35, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9672523961661342


In [112]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.35, 0.4, 0.45],
    'tvec__max_features': [4_700, 4_800, 4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
13.970189094543457
Best parameters: {'tvec__max_df': 0.35, 'tvec__max_features': 4700, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9834798827604583
Test score: 0.9656549520766773


In [113]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.25, 0.30, 0.35],
    'tvec__max_features': [4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.0531158447265625e-06
4.952348947525024
Best parameters: {'tvec__max_df': 0.25, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9834798827604583
Test score: 0.9664536741214057


In [114]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.35, 0.40],
    'tvec__max_features': [4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
3.387397050857544
Best parameters: {'tvec__max_df': 0.35, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9672523961661342


In [115]:
# BEST FUNCTION
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.40],
    'tvec__max_features': [4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.0994415283203125e-06
1.9567821025848389
Best parameters: {'tvec__max_df': 0.4, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9672523961661342


In [116]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.40],
    'tvec__max_features': [5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
1.9188129901885986
Best parameters: {'tvec__max_df': 0.4, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9664536741214057


In [117]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.40],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.814697265625e-06
4.9508209228515625
Best parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.4, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.986144417799094
Test score: 0.9656549520766773


In [118]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.40],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.9, 1.0, 1.1]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.76837158203125e-06
4.953273057937622
Best parameters: {'nb__alpha': 0.9, 'tvec__max_df': 0.4, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9840127897681854
Test score: 0.9672523961661342


## Best Model 3, TfidfVectorizer and Naive Bayes

Best parameters: {'tvec__max_df': 0.4, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}
Training score: 0.9837463362643218
Test score: 0.9672523961661342

**slightly higher accuracy, slightly more overfit. My top pick so far because overfit is ever so slight**

# Model 4, TfidfVectorizer and LogReg

In [119]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__preprocessor': [None, lemmatize_text, stem_text],
    #'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.5, 0.75, 1.0],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.867813110351562e-06
405.03462195396423
Best parameters: {'tvec__max_df': 0.5, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 1), 'tvec__preprocessor': None}
Training score: 0.9728217426059153
Test score: 0.9488817891373802


In [120]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.5, 0.75, 1.0],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
47.735355854034424
Best parameters: {'tvec__max_df': 0.5, 'tvec__max_features': 3000, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': 'english'}
Training score: 0.9757527311484147
Test score: 0.950479233226837


In [121]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.3, 0.4, 0.5],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
48.32843899726868
Best parameters: {'tvec__max_df': 0.3, 'tvec__max_features': 3000, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': 'english'}
Training score: 0.9757527311484147
Test score: 0.950479233226837


In [122]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': [None, 'english'],
    'tvec__max_df': [0.1, 0.2, 0.3],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
47.3973445892334
Best parameters: {'tvec__max_df': 0.2, 'tvec__max_features': 3000, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': 'english'}
Training score: 0.9781508126831868
Test score: 0.9512779552715654


In [124]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.15, 0.2, 0.25],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.1920928955078125e-06
6.969743251800537
Best parameters: {'tvec__max_df': 0.25, 'tvec__max_features': 3000, 'tvec__stop_words': 'english'}
Training score: 0.9778843591793233
Test score: 0.9512779552715654


In [125]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.24, 0.25, 0.26],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
6.891188144683838
Best parameters: {'tvec__max_df': 0.24, 'tvec__max_features': 3000, 'tvec__stop_words': 'english'}
Training score: 0.9778843591793233
Test score: 0.9512779552715654


In [126]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22, 0.23, 0.24],
    'tvec__max_features': [2_000, 3_000, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
7.060287952423096
Best parameters: {'tvec__max_df': 0.22, 'tvec__max_features': 3000, 'tvec__stop_words': 'english'}
Training score: 0.9778843591793233
Test score: 0.9512779552715654


In [127]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22],
    'tvec__max_features': [2_500, 3_000, 3_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.3874549865722656
Best parameters: {'tvec__max_df': 0.22, 'tvec__max_features': 3000, 'tvec__stop_words': 'english'}
Training score: 0.9778843591793233
Test score: 0.9512779552715654


In [128]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22],
    'tvec__max_features': [2_900, 3_000, 3_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
2.3951382637023926
Best parameters: {'tvec__max_df': 0.22, 'tvec__max_features': 2900, 'tvec__stop_words': 'english'}
Training score: 0.977351452171596
Test score: 0.9512779552715654


In [132]:
# BEST VERSION
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22],
    'tvec__max_features': [2_900],
    'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
2.3748159408569336
Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.22, 'tvec__max_features': 2900, 'tvec__stop_words': 'english'}
Training score: 0.9866773248068212
Test score: 0.957667731629393


In [133]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22],
    'tvec__max_features': [2_900],
    'log__C': [1.5, 2.0, 2.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.413182020187378
Best parameters: {'log__C': 2.5, 'tvec__max_df': 0.22, 'tvec__max_features': 2900, 'tvec__stop_words': 'english'}
Training score: 0.9920063948840927
Test score: 0.9584664536741214


In [134]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    #'tvec__ngram_range': [(1,1), (1,2)],
    #'tvec__preprocessor': [None, lemmatize_text, stem_text],
    'tvec__stop_words': ['english'],
    'tvec__max_df': [0.22],
    'tvec__max_features': [2_900],
    'log__C': [2.5, 3.0, 5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.0994415283203125e-06
2.4172849655151367
Best parameters: {'log__C': 5, 'tvec__max_df': 0.22, 'tvec__max_features': 2900, 'tvec__stop_words': 'english'}
Training score: 0.9952038369304557
Test score: 0.9584664536741214


## Model 4, TfidfVectorizer and LogReg Best Version

Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.22, 'tvec__max_features': 2900, 'tvec__stop_words': 'english'}
Training score: 0.9866773248068212
Test score: 0.957667731629393

# Exploring Models with Proper Name Stop Words Removed

I have lists of proper names I created in EDA. I'd like to see how the models perform when I make things a little harder for the model.

Starting with best versions of other models and iterating

In [137]:
proper_names = ['snw', 'discovery', 'force', 'federation', 'darth', 'borg', 'wars', 'anakin',\
                'lightsaber', 'vader', 'picard', 'skywalker', 'sith', 'spock', 'starfleet', 'leia',\
                'star', 'tng', 'ds9', 'reva', 'strange', 'luke', 'series', 'obi', 'clone', 'trek',\
                'enterprise', 'disney', 'wan', 'voyager', 'jedi', 'kenobi', 'tos']

stop_words = text.ENGLISH_STOP_WORDS.union(proper_names)

In [154]:
expanded_proper_names = ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia',\
                         'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet',\
                         'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation',\
                         'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf',\
                         'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith',\
                         'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']


expanded_stop_words = text.ENGLISH_STOP_WORDS.union(expanded_proper_names)

# Model 3.2, Tvec, NB, taking out top proper names and show specific references.
Starting with best version and iterating.

In [140]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [stop_words, proper_names],
    'tvec__max_df': [0.1, 0.40, 0.9],
    'tvec__max_features': [2_000, 3_000, 4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
44.57108020782471
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'neither', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'must', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'obi', 'everyone', 'because', 'anything', 'whether', 'series', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager', 'other', 'sincere', 'thereafter', 'hereafter', 'con', 'take', 'four', 'here', 'find', 'per', 'je

In [156]:
#using expanded_stop_words makes it marginally weaker, but not much, really
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.1, 0.40, 0.9],
    'tvec__max_features': [2_000, 3_000, 4_900]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
44.001856088638306
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [158]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1),(1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.05, 0.1, 0.2],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
43.8257269859314
Best parameters: {'tvec__max_df': 0.1, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

In [159]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.09, 0.1, 0.11],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
13.498803853988647
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [160]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.07, 0.08, 0.09],
    'tvec__max_features': [4_500, 4_900, 5_300]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
6.10121488571167
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager', 

In [163]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_800, 4_900, 5_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.1546919345855713
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4800, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

In [161]:
# best without changing alpha
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900, 5_300, 5_700]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.814697265625e-06
2.118048906326294
Best parameters: {'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager', 

In [166]:
# BEST MODEL 3.2, EXPANDED STOP WORDS

pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.1531660556793213
Best parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'bei

In [165]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.3, 0.5, 0.7]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.1075940132141113
Best parameters: {'nb__alpha': 0.3, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being'

## Model 3.2 Best Version, Tvec, Log

BBest parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': expanded_stop_words
Training score: 0.9536370903277378
Test score: 0.8769968051118211

## Model 2.2, Cvec, NB 
Working from best version

In [168]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [2_500, 4_500, 6_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.245208740234375e-06
43.72111892700195
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 6500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.929656274980016
Test score: 0.8793929712460063


In [169]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [5_500, 6_500, 7_500]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-06
43.68476104736328
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

No improvement in test score, less strong fit

In [170]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [6_000, 6_500, 7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.9604644775390625e-06
43.8722620010376
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [171]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.34, 0.64, 0.94],
    'cvec__max_features': [6_900, 7_000, 7_100]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.9073486328125e-06
43.96654176712036
Best parameters: {'cvec__max_df': 0.34, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [172]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.24, 0.34, 0.44],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.9141387939453125e-06
14.783697843551636
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [173]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.2, 0.24, 0.28],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
2.161890983581543
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [173]:
#BEST VERSION WITHOUT CHANGING NB PARAMETERS
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.6689300537109375e-06
2.161890983581543
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [175]:
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000],
    'nb__alpha': [0.5, 1.0, 1.5]    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.142659902572632
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [176]:
#BEST ALPHA IS 1.0
pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000],
    'nb__alpha': [0.9, 1.0, 1.1]    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.0531158447265625e-06
2.1283137798309326
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

## Best Version of Model 2.2, Cvec & NB

Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': expanded_stop_words, 'nb__alpha': 1.0}
Training score: 0.9344524380495604
Test score: 0.8801916932907349

## Model 1.2, Cvec & LogReg

In [177]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.01, 0.19, 0.37],
    'cvec__max_features': [1_500, 3_000, 4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.2159347534179688e-05
56.74627709388733
Best parameters: {'cvec__max_df': 0.19, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [178]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.01, 0.19, 0.37],
    'cvec__max_features': [4_000, 4_500, 5_000],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 3.814697265625e-06
59.06657409667969
Best parameters: {'cvec__max_df': 0.37, 'cvec__max_features': 4000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.968558486544098
Test score: 0.8306709265175719


In [179]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1),(1,2)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.09, 0.19, 0.29],
    'cvec__max_features': [4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-06
20.858755111694336
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [181]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'cvec__max_df': [0.04, 0.09, 0.14],
    'cvec__max_features': [4_500],
    #'log__C': [1.0]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
6.810006856918335
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [182]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.4066696166992188e-05
3.3841030597686768
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [183]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.5, 2.0, 2.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 7.152557373046875e-07
3.591322183609009
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [184]:
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.3, 1.5, 1.7]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.9604644775390625e-06
3.9458119869232178
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [185]:
#best version of 1.2
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.5, 1.6]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 4.76837158203125e-06
2.444786787033081
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager'

## Best 1.2 Cvec and LogReg

Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': expanded_stop_words, 'log__C': 1.5}
Training score: 0.9746869171329603
Test score: 0.8442492012779552

## Model 4.2 tvec, LogReg

In [186]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.02, 0.22, 0.42],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
47.06992697715759
Best parameters: {'tvec__max_df': 0.22, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [187]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.15, 0.22, 0.29],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 6.198883056640625e-06
47.04106307029724
Best parameters: {'tvec__max_df': 0.29, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia', 'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet', 'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation', 'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf', 'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith', 'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']}
Training score: 0.9205968558486544
Test score: 0.8338658146964856


In [188]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.20, 0.22, 0.24],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
46.90589618682861
Best parameters: {'tvec__max_df': 0.2, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager

In [189]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21, 0.22, 0.23],
    'tvec__max_features': [900, 2_900, 4_900],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
47.65503001213074
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyager',

In [190]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [3_400, 4_900, 5_400],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
16.321935176849365
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyag

In [191]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_400, 4_900, 5_300],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
16.49087882041931
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [192]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,1), (1,2), (1,3)],
    'tvec__stop_words': [expanded_stop_words, expanded_proper_names],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_400, 4_900, 5_300],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 1.0967254638671875e-05
36.74275779724121
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [193]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_800, 4_900, 5_000],
    #'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 5.0067901611328125e-06
5.021714925765991
Best parameters: {'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 'our', 'voyage

In [194]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
4.9872589111328125
Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', '

In [195]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.5, 2.0, 2.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 8.106231689453125e-06
5.038173198699951
Best parameters: {'log__C': 2.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 

In [196]:
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.4, 1.5, 1.6]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.059906005859375e-06
5.052214860916138
Best parameters: {'log__C': 1.6, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 

In [196]:
#PREFERRED VERSION OF THIS MODEL, 4.2
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.059906005859375e-06
5.052214860916138
Best parameters: {'log__C': 1.6, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'trek', 'what', 'spock', 'with', 'alone', 'republic', 'order', 'call', 'mill', 'twenty', 'anyway', 'eleven', 'seven', 'upon', 'beside', 'therein', 'across', 'ltd', 'well', 'many', 'can', 'cannot', 'be', 'empire', 'neither', 'klingon', 'do', 'now', 'serious', 'behind', 'none', 'thick', 'done', 'elsewhere', 'one', 'nine', 'jurati', 'must', 'inquisitor', 'as', 'into', 'whither', 'leia', 'whereafter', 'anyhow', 'in', 'force', 'we', 'are', 'than', 'wherever', 'very', 'your', 'too', 'besides', 'contact', 'obi', 'everyone', 'because', 'anything', 'whether', 'enough', 'him', 'against', 'whatever', 'fifty', 'meanwhile', 'discovery', 'she', 'how', 'before', 'yours', 'klingons', 'boba', 'still', 'although', 'nobody', 'through', 'least', 'made', 'top', 'own', 'would', 'whoever', 'this', 'being', 

## Best Version of 4.2 Best Log Reg for Inference
Best parameters: {'log__C': 1.6, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': expanded_stop_words
Training score: 0.9547029043431922
Test score: 0.8618210862619808