In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction import text

from nltk.corpus import stopwords

import time

In [2]:
df = pd.read_csv('data/cleaned_all_text2022-06-27.csv')

df.head()

Unnamed: 0,subreddit_name,all_words
0,startrek,I’m beaming and I had to share - Sir Patrick ...
1,startrek,America and the Star Trek Universe. Roe Vs Wa...
2,startrek,Analysis: Star Trek: The Next Generation’ Gue...
3,startrek,One of the first occasions in which the word ...
4,startrek,Is A TOS Reboot Coming Soon?


# Baseline Accuracy

Our baseline accuracy is 57.2%

In [3]:
df.subreddit_name.value_counts(normalize = True)

starwars    0.572228
startrek    0.427772
Name: subreddit_name, dtype: float64

# Prepping Data for Modeling

Getting training and test data set up.

In [5]:
X = df['all_words']
y = df['subreddit_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
print('')
print('Dataframe shape:', df.shape)
print('')
print('y_train value counts:', y_train.value_counts(normalize = True))
print('y_test value counts:', y_test.value_counts(normalize = True))

X_train shape: (3753,)
y_train shape: (3753,)
X_test shape: (1252,)
y_test shape: (1252,)

Dataframe shape: (5005, 2)

y_train value counts: starwars    0.572342
startrek    0.427658
Name: subreddit_name, dtype: float64
y_test value counts: starwars    0.571885
startrek    0.428115
Name: subreddit_name, dtype: float64


In [6]:
def pipe_grid(pipe_params, grid_params):
    '''
    This function is designed to streamline gridsearching.
    It returns a gridsearch named 'gs'
    'pipe_params' should be a list of tuples consisting of a series of name/transform pairs followed by a name/model, 
            e.g. [('cvec', CountVectorizer()), ('log', LogisticRegression())]
    'grid_params' should be a series of parameters for those transforms and the model in the form of a dictionary,
            e.g. {'cvec__ngram_range': [(1,1), (1,2)]}
    Be sure the names for the 'pipe_params' and in the 'grid_params match'
    '''
    global gs
    
    pipe = Pipeline(pipe_params)
    
    gs = GridSearchCV(pipe, grid_params)
    
    t0 = time.time()
    return gs, print(f'Time to run function: {time.time() - t0}')

-----

I've decided to only work with the expanded stop words list. Without, it's too easy. I'm working w

In [13]:
expanded_proper_names = ['seven', 'clone', 'warp', 'borg', 'trilogy', 'contact', 'prequels', 'anakin', 'paramount', 'leia',\
                         'kirk', 'wan', 'jedi', 'kenobi', 'snw', 'wars', 'vader', 'order', 'skywalker', 'klingon', 'starfleet',\
                         'ds9', 'captain', 'maul', 'luke', 'obi', 'rebels', 'data', 'voyager', 'st', 'discovery', 'federation',\
                         'pike', 'picard', 'mandalorian', 'klingons', 'star', 'tng', 'reva', 'strange', 'disney', 'worf',\
                         'riker', 'empire', 'jurati', 'palpatine', 'yoda', 'force', 'darth', 'republic', 'lightsaber', 'sith',\
                         'spock', 'boba', 'fett', 'thought', 'inquisitor', 'trek', 'enterprise', 'tos']


expanded_stop_words = text.ENGLISH_STOP_WORDS.union(expanded_proper_names)

In [16]:
# BEST MODEL 3.2, EXPANDED STOP WORDS: Training score: 0.9536370903277378 Test score: 0.8769968051118211

pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.170448064804077
Best parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'several', 'such', 'while', 'tng', 'serious', 'via', 'throughout', 'find', 'not', 'anyone', 'above', 'get', 'its', 'at', 'clone', 'up', 'amoungst', 'your', 'now', 'when', 'picard', 'thereby', 'keep', 'see', 'four', 'almost', 'ourselves', 'herein', 'further', 'forty', 'are', 'have', 'maul', 'mill', 'again', 'hereupon', 'onto', 'anyhow', 'show', 'will', 'during', 'you', 'per', 'give', 'be', 'sixty', 'everything', 'whoever', 'mandalorian', 'part', 'cry', 'must', 'meanwhile', 'amount', 'reva', 'klingons', 'many', 'him', 'though', 'own', 'republic', 'each', 'always', 'trek', 'yourself', 'towards', 'it', 'name', 'then', 'someone', 'eg', 'often', 'that', 'am', 'i', 'yet', 'besides', 'move', 'might', 'go', 'system', 'within', 'latterly', 'under', 'is', 'ours', 'nobody', 'themselves', 'whe

In [16]:
# BEST MODEL 3.2, EXPANDED STOP WORDS: Training score: 0.9536370903277378 Test score: 0.8769968051118211

pipe_params = [('tvec', TfidfVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'tvec__ngram_range': [(1,1)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.09],
    'tvec__max_features': [4_900],
    'nb__alpha': [0.5, 1.0, 1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.1457672119140625e-06
2.170448064804077
Best parameters: {'nb__alpha': 0.5, 'tvec__max_df': 0.09, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': frozenset({'several', 'such', 'while', 'tng', 'serious', 'via', 'throughout', 'find', 'not', 'anyone', 'above', 'get', 'its', 'at', 'clone', 'up', 'amoungst', 'your', 'now', 'when', 'picard', 'thereby', 'keep', 'see', 'four', 'almost', 'ourselves', 'herein', 'further', 'forty', 'are', 'have', 'maul', 'mill', 'again', 'hereupon', 'onto', 'anyhow', 'show', 'will', 'during', 'you', 'per', 'give', 'be', 'sixty', 'everything', 'whoever', 'mandalorian', 'part', 'cry', 'must', 'meanwhile', 'amount', 'reva', 'klingons', 'many', 'him', 'though', 'own', 'republic', 'each', 'always', 'trek', 'yourself', 'towards', 'it', 'name', 'then', 'someone', 'eg', 'often', 'that', 'am', 'i', 'yet', 'besides', 'move', 'might', 'go', 'system', 'within', 'latterly', 'under', 'is', 'ours', 'nobody', 'themselves', 'whe

In [15]:
# Best Model 2.2 Training score: 0.9344524380495604 Test score: 0.8801916932907349
#BEST VERSION SO FAR

pipe_params = [('cvec', CountVectorizer()), 
               ('nb', MultinomialNB())]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.24],
    'cvec__max_features': [7_000],
    'nb__alpha': [1.0]    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 2.86102294921875e-06
0.8099210262298584
Best parameters: {'cvec__max_df': 0.24, 'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'several', 'such', 'while', 'tng', 'serious', 'via', 'throughout', 'find', 'not', 'anyone', 'above', 'get', 'its', 'at', 'clone', 'up', 'amoungst', 'your', 'now', 'when', 'picard', 'thereby', 'keep', 'see', 'four', 'almost', 'ourselves', 'herein', 'further', 'forty', 'are', 'have', 'maul', 'mill', 'again', 'hereupon', 'onto', 'anyhow', 'show', 'will', 'during', 'you', 'per', 'give', 'be', 'sixty', 'everything', 'whoever', 'mandalorian', 'part', 'cry', 'must', 'meanwhile', 'amount', 'reva', 'klingons', 'many', 'him', 'though', 'own', 'republic', 'each', 'always', 'trek', 'yourself', 'towards', 'it', 'name', 'then', 'someone', 'eg', 'often', 'that', 'am', 'i', 'yet', 'besides', 'move', 'might', 'go', 'system', 'within', 'latterly', 'under', 'is', 'ours', 'nobody', 'themselves', 'whereby', 'a', 'what',

In [17]:
#best version of 1.2 with expanded stop word list Training score: 0.9746869171329603 # Test score: 0.8442492012779552
#2.2 still superior
pipe_params = [('cvec', CountVectorizer()), 
               ('log', LogisticRegression(max_iter = 10_000))]

grid_params = {
    'cvec__ngram_range': [(1,1)],
    'cvec__stop_words': [expanded_stop_words],
    'cvec__max_df': [0.09],
    'cvec__max_features': [4_500],
    'log__C': [1.5, 1.6]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
2.4627199172973633
Best parameters: {'cvec__max_df': 0.09, 'cvec__max_features': 4500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': frozenset({'several', 'such', 'while', 'tng', 'serious', 'via', 'throughout', 'find', 'not', 'anyone', 'above', 'get', 'its', 'at', 'clone', 'up', 'amoungst', 'your', 'now', 'when', 'picard', 'thereby', 'keep', 'see', 'four', 'almost', 'ourselves', 'herein', 'further', 'forty', 'are', 'have', 'maul', 'mill', 'again', 'hereupon', 'onto', 'anyhow', 'show', 'will', 'during', 'you', 'per', 'give', 'be', 'sixty', 'everything', 'whoever', 'mandalorian', 'part', 'cry', 'must', 'meanwhile', 'amount', 'reva', 'klingons', 'many', 'him', 'though', 'own', 'republic', 'each', 'always', 'trek', 'yourself', 'towards', 'it', 'name', 'then', 'someone', 'eg', 'often', 'that', 'am', 'i', 'yet', 'besides', 'move', 'might', 'go', 'system', 'within', 'latterly', 'under', 'is', 'ours', 'nobody', 'themselves', 'whereby', 'a', 'what', 

In [18]:
#PREFERRED VERSION OF THIS MODEL, 4.2, expanded stop word list Training score: 0.9547029043431922 Test score: 0.8618210862619808
#2.2 is the best version, but this is the best logreg for purposes of inference.
pipe_params = [('tvec', TfidfVectorizer()), 
               ('log', LogisticRegression(max_iter=10_000))]

grid_params = {
    'tvec__ngram_range': [(1,2)],
    'tvec__stop_words': [expanded_stop_words],
    'tvec__max_df': [0.21],
    'tvec__max_features': [4_900],
    'log__C': [1.5]
    
}

pipe_grid(pipe_params, grid_params)

t0 = time.time()
gs.fit(X_train, y_train)
print(time.time()-t0)

print(f'Best parameters: {gs.best_params_}')
print(f'Training score: {gs.score(X_train, y_train)}')
print(f'Test score: {gs.score(X_test, y_test)}')

Time to run function: 9.5367431640625e-07
1.9262306690216064
Best parameters: {'log__C': 1.5, 'tvec__max_df': 0.21, 'tvec__max_features': 4900, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': frozenset({'several', 'such', 'while', 'tng', 'serious', 'via', 'throughout', 'find', 'not', 'anyone', 'above', 'get', 'its', 'at', 'clone', 'up', 'amoungst', 'your', 'now', 'when', 'picard', 'thereby', 'keep', 'see', 'four', 'almost', 'ourselves', 'herein', 'further', 'forty', 'are', 'have', 'maul', 'mill', 'again', 'hereupon', 'onto', 'anyhow', 'show', 'will', 'during', 'you', 'per', 'give', 'be', 'sixty', 'everything', 'whoever', 'mandalorian', 'part', 'cry', 'must', 'meanwhile', 'amount', 'reva', 'klingons', 'many', 'him', 'though', 'own', 'republic', 'each', 'always', 'trek', 'yourself', 'towards', 'it', 'name', 'then', 'someone', 'eg', 'often', 'that', 'am', 'i', 'yet', 'besides', 'move', 'might', 'go', 'system', 'within', 'latterly', 'under', 'is', 'ours', 'nobody', 'themselves', 'whereby'