# Table of Contents<a id="top"></a>
- [Importing Libraries](#import)
- [Importing Scraped Data](#data)
- [Count Vectorizer Models - No Stemming/Lemmatization, Stop Words Removed](#cvec1)
- [TFIDF Vectorizer Models - No Stemming/Lemmatization, Stop Words Removed](#tvec1)
- [Function to Tokenize, Lemmatize and Stem Posts](#func)
- [Count Vectorizer Models - Lemmatized, Stop Words Removed](#cvec2)
- [TFIDF Vectorizer Models - Lemmatized, Stop Words Removed](#tvec2)
- [Count Vectorizer Models - Stemmed, Stop Words Removed](#cvec3) <-- Best Performing Vectorized Model
- [TFIDF Vectorizer Models - Stemmed, Stop Words Removed](#tvec3)
- [Naive Bayes Models](#nb)
- [KNN, Random Forest, SVM](#other)

### Importing Libraries <a id="import"></a>

In [1]:
# Import Libraries

import pandas as pd
import numpy as np

# Library tools to turn text in to interpretable DataFrames
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
# Object that chains the vectorizer and Logistic Regression into a single estimator
from sklearn.pipeline import Pipeline
# Split data to check train model, Input parameters to create best model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import stop_words
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Importing lemmatizer.
import nltk
from nltk.stem import WordNetLemmatizer

# Importing stemmer.
from nltk.stem.porter import PorterStemmer

# Import Tokenizer
from nltk.tokenize import RegexpTokenizer

# Naive Bayes Models 
from sklearn.naive_bayes import MultinomialNB, GaussianNB

### Importing Scraped Data <a id="data"></a>

In [2]:
# Load the scraped posts and drop the 'Unnamed: 0' column (presumably a saved index).
text_df = pd.read_csv('./datasets/text_df.csv').drop('Unnamed: 0', axis=1)

In [3]:
# Look at one raw post (row label 10) to see what the text looks like before any cleaning.
text_df['text'][10]

'The Unofficial Rewatch Thread - S3E02 "Run" **From TV Guide:** Rebecca defends ACN again as another lawsuit looms; Neal could be in trouble after a dangerous leak; Charlie and Leona confront a hostile takeover attempt by Reese\'s half-siblings; Sloan worries that Don has crossed an ethical line; Hallie regrets a late-night tweet; Maggie weighs the pros and cons of eavesdropping.\n\n**From IMDb:** While Rebecca must once again defend ACN during a possible lawsuit, Will tries to protect Neal from the aftermath of the DOD leak; Charlie and Leona deal with a hostile takeover; Sloan worries about Don\'s involvement with insider information.\n\n**Original Discussion Thread [HERE](https://www.reddit.com/r/Thenewsroom/comments/2mips3/episode_discussion_s03e02_run/)**'

### Count Vectorizer Models - No Stemming/Lemmatization, Stop Words Removed <a id="cvec1"></a>

In [4]:
# Features are the raw post text; the target is the 'class' column.
X = text_df['text']
y = text_df['class']

In [None]:
# Train/test split, stratified on the target, with a fixed random_state for repeatability.

# The sample is large, and character names were being missed as indicators when they did not make it
# into the training data, so only 10% is held out (test_size=0.1).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

In [None]:
# Pipeline: CountVectorizer (built-in English stop words removed) feeding a logistic regression
# with default settings. The vectorizer hyperparameters are tuned by the grid searches below.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First, coarse grid search over the CountVectorizer settings, using 3-fold cross-validation
# on the training split only.
pipe_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
# Report the best cross-validated score and the parameter combination that produced it.
print(gs.best_score_)
gs.best_params_





0.846711259754738




{'cvec__max_df': 0.9,
 'cvec__max_features': 3000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2)}

In [None]:
# Second round: narrow the grid around the previous winner (max_features ~3000, max_df ~0.9)
# and keep only the (1, 2) n-gram range.
pipe_params = {
    'cvec__max_features': [2900, 3000, 3100],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.87, .9, .93],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_



In [None]:
# Third round: max_df fixed at 0.87, smaller max_features values tried.
# NOTE(review): this round uses cv=5 while the earlier rounds use cv=3, so its best_score_
# is presumably not directly comparable with theirs — confirm this was intended.
pipe_params = {
    'cvec__max_features': [2700, 2800, 2900],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.87],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Model is overfit. How to fix? 
# Keep the predictions for both splits; the next cells use them to inspect the misclassified posts.
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

In [None]:
# Pair every post with its true label and its predicted label, one frame per split,
# so the false predictions can be read side by side with the text.

train_df = pd.DataFrame({'text': X_train}).assign(actual_y=y_train, pred_y=y_pred_train)

test_df = pd.DataFrame({'text': X_test}).assign(actual_y=y_test, pred_y=y_pred_test)

In [None]:
# Out of curiosity - reading the incorrect predictions in the overfit train data and in the test data.
# NOTE(review): both lines are bare expressions; if this runs as a single notebook cell,
# presumably only the last one (test_df) is displayed — confirm, or split into two cells.
train_df[train_df.pred_y != train_df.actual_y]
test_df[test_df.pred_y != test_df.actual_y]


In [None]:
# Checking to see what words are showing up: fit a stand-alone CountVectorizer on the
# training text and list the 15 terms with the highest total count.
# NOTE(review): max_features=3700 / min_df=2 differ from the values explored by the grid
# searches above — confirm that is intentional for this exploratory look.

cvec_all_text = CountVectorizer(
    stop_words='english',
    ngram_range=(1,2),
    min_df=2,
    max_df=0.87,
    max_features=3700,
).fit(X_train)

# Dense term-count frame, one column per vocabulary term.
X_train_cv = pd.DataFrame(
    cvec_all_text.transform(X_train).toarray(),
    columns=cvec_all_text.get_feature_names(),
)

X_train_cv.sum().sort_values(ascending=False).head(15)

### TFIDF Vectorizer Models - No Stemming/Lemmatization, Stop Words Removed <a id="tvec1"></a>

[return to top](#top)

In [None]:
# Pipeline: TfidfVectorizer (built-in English stop words removed) feeding a default logistic regression.
pipe = Pipeline([('tvec', TfidfVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First, coarse grid over the TF-IDF vectorizer settings (3-fold CV on the training split).
pipe_params = {
    'tvec__max_features': [2500, 3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.7, .8, .9],
    'tvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Second round: explore much lower max_df values and keep only the (1, 2) n-gram range.
pipe_params = {
    'tvec__max_features': [2900, 3000, 3100],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.1, .3, .7],
    'tvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Third round: max_features and min_df fixed; fine-tune max_df around 0.3 and also try trigrams.
pipe_params = {
    'tvec__max_features': [3000],
    'tvec__min_df': [2],
    'tvec__max_df': [.2, .3, .4],
    'tvec__ngram_range': [(1,2), (1,3)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned TF-IDF search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Model is overfit. How to fix? 
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

### Function to Tokenize, Lemmatize and Stem Posts<a id="func"></a>

[return to top](#top)

In [None]:
# Tokenize, lemmatizing, and stemming function 
import time

# Instantiating
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

def lem_stem(df):
    """Add lemmatized and stemmed versions of every post to ``df``.

    Each value of ``df['text']`` is lower-cased and split with the module-level
    ``tokenizer``; every token is then passed through ``lemmatizer`` and
    ``p_stemmer``. The processed tokens are glued back together with one space
    in front of each token (so a non-empty result starts with a space, exactly
    as before) and stored in the new columns ``'lem_text'`` and ``'stem_text'``.

    The start timestamp, a progress line every 100 posts, and the elapsed
    seconds are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns added.
    """
    start = time.time()
    print(start)

    lem_posts = []
    stem_posts = []

    # Iterate by position, not by label: the old ``df['text'][i]`` lookup only
    # worked when the index happened to be 0..n-1.
    for i, post in enumerate(df['text']):
        tokens = tokenizer.tokenize(post.lower())

        lem_posts.append(''.join(' ' + lemmatizer.lemmatize(token) for token in tokens))
        stem_posts.append(''.join(' ' + p_stemmer.stem(token) for token in tokens))

        if i % 100 == 0:
            print(f'{i} posts complete.')

    # Assign whole columns at once. The previous per-row chained assignment
    # (``df['lem_text'][i] = ...``) is not guaranteed by pandas to write through
    # to ``df`` and was also the main reason the function was so slow.
    df['lem_text'] = lem_posts
    df['stem_text'] = stem_posts

    end = time.time()
    print(end - start)
    return df


In [None]:
# Commenting out calling this code because it was extremely time intensive and I do not want to run it again 
# by mistake. If I could spend more time on this project, I would like to find a way to do this that is less
# computationally intensive.

# lem_stem(text_df)
# text_df.to_csv('./lem_stem.csv')

# Saving csv and loading below

In [None]:
# Load the pre-computed lemmatized/stemmed posts and drop the 'Unnamed: 0' column
# (presumably a saved index).
lem_stem_df = pd.read_csv('./lem_stem.csv').drop('Unnamed: 0', axis=1)

### Count Vectorizer Models - Lemmatized, Stop Words Removed <a id="cvec2"></a>

[return to top](#top)

In [None]:
# Switch the features to the lemmatized text; the target is unchanged.
X = lem_stem_df['lem_text']
y = lem_stem_df['class']

In [None]:
# Stratified split with 10% held out and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

In [None]:
# Pipeline: CountVectorizer (English stop words removed) feeding a default logistic regression.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First, coarse grid over the CountVectorizer settings (3-fold CV on the training split).
pipe_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Second round: tighter max_features, lower max_df values, bigrams only.
pipe_params = {
    'cvec__max_features': [2900, 3000, 3100],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.75, .85, .9],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Third round: everything fixed except max_df, which is pushed much lower.
pipe_params = {
    'cvec__max_features': [3000],
    'cvec__min_df': [2],
    'cvec__max_df': [.15, .35, .75],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Fourth round: fine-tune max_df in small steps around 0.15.
pipe_params = {
    'cvec__max_features': [3000],
    'cvec__min_df': [2],
    'cvec__max_df': [.14, .15, .16],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Model accuracy is almost identical to the non-lemmatized model.
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

### TFIDF Vectorizer Models - Lemmatized, Stop Words Removed <a id="tvec2"></a>

[return to top](#top)

In [None]:
# Pipeline: TfidfVectorizer (English stop words removed) feeding a default logistic regression.
pipe = Pipeline([('tvec', TfidfVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First grid over the TF-IDF settings: low max_df values, bigrams vs. trigrams.
pipe_params = {
    'tvec__max_features': [2000, 3000],
    'tvec__min_df': [2],
    'tvec__max_df': [.2, .3, .4],
    'tvec__ngram_range': [(1,2), (1,3)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Second round: max_features around 3000, max_df 0.25 vs. 0.3, bigrams only.
pipe_params = {
    'tvec__max_features': [2900, 3000, 3100],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.25, .3],
    'tvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Third round: slightly larger vocabularies and max_df values at or below 0.25.
pipe_params = {
    'tvec__max_features': [3100, 3200, 3300],
    'tvec__min_df': [2],
    'tvec__max_df': [.1, .2, .25],
    'tvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Model accuracy is almost identical to the non-lemmatized model, just slightly better.
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

### Count Vectorizer Models - Stemmed, Stop Words Removed <a id="cvec3"></a>

[return to top](#top)

In [None]:
# Switch the features to the stemmed text; the target is unchanged.
X = lem_stem_df['stem_text']
y = lem_stem_df['class']

In [None]:
# Stratified split with 10% held out and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

In [None]:
# Pipeline: CountVectorizer (English stop words removed) feeding a default logistic regression.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First, coarse grid over the CountVectorizer settings, starting from low max_df values.
pipe_params = {
    'cvec__max_features': [2000, 3000, 4000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.05, .15, .3],
    'cvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Second round: larger vocabularies around 4000 and max_df between 0.23 and 0.35, bigrams only.
pipe_params = {
    'cvec__max_features': [3500, 4000, 4500],
    'cvec__min_df': [2],
    'cvec__max_df': [.23, .3, .35],
    'cvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Third round: fine-tune max_df around 0.23 and, for the first time, set the logistic
# regression's own C and penalty instead of leaving them at their defaults.
# NOTE(review): the results summary further down lists max_features=4100 for this model
# (and the later Naive Bayes cells use 4100), but this grid only tries 4050 — confirm
# which value was actually selected.
pipe_params = {
    'cvec__max_features': [4050],
    'cvec__min_df': [2],
    'cvec__max_df': [.22, .23, .24],
    'cvec__ngram_range': [(1,2)],
    'lr__C' : [.08],
    'lr__penalty': ['l2']
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Best version of three very similar models
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

In [None]:
# Start a frame of held-out labels; each model's test predictions are added to it as a column later.
preds = pd.DataFrame(y_test)

In [None]:
# Test-set predictions of the tuned logistic-regression pipeline.
preds['lr'] = gs.predict(X_test)

### TFIDF Vectorizer Models - Stemmed, Stop Words Removed <a id="tvec3"></a>

[return to top](#top)

In [None]:
# Pipeline: TfidfVectorizer (English stop words removed) feeding a default logistic regression.
pipe = Pipeline([('tvec', TfidfVectorizer(stop_words='english')),
                 ('lr', LogisticRegression())])

In [None]:
# First grid over the TF-IDF settings (3-fold CV on the training split).
pipe_params = {
    'tvec__max_features': [2600, 3100, 3600],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.2, .25, .3],
    'tvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Second round: min_df fixed at 3, bigrams only; fine-tune max_features and max_df.
pipe_params = {
    'tvec__max_features': [2700, 2800, 2950],
    'tvec__min_df': [3],
    'tvec__max_df': [.27, .28, .29],
    'tvec__ngram_range': [(1,2)]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score the tuned search object on the training rows and on the held-out test rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

# Identical performance to the lemmatized data
y_pred_test = gs.predict(X_test)
y_pred_train = gs.predict(X_train)

### Naive Bayes Models <a id="nb"></a>

[return to top](#top)

#### Models to run
I will be creating 6 total Naive Bayes models to compare based on my best found hyperparameters for each of the 6 types of Vectorized models that I ran. I will be running MultinomialNB on my count vectorizer models and GaussianNB on my TFIDF models.

- Count Vectorizer - No Stemming/Lemmatization, Stop Words Removed
    - {'cvec__max_df': 0.87, 'cvec__max_features': 2900, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}
    - Train accuracy score = 98.0%
    - Test accuracy score = 86.5%
- TFIDF Vectorizer - No Stemming/Lemmatization, Stop Words Removed
    - {'tvec__max_df': 0.3, 'tvec__max_features': 3000, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 2)}
    - Train accuracy score = 95.3%
    - Test accuracy score = 87.5%
- Count Vectorizer - Lemmatized, Stop Words Removed
    - {'cvec__max_df': 0.15, 'cvec__max_features': 3000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2)}
    - Train accuracy score = 97.9%
    - Test accuracy score = 85.5%
- TFIDF Vectorizer - Lemmatized, Stop Words Removed
    - {'tvec__max_df': 0.25, 'tvec__max_features': 3100, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 2)}
    - Train accuracy score = 95.6%
    - Test accuracy score = 87.5%
- Count Vectorizer - Stemmed, Stop Words Removed <-- Best Performing Vectorized Model
    - {'cvec__max_df': 0.23, 'cvec__max_features': 4100, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2)}
    - Train accuracy score = 98.5%
    - Test accuracy score = 87.5%
- TFIDF Vectorizer - Stemmed, Stop Words Removed
    - {'tvec__max_df': 0.29, 'tvec__max_features': 2950, 'tvec__min_df': 3, 'tvec__ngram_range': (1, 2)}
    - Train accuracy score = 95.5%
    - Test accuracy score = 87.5%

In [None]:
# MultinomialNB on count features of the raw (unstemmed, unlemmatized) text, using the best
# CountVectorizer hyperparameters listed in the summary above.
X = lem_stem_df['text']
y = lem_stem_df['class']

# Split the raw text BEFORE vectorizing so the vocabulary is learned from the training rows
# only. Fitting the vectorizer on all of X lets test-set documents influence the features,
# unlike the Pipeline/GridSearchCV models this is being compared against.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

# min_df=3 is part of the best parameter set reported for this model and was missing here.
vect = CountVectorizer(max_df=0.87, max_features=2900, min_df=3, ngram_range=(1,2), stop_words='english')
vect.fit(X_train_raw)

X_train = vect.transform(X_train_raw)
X_test = vect.transform(X_test_raw)

nb = MultinomialNB()
nb.fit(X_train, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, nb.predict(X_train))}')
print(f'Test score: {metrics.accuracy_score(y_test, nb.predict(X_test))}')

In [None]:
# GaussianNB on TF-IDF features of the text currently held in X, using the best TF-IDF
# hyperparameters listed in the summary above.

# Split the raw text BEFORE vectorizing so the vocabulary and IDF weights are learned from
# the training rows only; fitting on all of X leaks test-set information into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

tvect = TfidfVectorizer(max_df=0.3, max_features=3000, min_df=2, ngram_range=(1,2), stop_words='english')
tvect.fit(X_train_raw)

X_train = tvect.transform(X_train_raw)
X_test = tvect.transform(X_test_raw)

# GaussianNB is given dense arrays rather than the sparse vectorizer output.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

gb = GaussianNB()
gb.fit(X_train_dense, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, gb.predict(X_train_dense))}')
print(f'Test score: {metrics.accuracy_score(y_test, gb.predict(X_test_dense))}')

In [None]:
# MultinomialNB on count features of the lemmatized text, using the best CountVectorizer
# hyperparameters listed in the summary above.
X = lem_stem_df['lem_text']
y = lem_stem_df['class']

# Split the raw text BEFORE vectorizing so the vocabulary is learned from the training rows
# only; fitting the vectorizer on all of X leaks test-set information into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

vect = CountVectorizer(max_df=0.15, max_features=3000, min_df=2, ngram_range=(1,2), stop_words='english')
vect.fit(X_train_raw)

X_train = vect.transform(X_train_raw)
X_test = vect.transform(X_test_raw)

nb = MultinomialNB()
nb.fit(X_train, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, nb.predict(X_train))}')
print(f'Test score: {metrics.accuracy_score(y_test, nb.predict(X_test))}')

In [None]:
# GaussianNB on TF-IDF features of the text currently held in X, using the best TF-IDF
# hyperparameters listed in the summary above.

# Split the raw text BEFORE vectorizing so the vocabulary and IDF weights are learned from
# the training rows only; fitting on all of X leaks test-set information into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

tvect = TfidfVectorizer(max_df=0.25, max_features=3100, min_df=2, ngram_range=(1,2), stop_words='english')
tvect.fit(X_train_raw)

X_train = tvect.transform(X_train_raw)
X_test = tvect.transform(X_test_raw)

# GaussianNB is given dense arrays rather than the sparse vectorizer output.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

gb = GaussianNB()
gb.fit(X_train_dense, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, gb.predict(X_train_dense))}')
print(f'Test score: {metrics.accuracy_score(y_test, gb.predict(X_test_dense))}')

In [None]:
# MultinomialNB on count features of the stemmed text, using the best CountVectorizer
# hyperparameters listed in the summary above.
X = lem_stem_df['stem_text']
y = lem_stem_df['class']

# Split the raw text BEFORE vectorizing so the vocabulary is learned from the training rows
# only; fitting the vectorizer on all of X leaks test-set information into the features.
# The split arguments are unchanged, so the test rows keep the same order as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

vect = CountVectorizer(max_df=0.23, max_features=4100, min_df=2, ngram_range=(1,2), stop_words='english')
vect.fit(X_train_raw)

X_train = vect.transform(X_train_raw)
X_test = vect.transform(X_test_raw)

nb = MultinomialNB()
nb.fit(X_train, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, nb.predict(X_train))}')
print(f'Test score: {metrics.accuracy_score(y_test, nb.predict(X_test))}')

In [None]:
# Test-set predictions of the MultinomialNB model fit on the stemmed count features.
preds['nb'] = nb.predict(X_test)

In [None]:
# GaussianNB on TF-IDF features of the text currently held in X, using the best TF-IDF
# hyperparameters listed in the summary above.

# Split the raw text BEFORE vectorizing so the vocabulary and IDF weights are learned from
# the training rows only; fitting on all of X leaks test-set information into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

tvect = TfidfVectorizer(max_df=0.29, max_features=2950, min_df=3, ngram_range=(1,2), stop_words='english')
tvect.fit(X_train_raw)

X_train = tvect.transform(X_train_raw)
X_test = tvect.transform(X_test_raw)

# GaussianNB is given dense arrays rather than the sparse vectorizer output.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

gb = GaussianNB()
gb.fit(X_train_dense, y_train)

print(f'Train score: {metrics.accuracy_score(y_train, gb.predict(X_train_dense))}')
print(f'Test score: {metrics.accuracy_score(y_test, gb.predict(X_test_dense))}')

### KNN, Random Forest, SVM <a id="other"></a>

[return to top](#top)

In [None]:
# The remaining model families are all trained on the stemmed text.
X = lem_stem_df['stem_text']
y = lem_stem_df['class']

In [None]:
# Stratified split with 10% held out and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

In [None]:
# Pipeline: CountVectorizer (English stop words removed) feeding a k-nearest-neighbours classifier.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('knn', KNeighborsClassifier())])

In [None]:
# Every parameter has a single candidate value, so this "search" fits one fixed configuration
# (a 50-term vocabulary, 21 neighbours) and cross-validates it with 3 folds.
pipe_params = {
    'cvec__max_features': [50],
    'cvec__min_df': [2],
    'cvec__max_df': [.34],
    'cvec__ngram_range': [(1,2)],
    'knn__n_neighbors': [21],
    'knn__metric': ['euclidean']
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
# NOTE(review): this bare expression is not the last statement of the cell, so its value
# is presumably not displayed — confirm.
gs.best_params_

print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

In [None]:
# Test-set predictions of the KNN pipeline.
preds['knn'] = gs.predict(X_test)

In [None]:
# Pipeline: CountVectorizer (English stop words removed) feeding a random forest.
# NOTE(review): no random_state is passed to the forest, so results may vary between runs — confirm.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('rf', RandomForestClassifier())])

In [None]:
# Every parameter has a single candidate value, so this fits one fixed random-forest
# configuration and cross-validates it with 3 folds.
pipe_params = {
    'cvec__max_features': [2000],
    'cvec__min_df': [3],
    'cvec__max_df': [.8],
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators': [700], 
    'rf__max_depth' : [40],
    'rf__min_samples_split': [200], 
    'rf__min_samples_leaf': [3]
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)


# Compare accuracy on the training rows with accuracy on the held-out rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')
gs.best_params_

In [None]:
# Test-set predictions of the random-forest pipeline.
preds['rf'] = gs.predict(X_test)

In [None]:
# Pipeline: CountVectorizer (English stop words removed) feeding a support vector classifier.
pipe = Pipeline([('cvec', CountVectorizer(stop_words='english')),
                 ('svc', SVC())])

In [None]:
# Fixed 300-term vocabulary; only the SVC's C and gamma are actually searched (2 x 2 combinations).
# NOTE(review): 'svc__degree' presumably only matters for the polynomial kernel and has no
# effect with kernel='rbf' — confirm and drop it if so.
pipe_params = {
    'cvec__max_features': [300],
    'cvec__min_df': [2],
    'cvec__max_df': [.3],
    'cvec__ngram_range': [(1,2)],
    'svc__degree' : [0],
    'svc__C' : [1, 3 ],
    'svc__gamma' : [.001, .01],
    'svc__kernel' : ['rbf']
}
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)


# Compare accuracy on the training rows with accuracy on the held-out rows.
print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

gs.best_params_

In [None]:
# Test-set predictions of the SVC pipeline.
preds['svc'] = gs.predict(X_test)

In [None]:
# Creating a KNN model which pulls 50 most predictive words based on best Naive Bayes Parameters

In [None]:
# Rebuilding model that was most accurate on training data - 
# MultinomialNB on count features of the stemmed text.
X = lem_stem_df['stem_text']
y = lem_stem_df['class']

vect = CountVectorizer(max_df=0.23, max_features=4100, min_df=2, ngram_range=(1,2), stop_words='english')
# NOTE(review): the vectorizer is fit on every row before the split, so test-set documents
# contribute to the vocabulary. X_transform (all rows) is reused by the cells below.
vect.fit(X)

X_transform = vect.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transform, y, stratify=y, test_size=0.1, random_state=30)

nb = MultinomialNB()
nb.fit(X_train, y_train)

In [None]:
# Rank every vocabulary term by its log-probability under the second class (nb.classes_[1])
# and keep the 25 highest and 25 lowest terms.
#
# feature_log_prob_[1] is what MultinomialNB.coef_ returned for a two-class model; coef_ has
# been deprecated and later removed from MultinomialNB, so read the supported attribute.
# NOTE(review): this ranks terms by how common they are in one class, not by how strongly
# they separate the classes; a log-ratio between the two rows of feature_log_prob_ may be
# closer to "most predictive" — confirm the intent.
X_df = pd.DataFrame({'coef': nb.feature_log_prob_[1]}, index=vect.get_feature_names())

# Sort once and slice both ends.
ranked_terms = X_df.sort_values('coef', ascending=False)
ww_words = list(ranked_terms.head(25).index)
nr_words = list(ranked_terms.tail(25).index)



In [None]:
# Dense term-count frame for all rows, one column per vocabulary term.
X_transform_cv = pd.DataFrame(X_transform.toarray(),
                                 columns = vect.get_feature_names())
# Keep only the 50 columns selected from the Naive Bayes ranking.
X = X_transform_cv[nr_words + ww_words]

In [None]:
# KNN restricted to the 50 words selected from the Naive Bayes ranking.
#
# Split X (the 50-column frame built in the previous cell). The earlier code split
# X_transform instead, so the KNN was trained on the whole vocabulary and the word selection
# was never used. The subset split gets its own names: X_train / X_test stay bound to the
# full-vocabulary split because the logistic-regression cell below fits on them.
X_sel_train, X_sel_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

knn = KNeighborsClassifier()
# The parameter grid has its own name instead of sharing `gs` with the search object.
knn_params = {
    'n_neighbors': [10],
    'metric': ['euclidean', 'minkowski'],
    'leaf_size' : [4, 5, 6],
    'weights' : ['uniform']
}
gs = GridSearchCV(knn, param_grid=knn_params, cv=3)
gs.fit(X_sel_train, y_train)
print(gs.best_score_)

print(f'Train score: {gs.score(X_sel_train, y_train)}.')
print(f'Test score: {gs.score(X_sel_test, y_test)}.')

gs.best_params_

In [None]:
# Creating KNN with best performing logistic regression words. 
# Step 1: fit a default logistic regression on the current X_train; its coefficients are used
# in the next cell to pick the words.
# NOTE(review): X_train needs to be the full-vocabulary matrix here so that lr.coef_ lines
# up with the rows of X_df in the next cell — confirm.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Attach the logistic-regression coefficient of every term, then keep the 25 largest and
# the 25 smallest coefficients as the new 50-column feature set.
X_df['lr_coef'] = lr.coef_.T
by_lr_coef = X_df.sort_values('lr_coef', ascending=False)
ww_words = by_lr_coef.head(25).index.tolist()
nr_words = by_lr_coef.tail(25).index.tolist()
X = X_transform_cv[nr_words + ww_words]

In [None]:
# KNN restricted to the 50 words selected from the logistic-regression coefficients.
#
# Split X (the 50-column frame built in the previous cell). The earlier code split
# X_transform instead, so the KNN was trained on the whole vocabulary and the word selection
# was never used.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=30)

knn = KNeighborsClassifier()
# The parameter grid has its own name instead of sharing `gs` with the search object.
# NOTE(review): 'p' is presumably only used by the minkowski metric, so with
# metric='euclidean' the four p values would repeat the same fit — confirm and trim.
knn_params = {
    'n_neighbors': [9, 10, 11],
    'metric': ['euclidean'],
    'leaf_size' : [1, 2, 3],
    'weights' : ['uniform'],
    'p' : [1, 2, 3, 4]
}
gs = GridSearchCV(knn, param_grid=knn_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)

print(f'Train score: {gs.score(X_train, y_train)}.')
print(f'Test score: {gs.score(X_test, y_test)}.')

gs.best_params_

In [None]:
# Test-set predictions of the second KNN search.
preds['knn2'] = gs.predict(X_test)

In [None]:
# Row-wise mean of every model's prediction column (the true 'class' column is excluded).
probs = preds.drop('class', axis=1).mean(axis=1)

In [None]:
# Majority vote: True when more than half of the model columns predict 1.
# NOTE(review): assumes every prediction column holds 0/1 labels. Six model columns are
# added above (lr, nb, knn, rf, svc, knn2), so a 3-3 tie gives exactly 0.5 and falls to
# False — confirm that tie-break is intended.
preds['avg'] = probs>0.5

In [None]:
# Convert the boolean vote back to 0/1 labels so it can be compared with 'class'.
preds['avg'] = preds['avg'].map({False:0, True:1})

In [None]:
# Accuracy of the majority vote against the true held-out labels.
metrics.accuracy_score(preds['class'], preds['avg'])