In [1]:
import numpy as np
import pandas as pd


In [2]:
import re

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

In [4]:
canada_df=pd.read_csv('..\data\canada_subreddit_comments.csv')

# Data Processing and Modeling
## Methodology

Most of the processing is going to fall within the modeling itself, as we're going to define a custom stopword set and build a custom tokenizer that stems words to use. However, before we do that, I'd like to yank URL's out of the dataset, since we saw that they occur quite frequently.

In [5]:
def strip_url(text):
    text = re.sub(r'https?:.*?\s|www.*?\s', ' ', text)
    return text

In [6]:
canada_df['body_processed']= canada_df['body'].map(strip_url)

Recall in our exploration that we identified a few top words shared between both subreddits that are outside the default stopword set. "people", "like", and "canada". Let's add this to our stopword set.

In [7]:
custom_stopwords= stopwords.words('english')

In [8]:
custom_stopwords.extend(['people', 'like', 'canada'])

I guess we probably should make the subreddit category binary. Let's do 1 for r/OnGuardForThee, and 0 for r/Canada

In [9]:
canada_df['subreddit_bin']=canada_df['subreddit'].map(lambda x: 1 if x=="onguardforthee" else 0 )

Next, let's build our tokenizer. It's going to do pretty much the same thing as the default tokenizer (remove punctuation, separate words), but it's also going to stem each word for consistency. Also we're going to keep question marks and exclamations marks as tokens, because I have a hunch that the number of occurances of them is reflective of argument style and could be useful information. We're going to use the Snowball stemmer because it anecdotally performs better than the Porter stemmer.

Source acknowledgement: This is loosely modeled off of some of the stuff this guy did: http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/

In [10]:
snow=SnowballStemmer("english")

In [11]:
def snowball_tokens(text):
    text_processed = re.sub(r'[^A-Za-z0-9!*?*]', ' ', text).split()
    tokens = [snow.stem(word) for word in text_processed]
    return tokens

In [12]:
def snowball_tokens2(text):
    text_processed = re.sub(r'[^A-Za-z]', ' ', text).split()
    tokens = [snow.stem(word) for word in text_processed]
    return tokens

Also, we should tokenize our stopwords.

In [13]:
custom_stopwords = [snow.stem(word) for word in custom_stopwords]

### Train-Test-Split

I'm keeping the default settings and am going to train on 75% and test on 25%. We've got 20,000 comments. I think that if we don't get results training on 15,000 and testing on 5000 I doubt we'll get them on any other ratio.

In [14]:
X=canada_df['body_processed']
y=canada_df['subreddit_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1920)

# Modeling: Initial Tests 

## Logistic Regression Tests

My hunch here is that CountVectorizer will give us more information that tf-idf, but might as well try both.

In [15]:
pipe_cvec= Pipeline([('cvec', CountVectorizer(tokenizer=snowball_tokens, stop_words=custom_stopwords)),
                     ('lr', LogisticRegression(max_iter=1000))])

In [16]:
pipe_cvec_params = {'cvec__max_features': [3000, 5000, 7000],
                   'cvec__ngram_range': [(1,1)],
                   'lr__C':[0.1, 0.04, 0.01, 0.004,0.001]}
gs_cvec = GridSearchCV(pipe_cvec, pipe_cvec_params, cv=5, n_jobs=6, verbose=10)

In [17]:
gs_cvec.fit(X_train, y_train);

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   17.1s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   18.2s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   45.7s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done  72 out of  75 | elapsed:  3.1min remaining:    7.7s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  3.3min finished


In [18]:
gs_cvec.best_params_

{'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'lr__C': 0.01}

In [19]:
gs_cvec.best_score_

0.6055981806264874

In [20]:
gs_cvec.score(X_train, y_train)

0.6780976386309366

In [21]:
gs_cvec.score(X_test, y_test)

0.5998806207719857

In [46]:
preds=gs_cvec.predict(X_test)
f1_score(y_test, preds)

0.5816517578531308

Out of curiousity, let's try this on comments that are greater than 50 words, with the idea that more words gives us more accuracy.

In [22]:
X_test_long = X_test[X_test.str.split().str.len() > 50]
y_test_long = y_test[X_test.str.split().str.len() > 50]
X_train_long = X_train[X_train.str.split().str.len() > 50]
y_train_long = y_train[X_train.str.split().str.len() > 50]

In [23]:
gs_cvec.best_params_

{'cvec__max_features': 7000, 'cvec__ngram_range': (1, 1), 'lr__C': 0.01}

In [24]:
gs_cvec.score(X_train_long, y_train_long)

0.7525377229080933

In [25]:
gs_cvec.score(X_test_long, y_test_long)

0.6345486111111112

So we're slightly more accurate at classifying long comments than short ones.

Let's test a tfidf model for comparison.

In [26]:
pipe_tfidf= Pipeline([('tfidf', TfidfVectorizer(tokenizer=snowball_tokens, stop_words=custom_stopwords)),
                      ('lr', LogisticRegression(max_iter=1000))])

pipe_tfidf_params = {'tfidf__max_features': [7000,10_000, 15_000],
                   'tfidf__ngram_range': [(1,1)],
                   'lr__C':[0.1, 0.04, 0.01, 0.004,0.001]}
gs_tfidf = GridSearchCV(pipe_tfidf, pipe_tfidf_params, cv=5, n_jobs=6, verbose=2)

In [27]:
gs_tfidf.fit(X_train, y_train);

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  3.5min finished


In [28]:
gs_tfidf.best_params_

{'lr__C': 0.1, 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 1)}

In [29]:
gs_tfidf.best_score_

0.621053287936973

In [30]:
gs_tfidf.score(X_train, y_train)

0.7025736269567524

In [31]:
gs_tfidf.score(X_test, y_test)

0.610624751293275

In [47]:
preds=gs_tfidf.predict(X_test)
f1_score(y_test, preds)

0.6074222668004012

In [32]:
gs_tfidf.score(X_train_long, y_train_long)

0.7089163237311386

In [33]:
gs_tfidf.score(X_test_long, y_test_long)

0.6380208333333334

**Logistic Regression Conclusions:**

The TFIDF vectorizer seems to be slightly outperforming the count vectorizer. But not by much.

## K Nearest Neighbors Modeling

In [34]:
pipe_knn= Pipeline([('cvec', TfidfVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords)),
                    ('knn', KNeighborsClassifier())])

pipe_knn_params = {'cvec__max_features': [5000],
                   'cvec__ngram_range': [(1,1)],
                   'knn__n_neighbors':[400,500,600]}
gs_knn = GridSearchCV(pipe_knn, pipe_knn_params, cv=5, n_jobs=6, verbose=10)

In [35]:
gs_knn.fit(X_train, y_train);

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   17.1s
[Parallel(n_jobs=6)]: Done   6 out of  15 | elapsed:   17.6s remaining:   26.5s
[Parallel(n_jobs=6)]: Done   8 out of  15 | elapsed:   35.3s remaining:   30.9s
[Parallel(n_jobs=6)]: Done  10 out of  15 | elapsed:   35.7s remaining:   17.8s
[Parallel(n_jobs=6)]: Done  12 out of  15 | elapsed:   35.9s remaining:    8.9s
[Parallel(n_jobs=6)]: Done  15 out of  15 | elapsed:   48.8s finished


In [36]:
gs_knn.best_params_

{'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 1),
 'knn__n_neighbors': 500}

In [37]:
gs_knn.best_score_

0.6113684891193898

In [38]:
gs_knn.score(X_train, y_train)

0.6191297426373044

In [39]:
gs_knn.score(X_test, y_test)

0.5927178670911262

In [48]:
preds=gs_knn.predict(X_test)
f1_score(y_test, preds)

0.5552900282424507

In [40]:
gs_knn.score(X_train_long, y_train_long)

0.6436213991769547

In [41]:
gs_knn.score(X_test_long, y_test_long)

0.6145833333333334

### KNN Conclusions

KNN gives worse results. Much less overfitting, though.

## Naive Bayesian

### Gaussian/TFIDF

Let's try a Gaussian model first.

In [42]:
class DenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.toarray()
        return X

In [43]:
pipe_gnb= Pipeline([('tfidf', TfidfVectorizer(tokenizer=snowball_tokens, stop_words=custom_stopwords)), ('dense', DenseTransformer()), ('gnb', GaussianNB())])
pipe_gnb_params = {'tfidf__max_features': [2000,3000,4000,5000,6000],
                    'tfidf__ngram_range':[(1,1), (2,2), (3,3)],}

gs_gnb = GridSearchCV(pipe_gnb, pipe_gnb_params, cv=5, n_jobs=6, verbose=10)

In [49]:
gs_gnb.fit(X_train, y_train);

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   21.1s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   21.8s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   56.7s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  2.1min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.8min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  3.2min
[Parallel(n_jobs=6)]: Done  72 out of  75 | elapsed:  3.9min remaining:    9.5s
[Parallel(n_jobs=6)]: Done  75 out of  75 | elapsed:  4.0min finished


In [50]:
gs_gnb.best_params_

{'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1)}

In [51]:
gs_gnb.best_score_

0.5799278584970813

In [52]:
gs_gnb.score(X_train, y_train)

0.6715972406473866

In [53]:
gs_gnb.score(X_test, y_test)

0.575407879029049

In [56]:
preds=gs_gnb.predict(X_test)
f1_score(y_test, preds)

0.5765873015873016

In [54]:
gs_gnb.score(X_train_long, y_train_long)

0.658161865569273

In [57]:
gs_gnb.score(X_test_long, y_test_long)

0.5998263888888888



### Multinomial/CountVectorizer

In [58]:
pipe_mnb= Pipeline([('cv', CountVectorizer(stop_words=custom_stopwords)), ('mnb', MultinomialNB())])
pipe_mnb_params = {'cv__max_features': [2000,3000,4000,5000,6000, 7000, 8000, 9000],
                   'cv__tokenizer':[snowball_tokens, snowball_tokens2]
                  }

gs_mnb = GridSearchCV(pipe_mnb, pipe_mnb_params, cv=5, n_jobs=6, verbose=10)

In [59]:
gs_mnb.fit(X_train, y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   18.4s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   18.6s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   47.3s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done  78 out of  80 | elapsed:  3.4min remaining:    5.1s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:  3.6min finished


In [60]:
gs_mnb.best_params_

{'cv__max_features': 8000,
 'cv__tokenizer': <function __main__.snowball_tokens2(text)>}

In [61]:
gs_mnb.best_score_

0.621451275892861

In [62]:
gs_mnb.score(X_train, y_train)

0.7174980100822499

In [63]:
gs_mnb.score(X_test, y_test)

0.624950258654994

In [146]:
preds=gs_mnb.predict(X_test)
f1_score(y_test, preds)

0.6368715083798883

In [64]:
gs_mnb.score(X_train_long, y_train_long)

0.7791495198902606

In [65]:
gs_mnb.score(X_test_long, y_test_long)

0.6423611111111112

### BernoulliNB

In [66]:
pipe_bnb= Pipeline([('cv', CountVectorizer(stop_words=custom_stopwords, binary=True)), 
                    ('bnb', BernoulliNB())])
pipe_bnb_params = {'cv__max_features': [2000,3000,4000,5000,6000, 7000, 8000, 9000],
                  'cv__tokenizer':[snowball_tokens, snowball_tokens2]}

gs_bnb = GridSearchCV(pipe_bnb, pipe_bnb_params, cv=5, n_jobs=6, verbose=10)

In [67]:
gs_bnb.fit(X_train, y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   14.6s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   15.4s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   46.2s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done  78 out of  80 | elapsed:  3.6min remaining:    5.4s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:  3.7min finished


In [68]:
gs_bnb.best_params_

{'cv__max_features': 4000,
 'cv__tokenizer': <function __main__.snowball_tokens2(text)>}

In [69]:
gs_bnb.best_score_

0.6190627763041557

In [70]:
gs_bnb.score(X_train, y_train)

0.6840673918811356

In [71]:
gs_bnb.score(X_test, y_test)

0.6136092319936332

In [147]:
preds=gs_bnb.predict(X_test)
f1_score(y_test, preds)

0.5731868131868132

In [72]:
gs_bnb.score(X_train_long, y_train_long)

0.7377229080932785

In [73]:
gs_bnb.score(X_test_long, y_test_long)

0.6293402777777778

### Naive Bayesian Conclusions:
- MNB seems to slightly outperform BNB in the general case. 
- Both significantly outperforms GNB.
- However, BNB seems a bit less overfit.
- The tokenizer that removes numbers and all punctuation seems to be outperforming the one that leaves the min.
- MNB seems to like higher feature counts than BNB


## Trees, Forests, and Bagging.

I twiddled with bagging and tree models here for a bit, and didn't get any satisfactory results. Models typically were extremely overfit and didn't have great results.

Let's try some random forest techniques.

In [74]:
pipe_rf=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, max_features=5000)), 
                  ('rf', RandomForestClassifier())])
pipe_et=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, max_features=5000)), 
                  ('et', ExtraTreesClassifier())])

In [75]:
cross_val_score(pipe_rf, X_train, y_train, cv=5).mean() 



0.5979701844447083

In [76]:
cross_val_score(pipe_et, X_train, y_train, cv=5).mean() 



0.5984349032907963

In [77]:
pipe_et.fit(X_train, y_train)



Pipeline(steps=[('cv',
                 CountVectorizer(max_features=5000,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'our', 'ourselv', 'you',
                                             "you'r", "you'v", "you'll",
                                             "you'd", 'your', 'your',
                                             'yourself', 'yourselv', 'he',
                                             'him', 'his', 'himself', 'she',
                                             'she', 'her', 'her', 'herself',
                                             'it', 'it', 'it', 'itself', ...],
                                 tokenizer=<function snowball_tokens2 at 0x00000246DE381940>)),
                ('et', ExtraTreesClassifier())])

In [78]:
pipe_et.score(X_train, y_train)

0.9911780313080393

In [79]:
pipe_et.score(X_test, y_test)

0.5955033824114604

In [80]:
pipe_et[1].estimators_[10].get_depth()

1517

Let's try gridsearching over ExtraTrees.

In [81]:
pipe_et2=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords)), 
                  ('et', ExtraTreesClassifier())])

pipe_et2_params = {'cv__max_features': [2500, 5000, 10_000],
                   'et__max_depth':[10,50,100,500],
                   'et__min_samples_split':[2,4,10,20],
                   'et__min_samples_leaf':[1,5,10,20]
                  }

In [82]:
gs_et2 = GridSearchCV(pipe_et2, pipe_et2_params, cv=5, n_jobs=8, verbose=50)

In [83]:
gs_et2.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   24.8s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   25.1s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:   25.5s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:   25.9s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:   26.0s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:   26.1s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:   26.5s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:   26.8s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   45.4s
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:   46.5s
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:   46.7s
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:   47.0s
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:   47.1s
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:   47.3s
[Paralle

[Parallel(n_jobs=8)]: Done 134 tasks      | elapsed:  6.8min
[Parallel(n_jobs=8)]: Done 135 tasks      | elapsed:  6.8min
[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed:  6.8min
[Parallel(n_jobs=8)]: Done 137 tasks      | elapsed:  7.0min
[Parallel(n_jobs=8)]: Done 138 tasks      | elapsed:  7.1min
[Parallel(n_jobs=8)]: Done 139 tasks      | elapsed:  7.1min
[Parallel(n_jobs=8)]: Done 140 tasks      | elapsed:  7.1min
[Parallel(n_jobs=8)]: Done 141 tasks      | elapsed:  7.1min
[Parallel(n_jobs=8)]: Done 142 tasks      | elapsed:  7.1min
[Parallel(n_jobs=8)]: Done 143 tasks      | elapsed:  7.2min
[Parallel(n_jobs=8)]: Done 144 tasks      | elapsed:  7.2min
[Parallel(n_jobs=8)]: Done 145 tasks      | elapsed:  7.4min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  7.5min
[Parallel(n_jobs=8)]: Done 147 tasks      | elapsed:  7.5min
[Parallel(n_jobs=8)]: Done 148 tasks      | elapsed:  7.5min
[Parallel(n_jobs=8)]: Done 149 tasks      | elapsed:  7.5min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 269 tasks      | elapsed: 14.9min
[Parallel(n_jobs=8)]: Done 270 tasks      | elapsed: 14.9min
[Parallel(n_jobs=8)]: Done 271 tasks      | elapsed: 14.9min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed: 14.9min
[Parallel(n_jobs=8)]: Done 273 tasks      | elapsed: 15.2min
[Parallel(n_jobs=8)]: Done 274 tasks      | elapsed: 15.2min
[Parallel(n_jobs=8)]: Done 275 tasks      | elapsed: 15.2min
[Parallel(n_jobs=8)]: Done 276 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 277 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 278 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 279 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 280 tasks      | elapsed: 15.3min
[Parallel(n_jobs=8)]: Done 281 tasks      | elapsed: 15.6min
[Parallel(n_jobs=8)]: Done 282 tasks      | elapsed: 15.7min
[Parallel(n_jobs=8)]: Done 283 tasks      | elapsed: 15.7min
[Parallel(n_jobs=8)]: Done 284 tasks      | elapsed: 15.7min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 404 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 405 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 406 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 407 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 408 tasks      | elapsed: 21.4min
[Parallel(n_jobs=8)]: Done 409 tasks      | elapsed: 21.7min
[Parallel(n_jobs=8)]: Done 410 tasks      | elapsed: 21.7min
[Parallel(n_jobs=8)]: Done 411 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 412 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 413 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 414 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 415 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 416 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 417 tasks      | elapsed: 22.1min
[Parallel(n_jobs=8)]: Done 418 tasks      | elapsed: 22.1min
[Parallel(n_jobs=8)]: Done 419 tasks      | elapsed: 22.2min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 539 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 540 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 541 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 542 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 543 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 544 tasks      | elapsed: 28.3min
[Parallel(n_jobs=8)]: Done 545 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 546 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 547 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 548 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 549 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 550 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 551 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 552 tasks      | elapsed: 28.7min
[Parallel(n_jobs=8)]: Done 553 tasks      | elapsed: 29.1min
[Parallel(n_jobs=8)]: Done 554 tasks      | elapsed: 29.1min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 674 tasks      | elapsed: 35.9min
[Parallel(n_jobs=8)]: Done 675 tasks      | elapsed: 35.9min
[Parallel(n_jobs=8)]: Done 676 tasks      | elapsed: 35.9min
[Parallel(n_jobs=8)]: Done 677 tasks      | elapsed: 35.9min
[Parallel(n_jobs=8)]: Done 678 tasks      | elapsed: 36.0min
[Parallel(n_jobs=8)]: Done 679 tasks      | elapsed: 36.0min
[Parallel(n_jobs=8)]: Done 680 tasks      | elapsed: 36.0min
[Parallel(n_jobs=8)]: Done 681 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 682 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 683 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 684 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 685 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 686 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 687 tasks      | elapsed: 36.3min
[Parallel(n_jobs=8)]: Done 688 tasks      | elapsed: 36.4min
[Parallel(n_jobs=8)]: Done 689 tasks      | elapsed: 36.6min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 809 tasks      | elapsed: 42.6min
[Parallel(n_jobs=8)]: Done 810 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 811 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 812 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 813 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 814 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 815 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 816 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Done 817 tasks      | elapsed: 43.0min
[Parallel(n_jobs=8)]: Done 818 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 819 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 820 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 821 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 822 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 823 tasks      | elapsed: 43.1min
[Parallel(n_jobs=8)]: Done 824 tasks      | elapsed: 43.2min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 944 tasks      | elapsed: 50.3min
[Parallel(n_jobs=8)]: Done 945 tasks      | elapsed: 50.5min
[Parallel(n_jobs=8)]: Done 960 out of 960 | elapsed: 51.0min finished




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
         

In [84]:
gs_et2.best_params_

{'cv__max_features': 10000,
 'et__max_depth': 100,
 'et__min_samples_leaf': 1,
 'et__min_samples_split': 10}

In [85]:
gs_et2.best_score_

0.6147521675442417

In [86]:
gs_et2.score(X_train, y_train)

0.8731759087291059

In [87]:
gs_et2.score(X_test, y_test)

0.6128133704735376

This is still overfit, but seems promising. Let's twiddle some more.

In [88]:
pipe_et3=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, max_features=5000)), 
                  ('et', ExtraTreesClassifier(max_depth=100))])

pipe_et3_params = {
                   'et__min_samples_split':[4,10,20,50],
                   'et__min_samples_leaf':[5,10,20]
                  }
gs_et3 = GridSearchCV(pipe_et3, pipe_et3_params, cv=5, n_jobs=7, verbose=10)

In [89]:
gs_et3.fit(X_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:   19.8s
[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:   41.3s
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  1.4min
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  2.1min
[Parallel(n_jobs=7)]: Done  47 tasks      | elapsed:  2.4min
[Parallel(n_jobs=7)]: Done  54 out of  60 | elapsed:  2.7min remaining:   18.2s
[Parallel(n_jobs=7)]: Done  60 out of  60 | elapsed:  3.0min finished


In [90]:
gs_et3.best_score_

0.6134914727863776

In [91]:
gs_et3.score(X_train, y_train)

0.716370390023879

In [92]:
gs_et3.score(X_test, y_test)

0.6120175089534421

In [148]:
preds=gs_et3.predict(X_test)
f1_score(y_test, preds)

0.5658949243098843

In [93]:
gs_et3.best_params_

{'et__min_samples_leaf': 5, 'et__min_samples_split': 4}

Getting warmer.....

In [94]:
pipe_et4=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, max_features=5000)), 
                  ('et', ExtraTreesClassifier(max_depth=10))])

pipe_et4_params = {
                   'et__min_samples_split':[20],
                   'et__min_samples_leaf':[10],
                    'et__n_estimators':[100,1000]
                  }
gs_et4 = GridSearchCV(pipe_et4, pipe_et4_params, cv=5, n_jobs=7, verbose=10)

In [95]:
gs_et4.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   3 out of  10 | elapsed:   18.5s remaining:   43.4s
[Parallel(n_jobs=7)]: Done   5 out of  10 | elapsed:   18.7s remaining:   18.7s
[Parallel(n_jobs=7)]: Done   7 out of  10 | elapsed:   22.9s remaining:    9.7s
[Parallel(n_jobs=7)]: Done  10 out of  10 | elapsed:   34.9s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(max_features=5000,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
          

In [96]:
gs_et4.score(X_train, y_train)

0.6518307243300611

In [97]:
gs_et4.score(X_test, y_test)

0.6010744130521289

In [149]:
preds=gs_et4.predict(X_test)
f1_score(y_test, preds)

0.6269767441860467

In [98]:
gs_et4.best_params_

{'et__min_samples_leaf': 10,
 'et__min_samples_split': 20,
 'et__n_estimators': 1000}

## Adaboost

In [99]:
from sklearn.ensemble import AdaBoostClassifier

In [100]:
ada=AdaBoostClassifier()

In [101]:
pipe_ada=Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, max_features=5000)), 
                  ('ada', AdaBoostClassifier(n_estimators=1000))])

In [102]:
cross_val_score(pipe_ada,X_train, y_train, cv=5, verbose=1).mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished


0.5860304138019012

In [103]:
pipe_ada.fit(X_train, y_train)



Pipeline(steps=[('cv',
                 CountVectorizer(max_features=5000,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'our', 'ourselv', 'you',
                                             "you'r", "you'v", "you'll",
                                             "you'd", 'your', 'your',
                                             'yourself', 'yourselv', 'he',
                                             'him', 'his', 'himself', 'she',
                                             'she', 'her', 'her', 'herself',
                                             'it', 'it', 'it', 'itself', ...],
                                 tokenizer=<function snowball_tokens2 at 0x00000246DE381940>)),
                ('ada', AdaBoostClassifier(n_estimators=1000))])

In [104]:
pipe_ada.score(X_train, y_train)

0.720151233749005

In [105]:
pipe_ada.score(X_test, y_test)

0.5873458018304815

I don't think this is worth exploring more in this case.

## Model Selection Initial Conclusions

From the above tests, I believe the two models worth exploring more are MultinomialNB and BinomialNB. They offered the best scores (although not significantly). Also, they had notably quicker runtime, which makes further exploration easier.

# Model Refinement

Let's play around with a few more parameters, and do some grid searches that might take a while.

In [106]:
pipe_mnb2= Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2)), ('mnb', MultinomialNB())])
pipe_mnb2_params = {'cv__max_features': [4000, 6000, 8000, 10_000, 12_000],
                    'cv__stop_words': [None, custom_stopwords],
                    'cv__ngram_range':[(1,1),(1,2),(2,2)],
                    'cv__max_df':[1.0, 0.95],
                    'cv__min_df':[1, 0.01]}

gs_mnb2 = GridSearchCV(pipe_mnb2, pipe_mnb2_params, cv=5, n_jobs=6, verbose=10)

In [107]:
gs_mnb2.fit(X_train, y_train);

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   14.3s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   15.7s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   46.8s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  3.6min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  4.2min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  4.8min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  5.6min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:  6.5min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:  7.1min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:  8.2min
[Parallel(

In [108]:
gs_mnb2.best_params_

{'cv__max_df': 1.0,
 'cv__max_features': 12000,
 'cv__min_df': 1,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None}

In [109]:
gs_mnb2.best_score_

0.6237063137011669

In [110]:
gs_mnb2.score(X_train, y_train)

0.7357389227911914

In [111]:
gs_mnb2.score(X_test, y_test)

0.6237564663748508

In [151]:
preds=gs_mnb2.predict(X_test)
f1_score(y_test, preds)

0.6368350297676205

In [112]:
gs_mnb2.score(X_train_long, y_train_long)

0.7969821673525377

In [113]:
gs_mnb2.score(X_test_long, y_test_long)

0.6414930555555556

**Multinomial NB notes before we do an even more in depth search:**
- In all tests I've done, including custom_stopwords has outperformed not including it.
- Multinomial NB seems to have a preference for large feature sets -- in both cases, our search has selected the largest possible feature set.
- min_df and max_df don't seem to be a factor, possibly because we've already yanked stop words.
- I have a hunch that n_gram range goes up as features go up, so doing a final search with (1,3) included will be interesting.

In [114]:
pipe_bnb2= Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, binary=True, stop_words=custom_stopwords)),
                     ('bnb', BernoulliNB())])
pipe_bnb2_params = {'cv__max_features': [2000, 4000, 6000, 8000, 10_000, 15_000],
                    'cv__ngram_range':[(1,1),(1,2),(2,2)]}

gs_bnb2 = GridSearchCV(pipe_bnb2, pipe_bnb2_params, cv=5, n_jobs=6, verbose=10)

In [115]:
gs_bnb2.fit(X_train, y_train);

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   15.3s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   46.7s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:  4.2min finished


In [116]:
gs_bnb2.best_score_

0.6190627763041557

In [117]:
gs_bnb2.best_params_

{'cv__max_features': 4000, 'cv__ngram_range': (1, 1)}

In [118]:
gs_bnb2.score(X_train, y_train)

0.6840673918811356

In [119]:
gs_bnb2.score(X_test, y_test)

0.6136092319936332

In [152]:
preds=gs_bnb2.predict(X_test)
f1_score(y_test, preds)

0.5731868131868132

In [120]:
gs_bnb2.score(X_train_long, y_train_long)

0.7377229080932785

In [121]:
gs_bnb2.score(X_test_long, y_test_long)

0.6293402777777778

Let's do a search on a multinomial NB over a wide max feature range.

In [122]:
pipe_mnb3= Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords)),
                     ('mnb', MultinomialNB())])
pipe_mnb3_params = {'cv__max_features': range(10000,26000,1000),
                    'cv__ngram_range':[(1,1),(1,2),(1,3)],}

gs_mnb3 = GridSearchCV(pipe_mnb3, pipe_mnb3_params, cv=5, n_jobs=-1, verbose=10)

In [123]:
gs_mnb3.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
         

In [124]:
gs_mnb3.best_params_

{'cv__max_features': 15000, 'cv__ngram_range': (1, 1)}

In [125]:
gs_mnb3.best_score_

0.6217161979668413

In [126]:
gs_mnb3.score(X_train, y_train)

0.7525205624834174

In [127]:
gs_mnb3.score(X_test, y_test)

0.6235575009948269

In [153]:
preds=gs_mnb3.predict(X_test)
f1_score(y_test, preds)

0.6361538461538461

In [128]:
mnb3_preds=gs_mnb3.predict(X_test)

In [129]:
gs_mnb3.score(X_train_long, y_train_long)

0.8128943758573388

In [130]:
gs_mnb3.score(X_test_long, y_test_long)

0.6423611111111112

This increased accuracy by 0.005%, but made the overfitting gap even worse than it already was. Again, it picked the largest n_gram range and largest feature_set. I don't think increasing our feature size is the way to go.

In [131]:
pipe_bnb3= Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords, binary=True)),
                     ('bnb', BernoulliNB())])
pipe_bnb3_params = {'cv__max_features': [2000,4000,6000,8000,10_000,15_000],
                    'cv__ngram_range':[(1,1), (1,2)]}

gs_bnb3 = GridSearchCV(pipe_bnb3, pipe_bnb3_params, cv=10, n_jobs=6, verbose=10);

In [132]:
gs_bnb3.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   15.5s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   16.0s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   49.6s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.9min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  3.7min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  4.3min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  4.9min
[Parallel(n_jobs=6)]: Done 120 out of 120 | elapsed:  5.8min finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(binary=True,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
               

In [133]:
gs_bnb3.best_params_

{'cv__max_features': 15000, 'cv__ngram_range': (1, 1)}

In [134]:
gs_bnb3.best_score_

0.6246346844698216

In [135]:
gs_bnb3.score(X_train, y_train)

0.7485407269832847

In [136]:
gs_bnb3.score(X_test, y_test)

0.6195781933943494

In [155]:
preds=gs_bnb3.predict(X_test)
f1_score(y_test, preds)

0.5747330960854093

In [137]:
gs_bnb3.score(X_train_long, y_train_long)

0.8235939643347051

In [138]:
gs_bnb3.score(X_test_long, y_test_long)

0.6319444444444444

It seems like all our high feature models are overfitting for marginal increases in accuracy. Let's try to limit these.

In [156]:
pipe_mnb4= Pipeline([('cv', CountVectorizer(tokenizer=snowball_tokens2, stop_words=custom_stopwords)),
                     ('mnb', MultinomialNB())])
pipe_mnb4_params = {'cv__max_features': [2000,300,400,5000,6000,7000]}

gs_mnb4 = GridSearchCV(pipe_mnb4, pipe_mnb4_params, cv=5, n_jobs=-1, verbose=10)

In [157]:
gs_mnb4.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:   59.2s remaining:   34.2s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  1.0min remaining:   18.2s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  1.2min remaining:    8.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.3min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
         

In [158]:
gs_mnb4.best_params_

{'cv__max_features': 7000}

In [159]:
gs_mnb4.best_score_

0.6203236910056262

In [160]:
gs_mnb4.score(X_train, y_train)

0.7115282568320509

In [161]:
gs_mnb4.score(X_test, y_test)

0.6219657779546359

In [139]:
pipe_bnb4= Pipeline([('cv', CountVectorizer(stop_words=custom_stopwords, binary=True)),
                     ('bnb', BernoulliNB())])
pipe_bnb4_params = {'cv__max_features': [2000,3000,4000,5000, 6000, 7000],
                   'cv__tokenizer': [snowball_tokens, snowball_tokens2]}

gs_bnb4 = GridSearchCV(pipe_bnb4, pipe_bnb4_params, cv=5, n_jobs=6, verbose=10);

In [140]:
gs_bnb4.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   15.5s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   16.1s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   48.0s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.4min
[Parallel(n_jobs=6)]: Done  56 out of  60 | elapsed:  2.7min remaining:   11.5s
[Parallel(n_jobs=6)]: Done  60 out of  60 | elapsed:  2.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(binary=True,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'our',
                                                                    'ourselv',
                                                                    'you',
                                                                    "you'r",
                                                                    "you'v",
                                                                    "you'll",
                                                                    "you'd",
                

In [141]:
gs_bnb4.best_params_

{'cv__max_features': 4000,
 'cv__tokenizer': <function __main__.snowball_tokens2(text)>}

In [142]:
gs_bnb4.best_score_

0.6190627763041557

In [143]:
gs_bnb4.score(X_train, y_train)

0.6840673918811356

In [144]:
gs_bnb4.score(X_test, y_test)

0.6136092319936332

# Notes from running a bunch of different gridsearches:

Increasing features and n-grams slightly increases accuracy, but at the expense of serious overfitting. Most of our "good" models are in the range of 61 +-1% accuracy for the test set.

Using difference from training set accuracy as an overfitting metric, we have gaps of 7% in our best Bernoulli models. We can get an extra 0.5% in accuracy in a Multinomial NB model, but at the cost of increasing the training/test gap to 16%.

Increasing cross validation from 5 to 10 fold slightly increased model performance, but changed parameters towards overfitting.

I think the Bernoulli Naive Bayesian model with 4000 max features and with snowball_tokens2 is the winner.

After this mess of trial and error, let's fully analyze our model in a new workbook.

In [145]:
canada_df.to_csv('..\data\canada_subreddit_comments.csv', index=False)