### First bring in the necessary import statements to EDA and initial modeling to examine the data.

In [62]:
import re
import nltk
import spacy
import string
import imblearn
import numpy as np
import pandas as pd
import seaborn as sns
import texthero as hr
from PIL import Image 
from wordcloud import WordCloud
from nltk import pos_tag, FreqDist
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.manifold import TSNE
from collections import defaultdict
from mpl_toolkits.mplot3d import Axes3D
from nltk.tokenize import TweetTokenizer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from matplotlib.ticker import MaxNLocator
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction import text
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from nltk.stem import WordNetLemmatizer, PorterStemmer
from imblearn.metrics import classification_report_imbalanced
from nltk.tokenize import word_tokenize, regexp_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, confusion_matrix, classification_report, accuracy_score, precision_score

import os
import sys
import warnings
warnings.filterwarnings('ignore')
sys.setrecursionlimit(100000)
module_path = os.path.abspath(os.pardir)
print(module_path)
if module_path not in sys.path:
    sys.path.append(module_path)
%matplotlib inline
pd.set_option('display.max_colwidth', None)

/Users/jax/Documents/Flatiron


### First model I'd like to use a count vectorizer and good ole logistic regression. Then I'll use naive bayes which is particularly successful with nlp because it assumes *naively* there is no interdependence amongst the variables. Also Multinomial Bayes allows me to train my model with less data and potentionally mislabeled data. So first let's split our data.

In [24]:
unique_df = pd.read_csv('data/modeling_tweets.csv')

In [25]:
y = unique_df['target'].copy()
X = unique_df.drop(columns=['target','existence_confidence'], axis=1).copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=1, stratify=y)
X_t, X_val, y_t, y_val = train_test_split(X, y, test_size=.25, random_state=2, stratify=y)

In [26]:
y_train

3150    1
324     2
5406    0
914     1
468     1
       ..
3120    2
2163    1
3099    1
955     1
3042    0
Name: target, Length: 4110, dtype: int64

### Let's get to modeling. First let's test out using SMOTE to eliminate our class imbalance.

In [27]:
def max_features_check(n):

#     train_docs = [doc_preparer(doc) for doc in X_train['processedtwitter']]
#     test_docs = [doc_preparer(doc) for doc in X_test['processedtwitter']]

    # Secondary train-test split to build our best model
    X_t, X_val, y_t, y_val = train_test_split(train_docs, y_train,
                                              test_size=0.25, random_state=42)
    # here's where our 'n' comes in
    cv = CountVectorizer(max_features=n)

    X_t_vec = cv.fit_transform(X_t)
    X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
    X_t_vec.columns = sorted(cv.vocabulary_)
    X_t_vec.set_index(y_t.index, inplace=True)

    # We then transform the validation set. (Do not refit the vectorizer!)

    X_val_vec = cv.transform(X_val)
    X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
    X_val_vec.columns = sorted(cv.vocabulary_)
    X_val_vec.set_index(y_val.index, inplace=True)

    mnb = MultinomialNB()

    mnb.fit(X_t_vec, y_t)
    y_hat = mnb.predict(X_val_vec)
    
    
    return accuracy_score(y_val, y_hat)

In [41]:
imb_params= {
'count__decode_error':['ignore', 'replace'],
'count__analyzer' : ['word', 'char', 'char_wb'],
'model__solver' : ['sag', 'saga'],
'model__max_iter' :[4000]}

In [42]:
imb_pipe = imbPipeline([('count', CountVectorizer()),('sm', SMOTE()),('model', LogisticRegression())])

In [43]:
imb_gs = GridSearchCV(imb_pipe, param_grid=imb_params, cv = 5, verbose = 5, n_jobs = -1)

In [45]:
imbmodel = imb_gs.fit(X_train['lemmed_tweets'].values.astype('U'), y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [47]:
imb_best = imbmodel.best_estimator_

imby_trn_pred = imb_best.predict(X_train['lemmed_tweets'].values.astype('U'))
imby_tst_pred = imb_best.predict(X_test['lemmed_tweets'].values.astype('U'))

    
print('\t\tThe Train Results')
print(classification_report(y_train, imby_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, imby_tst_pred))

		The Train Results
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       766
           1       0.94      0.93      0.94      2100
           2       0.90      0.90      0.90      1244

    accuracy                           0.92      4110
   macro avg       0.91      0.92      0.92      4110
weighted avg       0.92      0.92      0.92      4110


		The Test Results
              precision    recall  f1-score   support

           0       0.49      0.59      0.54       256
           1       0.73      0.63      0.68       700
           2       0.52      0.56      0.54       415

    accuracy                           0.60      1371
   macro avg       0.58      0.60      0.58      1371
weighted avg       0.62      0.60      0.61      1371



### Ok, we have not bad results but definitely overfit and not our best. Since we used SMOTE to address the class imbalnce, ideally now we would focus on accuracy since false negatives and positives bear the same weight. Now let's try it without SMOTE.

In [48]:
cv_params= {
'count__decode_error':['strict', 'ignore', 'replace'],
'count__analyzer' : ['word', 'char', 'char_wb'],
'count__max_df' : [.95],
'count__min_df' : [.05],
'model__solver' : ['lbfgs','sag', 'saga'],
'model__max_iter' :[4000]}
    

first_pipe = Pipeline([('count', CountVectorizer()),
                    ('model', LogisticRegression())])

In [49]:
cv_gs = GridSearchCV(first_pipe, param_grid=cv_params, cv = 5, verbose = 5, n_jobs = -1)

In [51]:
lrmodel = cv_gs.fit(X_train['lemmed_tweets'].values.astype('U'), y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [52]:
lr_best = lrmodel.best_estimator_

fy_trn_pred = lr_best.predict(X_train['lemmed_tweets'].values.astype('U'))
fy_tst_pred = lr_best.predict(X_test['lemmed_tweets'].values.astype('U'))

    
print('\t\tThe Train Results')
print(classification_report(y_train, fy_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, fy_tst_pred))

		The Train Results
              precision    recall  f1-score   support

           0       0.44      0.06      0.10       766
           1       0.53      0.95      0.68      2100
           2       0.50      0.09      0.15      1244

    accuracy                           0.52      4110
   macro avg       0.49      0.37      0.31      4110
weighted avg       0.50      0.52      0.41      4110


		The Test Results
              precision    recall  f1-score   support

           0       0.40      0.06      0.11       256
           1       0.53      0.94      0.67       700
           2       0.43      0.08      0.13       415

    accuracy                           0.52      1371
   macro avg       0.45      0.36      0.31      1371
weighted avg       0.47      0.52      0.41      1371



### Holy cajolly wildly overfit to training buuuut a start. Now for my random forest classifier search it took a bit of time as well so be fair warned.

In [54]:
#Param setup for Gridsearch
tf_params = {
 'tf__max_features':[100, 500, 2000, None],
 'tf__ngram_range': [(1, 1), (1, 2), (2, 2), None],
 'mnb__alpha': [.1, .5, .8, 1],
 'mnb__fit_prior': [True, False]}

In [55]:
mnb_pipe = Pipeline([('tf', TfidfVectorizer()),
                    ('mnb', MultinomialNB())])
mnb_gs = GridSearchCV(mnb_pipe, param_grid=tf_params, cv = 5, verbose = 1, n_jobs = -1)

In [56]:
mnb_model = mnb_gs.fit(X_train['lemmed_tweets'].values.astype('U'), y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


 0.56618005        nan 0.62919708 0.63163017 0.58199513        nan
 0.63211679 0.63406326 0.61216545        nan 0.46593674 0.46739659
 0.27664234        nan 0.57664234 0.57493917 0.35523114        nan
 0.60389294 0.60389294 0.43260341        nan 0.63284672 0.63576642
 0.54476886        nan 0.58759124 0.58759124 0.55620438        nan
 0.62214112 0.62189781 0.56836983        nan 0.63527981 0.63625304
 0.58783455        nan 0.62773723 0.62773723 0.61119221        nan
 0.46545012 0.46739659 0.27639903        nan 0.57493917 0.57712895
 0.35717762        nan 0.61435523 0.60632603 0.43698297        nan
 0.63187348 0.64817518 0.54841849        nan 0.58807786 0.58856448
 0.55717762        nan 0.62214112 0.61849148 0.56788321        nan
 0.63284672 0.63041363 0.5892944         nan 0.61922141 0.61873479
 0.61046229        nan 0.46618005 0.46569343 0.27615572        nan
 0.57737226 0.57688564 0.35766423        nan 0.61459854 0.60900243
 0.44014599        nan 0.64038929 0.64963504 0.54939173       

In [57]:
mnb_best = mnb_model.best_estimator_

mnby_trn_pred = mnb_best.predict(X_train['lemmed_tweets'].values.astype('U'))
mnby_tst_pred = mnb_best.predict(X_test['lemmed_tweets'].values.astype('U'))


    
print('\t\tThe Train Results')
print(classification_report(y_train, mnby_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, mnby_tst_pred))

		The Train Results
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       766
           1       0.94      0.93      0.94      2100
           2       0.89      0.89      0.89      1244

    accuracy                           0.92      4110
   macro avg       0.91      0.92      0.91      4110
weighted avg       0.92      0.92      0.92      4110


		The Test Results
              precision    recall  f1-score   support

           0       0.60      0.51      0.55       256
           1       0.72      0.78      0.75       700
           2       0.61      0.58      0.59       415

    accuracy                           0.67      1371
   macro avg       0.64      0.62      0.63      1371
weighted avg       0.66      0.67      0.66      1371



In [58]:
rf_params = {
 'tf__ngram_range': [(1, 2)],
 'tf__max_features':[100, 500, 2000, None],
 'tf__ngram_range': [(1, 1), (1, 2), (2, 2), None],
 'tf__min_df':[.05, None],
 'tf__max_df':[.95, None],
 'tf__stop_words': [None, 'english'],
 'rf__max_depth': [100, 500, 1000],
 'rf__min_samples_split': [100],
 'rf__max_leaf_nodes': [None]}

rf_pipe = Pipeline([('tf',  TfidfVectorizer()),
                     ('rf', RandomForestClassifier())])

In [59]:
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)
rf_model = rf_gs.fit(X_train['lemmed_tweets'].values.astype('U'), y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.51094891 0.51094891
 0.51094891 0.51094891        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan 0.51094891 0.51094891 0.51094891 0.51094891
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.51094891 0.51094891 0.51094891 0.51094891        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan       

In [61]:
rfy_trn_pred = rf_model.predict(X_train['lemmed_tweets'].values.astype('U'))
rfy_tst_pred = rf_model.predict(X_test['lemmed_tweets'].values.astype('U'))
    
print('\t\tThe Train Results')
print(classification_report(y_train, rfy_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, rfy_tst_pred))

		The Train Results
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       766
           1       0.51      1.00      0.68      2100
           2       0.00      0.00      0.00      1244

    accuracy                           0.51      4110
   macro avg       0.17      0.33      0.23      4110
weighted avg       0.26      0.51      0.35      4110


		The Test Results
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       256
           1       0.51      1.00      0.68       700
           2       0.00      0.00      0.00       415

    accuracy                           0.51      1371
   macro avg       0.17      0.33      0.23      1371
weighted avg       0.26      0.51      0.35      1371



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
confusion_matrix(y_test, rfy_tst_pred)

array([[  0, 256,   0],
       [  0, 700,   0],
       [  0, 415,   0]])

## Now that we have picked the best model lets get to vectorizing and our predictions.

In [65]:
# Fit the vectorizer on X_train["text"] and transform it
tfidf = TfidfVectorizer(max_features=100)
X_train_vectorized = tfidf.fit_transform(X_train['lemmed_tweets'].values.astype('U'))
Xtv_df = pd.DataFrame(X_train_vectorized.toarray(), columns=tfidf.get_feature_names())


In [66]:
idf_values = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [68]:
X_t_vec = tfidf.fit_transform(X_train['lemmed_tweets'].values.astype('U'))
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)


In [69]:
X_val_vec = tfidf.transform(X_val)
X_val_vec = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
#X_val_vec.set_index(y_val.index, inplace=True)


In [78]:
import time
start = time.time()


tf_nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, 
                              tokenizer=tokenize, 
                              max_df=0.25, max_features=85000,
                             ngram_range=(1,3))),
    ('smt', SMOTE(random_state=42)),
    ('mnb', MultinomialNB(alpha=0.005)),
])

tf_nb_pipe.fit(train_docs, y_train)


y_trn_pred = tf_nb_pipe.predict(train_docs)
y_tst_pred = tf_nb_pipe.predict(test_docs)
    
print('\t\tThe Train Results')
print(classification_report(y_train, y_trn_pred))
print('\n\t\tThe Test Results')
print(classification_report(y_test, y_tst_pred))

end = time.time()
print(end - start, "seconds")

NameError: name 'tokenize' is not defined

In [70]:
y_pred=mnb.predict(X_val_vec)
print(accuracy_score(y_test,y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [1371, 2]

In [None]:
#y_hat = mnb.predict(X_val_vec)
#print(f1_score(y_val, y_hat))

In [None]:
#model.fit(X_train,y_train)
#y_pred=model.predict(X_test)
#print(accuracy_score(y_test,y_pred))
#print(X_train.toarray()[:2])