## CountVector, SIA Vaderization, and PolyFeature Engineering with Logistic Regression and then Regularize with Lasso, Ridge, and Elasticnet


In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import ExtraTreesClassifier


import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

##### Vaderized Dataframe has the original 10,000 posts with the SIA info bolted on

In [2]:
df_vaderized = pd.read_csv('../../datasets/df_vaderized.csv')

df_vaderized = df_vaderized.drop(['Unnamed: 0'], axis= 1)
df_vaderized.head(1)

Unnamed: 0,neg,neu,pos,compound,title,is_onion
0,0.201,0.799,0.0,-0.743,'Buddhist aliens' have visited 'Thailand’s Are...,1


##### Word vectorize with CountVectorize and bolt onto split dataframes with an index merge because of train test split stratify

In [3]:
X = df_vaderized['title']
y = df_vaderized['is_onion']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42, stratify= y)


cvec = CountVectorizer(max_features= 60_000,
                       ngram_range= (1, 4),
                       stop_words= None)

In [4]:
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [5]:
df_X_train_cvec = pd.DataFrame(X_train_cvec.todense(), columns= cvec.get_feature_names())
df_X_test_cvec = pd.DataFrame(X_test_cvec.todense(), columns= cvec.get_feature_names())

In [6]:
df_X_train = X_train.to_frame().reset_index()

df_X_train_combo = pd.concat([df_X_train, df_X_train_cvec], axis= 1)

In [7]:
df_vaderized = df_vaderized.reset_index()

In [8]:
df_X_train_combo_with_vader = pd.merge(left= df_X_train_combo,
                                 right= df_vaderized,
                                 how= 'left',
                                 left_on= 'index',
                                 right_on= 'index')

In [9]:
df_X_train_combo_with_vader = df_X_train_combo_with_vader.drop(['index', 'title_x', 'title_y', 'is_onion'],
                                                              axis= 1)

In [10]:
df_X_test = X_test.to_frame().reset_index()

df_X_test_combo = pd.concat([df_X_test, df_X_test_cvec], axis= 1)

In [11]:
df_X_test_combo_with_vader = pd.merge(left= df_X_test_combo,
                                      right= df_vaderized,
                                      how= 'left',
                                      left_on= 'index',
                                      right_on= 'index')

In [12]:
df_X_test_combo_with_vader = df_X_test_combo_with_vader.drop(['index', 'title_x', 'title_y', 'is_onion'],
                                                            axis= 1)

### Feature Eng'g time,  pick the top 25 words from CountVectorizor and combine with Vader,  then PolyFeature all

In [15]:
# ranking by correlation CORR bombs on 10s of thousands of features;  found this Extra Trees picker gem, though it was 
#     kinda eating a handful of broken glass to make it work (love those index and column params datatypes !!!   ha!)
model_cvec_feature_picker = ExtraTreesClassifier()
model_cvec_feature_picker.fit(df_X_train_cvec, y_train)
model_cvec_feature_picker.feature_importances_
feat_importances = pd.DataFrame(model_cvec_feature_picker.feature_importances_, index= df_X_train_cvec.columns)
feat_importances.nlargest(25, columns= pd.RangeIndex(start=0, stop=1, step=1)     )

Unnamed: 0,0
says,0.007713
of,0.006158
arrested,0.005633
police,0.004516
nation,0.00434
woman,0.00369
know,0.003394
all,0.003133
said,0.003049
what,0.00298


In [16]:
# feat_importances.to_csv('../datasets/top25.csv')

In [17]:
XX = pd.concat([df_X_train_combo_with_vader['says'],
                df_X_train_combo_with_vader['police'],
                df_X_train_combo_with_vader['arrested'],
                df_X_train_combo_with_vader['nation'],
                df_X_train_combo_with_vader['said what'],
                df_X_train_combo_with_vader['of'],
                df_X_train_combo_with_vader['new'],
                df_X_train_combo_with_vader['know'],
                df_X_train_combo_with_vader['this'],
                df_X_train_combo_with_vader['him gay'],
                df_X_train_combo_with_vader['to'],
                df_X_train_combo_with_vader['2018'],
                df_X_train_combo_with_vader['china'],
                df_X_train_combo_with_vader['all'],
                df_X_train_combo_with_vader['florida'],
                df_X_train_combo_with_vader['is'],
                df_X_train_combo_with_vader['report'],
                df_X_train_combo_with_vader['adopted'],
                df_X_train_combo_with_vader['ukraine'],
                df_X_train_combo_with_vader['woman'],
                df_X_train_combo_with_vader['has'],
                df_X_train_combo_with_vader['in'],
                df_X_train_combo_with_vader['charged'],
                df_X_train_combo_with_vader['said'],
                df_X_train_combo_with_vader['buggy'],
                df_X_train_combo_with_vader['pos'],            # above: most important words;  below: the sentiments
                df_X_train_combo_with_vader['neu'],
                df_X_train_combo_with_vader['neg'],
                df_X_train_combo_with_vader['compound_y']      # subreddit titles had the word 'compound' in them, so 
               ], axis= 1)                                     #    then VADER's 'compound' survives as this instead
features = list(XX.columns)
poly = PolynomialFeatures(degree= 4, include_bias= False)
XX_poly = poly.fit_transform(XX)
df_XX_poly = pd.DataFrame(XX_poly, columns= poly.get_feature_names(features))
df_XX_poly.shape

(7500, 40919)

In [18]:
df_X_train_cvec_vader_poly = pd.concat([df_X_train_combo_with_vader, df_XX_poly], axis= 1)

In [19]:
xx = pd.concat([df_X_test_combo_with_vader['says'],
                df_X_test_combo_with_vader['police'],
                df_X_test_combo_with_vader['arrested'],
                df_X_test_combo_with_vader['nation'],
                df_X_test_combo_with_vader['said what'],
                df_X_test_combo_with_vader['of'],
                df_X_test_combo_with_vader['new'],
                df_X_test_combo_with_vader['know'],
                df_X_test_combo_with_vader['this'],
                df_X_test_combo_with_vader['him gay'],
                df_X_test_combo_with_vader['to'],
                df_X_test_combo_with_vader['2018'],
                df_X_test_combo_with_vader['china'],
                df_X_test_combo_with_vader['all'],
                df_X_test_combo_with_vader['florida'],
                df_X_test_combo_with_vader['is'],
                df_X_test_combo_with_vader['report'],
                df_X_test_combo_with_vader['adopted'],
                df_X_test_combo_with_vader['ukraine'],
                df_X_test_combo_with_vader['woman'],
                df_X_test_combo_with_vader['has'],
                df_X_test_combo_with_vader['in'],
                df_X_test_combo_with_vader['charged'],
                df_X_test_combo_with_vader['said'],
                df_X_test_combo_with_vader['buggy'],
                df_X_test_combo_with_vader['pos'],
                df_X_test_combo_with_vader['neu'],
                df_X_test_combo_with_vader['neg'],
                df_X_test_combo_with_vader['compound_y']      
               ], axis= 1)               
features = list(xx.columns)
poly = PolynomialFeatures(degree= 4, include_bias= False)
xx_poly = poly.fit_transform(xx)
df_xx_poly = pd.DataFrame(xx_poly, columns= poly.get_feature_names(features))
df_xx_poly.shape

(2500, 40919)

In [20]:
df_X_test_cvec_vader_poly = pd.concat([df_X_test_combo_with_vader, df_xx_poly], axis= 1)

##### Dataframe prep is over;  dataframe has CV, SAI Vader, and feature engineering PolyFeatures (top 25 words plus Sentiment interacted)

### Model time with Logistic Regression

In [21]:
logreg = LogisticRegression(solver= 'lbfgs')
logreg.fit(df_X_train_cvec_vader_poly, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
logreg.score(df_X_train_cvec_vader_poly, y_train)

0.9950666666666667

In [23]:
logreg.score(df_X_test_cvec_vader_poly, y_test)

0.8432

In [24]:
cross_val_score(logreg, df_X_train_cvec_vader_poly, y_train, cv= 3).mean()

0.8333333333333334

### Regularization with Lasso, Ridge, and Elasticnet 


#### Lasso first...

In [25]:
logreg_lasso = LogisticRegression(solver= 'saga', penalty= 'l1')
logreg_lasso.fit(df_X_train_cvec_vader_poly, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
logreg_lasso.score(df_X_train_cvec_vader_poly, y_train)

0.7090666666666666

In [27]:
logreg_lasso.score(df_X_test_cvec_vader_poly, y_test)

0.6912

In [28]:
cross_val_score(logreg_lasso, df_X_test_cvec_vader_poly, y_test, cv= 3).mean()

0.6520197303695505

#### Ridge second...

In [29]:
logreg_ridge = LogisticRegression(solver= 'saga', penalty= 'l2')
logreg_ridge.fit(df_X_train_cvec_vader_poly, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
logreg_ridge.score(df_X_train_cvec_vader_poly, y_train)

0.7262666666666666

In [31]:
logreg_ridge.score(df_X_test_cvec_vader_poly, y_test)

0.698

In [32]:
cross_val_score(logreg_ridge, df_X_test_cvec_vader_poly, y_test, cv= 3).mean()

0.6572184560044273

#### Elasticnet third...

In [33]:
logreg_elasticnet = LogisticRegression(solver= 'saga', penalty= 'elasticnet', l1_ratio= 0.5)
logreg_elasticnet.fit(df_X_train_cvec_vader_poly, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.5, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
logreg_elasticnet.score(df_X_train_cvec_vader_poly, y_train)

0.7161333333333333

In [35]:
logreg_elasticnet.score(df_X_test_cvec_vader_poly, y_test)

0.694

In [36]:
cross_val_score(logreg_elasticnet, df_X_test_cvec_vader_poly, y_test, cv= 3).mean()

0.6568187757486319