## CountVectorizer and VADER (SentimentIntensityAnalyzer) Features with Logistic Regression

In [1]:
# do the imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from nltk.corpus import stopwords
nltk_stops = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the model-fitting cells.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide / long frames are not truncated.
for option_name, option_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

##### The vaderized dataframe has the complete 10,000 posts plus the sentiment info bolted on.

In [2]:
# Location of the pre-computed frame, relative to this notebook.
vaderized_csv_path = '../../datasets/df_vaderized.csv'
df_vaderized = pd.read_csv(vaderized_csv_path)

##### Split the data, vectorize the words with CountVectorizer, and bolt the result onto the split dataframes

In [3]:
# Predictor: the post title text. Target: the 'is_onion' label column.
X = df_vaderized['title']
y = df_vaderized['is_onion']

# Stratify on the label and pin the seed so the split is reproducible;
# the train/test proportions are left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Bag-of-n-grams vectorizer: 1- to 4-grams, no stop-word removal,
# vocabulary capped at 60,000 terms.
cvec = CountVectorizer(
    stop_words=None,
    ngram_range=(1, 4),
    max_features=60_000,
)

In [4]:
# Learn the vocabulary from the training titles only, then count terms.
X_train_cvec = cvec.fit_transform(raw_documents=X_train)

# Re-use that fitted vocabulary for the held-out titles (no refit).
X_test_cvec = cvec.transform(raw_documents=X_test)

In [5]:
# Turn the sparse document-term matrices into DataFrames with one column per
# vocabulary term.
# NOTE: this densifies the matrices, so memory use grows with
# (number of rows) x (vocabulary size).

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one this installation provides, and
# resolve the names once instead of once per frame.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_feature_names = cvec.get_feature_names_out()
else:
    cvec_feature_names = cvec.get_feature_names()

# .toarray() gives a plain ndarray; .todense() returns the legacy np.matrix type.
df_X_train_cvec = pd.DataFrame(X_train_cvec.toarray(), columns=cvec_feature_names)
df_X_test_cvec = pd.DataFrame(X_test_cvec.toarray(), columns=cvec_feature_names)

In [6]:
# Move the original row labels into an 'index' column; the frame then has a
# fresh 0..n-1 index that lines up positionally with the count matrix rows.
df_X_train = X_train.reset_index()

# Side-by-side join: ['index', 'title'] followed by one column per term.
# NOTE(review): if the learned vocabulary contains the token 'index' or
# 'title', this produces duplicate column names — worth confirming.
df_X_train_combo = pd.concat(objs=[df_X_train, df_X_train_cvec], axis='columns')

##### The sentiment columns have to be bolted back on via an index merge, because the stratified train-test split shuffled the original row order

In [7]:
# Drop the 'Unnamed: 0' column (presumably the row index written out when the
# CSV was saved — confirm), then expose the current row labels as an 'index'
# column so they can serve as a merge key.
# NOTE: this reassigns df_vaderized, so the cell cannot be run a second time
# without reloading the CSV ('Unnamed: 0' is gone and 'index' already exists).
df_vaderized = (
    df_vaderized
    .drop(columns=['Unnamed: 0'])
    .reset_index()
)

In [8]:
# Look up each training row's remaining df_vaderized columns by its original
# row label. A left join keeps the training rows in their current order.
# 'title' exists on both sides, so it comes out as title_x / title_y.
df_X_train_combo_with_vader = df_X_train_combo.merge(
    right=df_vaderized,
    how='left',
    left_on='index',
    right_on='index',
)

In [9]:
# Remove the merge key, both copies of the raw title text, and the target
# label so only model features remain.
df_X_train_combo_with_vader = df_X_train_combo_with_vader.drop(
    columns=['index', 'title_x', 'title_y', 'is_onion'],
)

In [10]:
# Same preparation as for the training rows: keep the original row labels in
# an 'index' column and renumber the rows 0..n-1.
df_X_test = X_test.reset_index()

# Side-by-side join: ['index', 'title'] followed by one column per term.
df_X_test_combo = pd.concat(objs=[df_X_test, df_X_test_cvec], axis='columns')

In [11]:
# Look up each test row's remaining df_vaderized columns by its original row
# label; the left join keeps the test rows in their current order.
df_X_test_combo_with_vader = df_X_test_combo.merge(
    right=df_vaderized,
    how='left',
    left_on='index',
    right_on='index',
)

In [12]:
# Remove the merge key, both copies of the raw title text, and the target
# label so the test frame has the same feature columns as the training frame.
df_X_test_combo_with_vader = df_X_test_combo_with_vader.drop(
    columns=['index', 'title_x', 'title_y', 'is_onion'],
)

#### Finally fit the Logistic Regression model

In [13]:
# C is the inverse regularisation strength, so C=1e9 leaves the penalty
# effectively switched off.
# NOTE(review): max_iter is left at its default and warnings are silenced at
# the top of the notebook, so a non-convergence warning from lbfgs would not
# be visible here — confirm the solver actually converged.
logreg = LogisticRegression(C=1e9, solver='lbfgs')
logreg.fit(X=df_X_train_combo_with_vader, y=y_train)


LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Mean accuracy on the rows the model was fitted on.
logreg.score(X=df_X_train_combo_with_vader, y=y_train)

0.9988

In [15]:
# Mean accuracy on the held-out rows.
logreg.score(X=df_X_test_combo_with_vader, y=y_test)

0.8444

In [16]:
# Average accuracy over 3 cross-validation folds of the training features.
# NOTE: the CountVectorizer vocabulary was learned from all of X_train before
# this call, so each validation fold's titles already shaped the columns.
cross_val_score(
    estimator=logreg,
    X=df_X_train_combo_with_vader,
    y=y_train,
    cv=3,
).mean()

0.8283999999999999