# Imports

In [36]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [8]:
def classify_with_new_decision_threshold(probability, new_threshold):
    if probability > new_threshold:
        return True
    else:
        return False

def sentiment_categorizer(sentiment_score_dictionary):
    compound_score = sentiment_score_dictionary['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score > -0.05 and compound_score < 0.05:
        return 'neutral'
    elif compound_score <= -0.05:
        return 'negative'

## Acquisition

In [5]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)

In [6]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

df.dropna(inplace=True)
df.isna().sum()

## Creating Bag of Words sparse matrix

In [9]:
bag_of_words = CountVectorizer()
X = bag_of_words.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [18]:
sparse_matrix = pd.DataFrame(X.todense(), columns=bag_of_words.get_feature_names())
sparse_matrix.head()

Unnamed: 0,007,00s,010,02,03,0539,06,08,0806,08230,...,zoe,zombie,zone,zooey,zoolander,zootopia,zorx,zowee,zu,zune
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Combining features into a tuple

df.values gives an array of lists of values in each row. The following list comprehension goes through each row and combines the values into a tuple.

In [31]:
# features = [tuple(x) for x in sparse_matrix.values]

## Splitting into a train and test set.

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, stratify=y)

## Creating the model

In [37]:
mnb = MultinomialNB()
parameters = {'alpha':np.linspace(1, 50, 1)}

mnb_rs = RandomizedSearchCV(estimator=mnb, param_distributions=parameters, n_jobs=4, n_iter=25, random_state=123)
mnb_rs.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'alpha': array([1.])},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [38]:
print(mnb_rs.best_params_)
print(mnb_rs.best_score_)

{'alpha': 1.0}
0.5975948196114709


##### Best score during cross validation (splitting train set into smaller subsets) was 0.59

In [41]:
train = pd.DataFrame(dict(actual=y_train))
train['mnb_predictions'] = mnb_rs.predict(X_train)
train.head()

Unnamed: 0,actual,mnb_predictions
2469,True,True
686,False,False
656,False,False
1096,False,False
1160,False,False


In [44]:
accuracy = accuracy_score(train.actual, train.mnb_predictions)
print(f'Accuracy score: {accuracy*100:.2f}%')

Accuracy score: 94.63%


##### Checking the number of false positives, false negatives, and how often the model predicted each class.

In [45]:
pd.crosstab(train.actual, train.mnb_predictions)

mnb_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,866,70
True,46,1180


##### Score when using the whole train set was 0.94

### Evaluating on test set

In [39]:
accuracy_score(y_test, mnb_rs.predict(X_test))

0.5866851595006934

In [46]:
test = pd.DataFrame(dict(actual=y_test, mnb_predictions=mnb_rs.predict(X_test)))
test.head()

Unnamed: 0,actual,mnb_predictions
756,False,True
1593,True,False
1323,True,False
795,False,False
2355,True,True


##### Checking false positives, false negatives, and frequency of class predictions

In [47]:
pd.crosstab(test.actual, test.mnb_predictions)

mnb_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,161,151
True,147,262


### 58% accuracy overall.
Not that great, I had better accuracy from other models. I will try with bigrams, that gave me better accuracy with previous models.

## Bag of words with bigrams

In [48]:
bag_of_words = CountVectorizer(ngram_range=(2,2))
X = bag_of_words.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

## Train test split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, stratify=y)

## Create model

In [50]:
mnb = MultinomialNB()
parameters = {'alpha':np.linspace(1, 50, 1)}

mnb_bigrams_rs = RandomizedSearchCV(estimator=mnb, param_distributions=parameters, n_jobs=4, n_iter=25, random_state=123)
mnb_bigrams_rs.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'alpha': array([1.])},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [51]:
print(mnb_bigrams_rs.best_params_)
print(mnb_bigrams_rs.best_score_)

{'alpha': 1.0}
0.5115633672525439


In [54]:
train['mnb_bigrams_predictions'] = mnb_bigrams_rs.predict(X_train)
train.head()

Unnamed: 0,actual,mnb_predictions,mnb__bigrams_predictions,mnb_bigrams_predictions
2469,True,True,True,True
686,False,False,False,False
656,False,False,False,False
1096,False,False,False,False
1160,False,False,False,False


##### Checking accuracy on entire train set

In [55]:
accuracy = accuracy_score(train.actual, train.mnb_bigrams_predictions)
print(f'Accuracy score: {accuracy*100:.2f}%')

Accuracy score: 97.50%


##### Checking false positives, false negatives

In [60]:
pd.crosstab(train.actual, train.mnb_bigrams_predictions)

mnb_bigrams_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,909,27
True,27,1199


### Evaluating on test set

In [63]:
accuracy_score(y_test, mnb_bigrams_rs.predict(X_test))

0.4895977808599168

## Nothing really improved when using Naive Bayes.
I realize that I haven't 'tallied up the votes' for each movie like it did for previous models. From what I've seen, doing so improves accuracy by about 3 or 4 %. Improving the accuracy by that amount would put this model on par with the previous models, so I will simply accept the other model as my best model.