# Imports

In [28]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [18]:
def classify_with_new_decision_threshold(probability, new_threshold):
    if probability > new_threshold:
        return True
    else:
        return False
    
def custom_train_test_split(df, vectorizer, X, y):
    '''Takes in original dataframe, vectorizer, sparse matrix of X values, and y values in a Series. Asks if vectorizer 
        is bigram or not and if there are sentiment scores. Sentiment scores should be stored in the original dataframe as
        "compound score".
        Returns two dataframes for each class (has_scene, no_scene), 
        train and test for X and y (X_train, X_test, y_train, y_test), 
        and the train and test dataframes for predictions (train, test).'''
    
    is_bigram = str(input('Bigram? (Y/N)'))
    if is_bigram == 'Y':
        pre_split = pd.DataFrame(X.todense(), 
                                 columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title)
        pre_split.rename(columns={'title':'title_of_movie'}, inplace=True)
        print('Bigram specified')
    else:
        print('Non-bigram')
        pre_split = pd.DataFrame(X.todense(), 
                             columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title, 
                                                                                                  rsuffix='_of_movie')
        print('Non-bigram')
    
    has_compound_score = str(input('Sentiment score? (Y/N)'))
    if has_compound_score == 'Y':
        print('Has sentiment score.')
        pre_split = pre_split.join(df.compound_score)
        
    pre_split = pre_split.fillna(0)
    no_scene_df = pre_split[pre_split.trigger_scene == False]
    has_scene_df = pre_split[pre_split.trigger_scene == True]
    
    n_self_harm = len(has_scene_df.title_of_movie.unique())
    n_no_self_harm = len(no_scene_df.title_of_movie.unique())
    percent_in_train = 0.7

    print(f'Number of movies with self-harm scenes: {n_self_harm}')
    print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

    print('----------------------------------------------------------')

    n_self_harm_in_train = round(n_self_harm * percent_in_train)
    n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

    print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
    print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')
    
    # X variables
    
    last_has_scene_movie_in_train = has_scene_df.title_of_movie.unique()[:n_self_harm_in_train][-1]
    index_of_last_has_scene_movie_in_train = (has_scene_df[has_scene_df.title_of_movie == last_has_scene_movie_in_train]
                                              .index[-1])

    has_scene_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,:-3]
    has_scene_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,:-3]

    last_no_scene_movie_in_train = no_scene_df.title_of_movie.unique()[:n_no_self_harm_in_train][-1]
    index_of_last_no_scene_movie_in_train = (no_scene_df[no_scene_df.title_of_movie == last_no_scene_movie_in_train]
                                              .index[-1])

    no_scene_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,:-3]
    no_scene_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,:-3]
    
    X_train = pd.concat([no_scene_rows_to_include_in_train, has_scene_rows_to_include_in_train])
    X_test = pd.concat([no_scene_rows_to_include_in_test, has_scene_rows_to_include_in_test])

    print(f'Number of rows in train: {len(X_train)}')
    print(f'Number of rows in test: {len(X_test)}')
    
    if (len(X_train) + len(X_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
    
    # y variable
    
    has_scene_class_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,-3]
    has_scene_class_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,-3]

    no_scene_class_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,-3]
    no_scene_class_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,-3]
    
    y_train = pd.concat([no_scene_class_rows_to_include_in_train, has_scene_class_rows_to_include_in_train])
    y_test = pd.concat([no_scene_class_rows_to_include_in_test, has_scene_class_rows_to_include_in_test])
    
    print(f'Number of rows in train: {len(y_train)}')
    print(f'Number of rows in test: {len(y_test)}')
    
    
    if (len(y_train) + len(y_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
        
    y_train = y_train.astype('bool')
    y_test = y_test.astype('bool')
    
    # movie titles
    
    has_scene_titles_in_train = has_scene_df.title_of_movie.loc[:index_of_last_has_scene_movie_in_train]
    has_scene_titles_in_test = has_scene_df.title_of_movie.loc[index_of_last_has_scene_movie_in_train + 1:]
    
    no_scene_titles_in_train = no_scene_df.title_of_movie.loc[:index_of_last_no_scene_movie_in_train]
    no_scene_titles_in_test = no_scene_df.title_of_movie.loc[index_of_last_no_scene_movie_in_train + 1:]
    
    titles_train = pd.concat([no_scene_titles_in_train, has_scene_titles_in_train])
    titles_test = pd.concat([no_scene_titles_in_test, has_scene_titles_in_test])
        
    # sentiment scores
    
    if has_compound_score == 'Y':
        has_scene_scores_in_train = has_scene_df.compound_score.loc[:index_of_last_has_scene_movie_in_train]
        has_scene_scores_in_test = has_scene_df.compound_score.loc[index_of_last_has_scene_movie_in_train + 1:]

        no_scene_scores_in_train = no_scene_df.compound_score.loc[:index_of_last_no_scene_movie_in_train]
        no_scene_scores_in_test = no_scene_df.compound_score.loc[index_of_last_no_scene_movie_in_train + 1:]

        scores_train = pd.concat([no_scene_scores_in_train, has_scene_scores_in_train])
        scores_test = pd.concat([no_scene_scores_in_test, has_scene_scores_in_test])
        
        X_train = X_train.join(scores_train)
        X_test = X_test.join(scores_test)
        
    # train and test prediction dataframes
    
    if has_compound_score == 'Y':
        train = pd.DataFrame(dict(actual=y_train, title=titles_train, score=scores_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test, score=scores_test))
    else:
        train = pd.DataFrame(dict(actual=y_train, title=titles_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test))
    
    return no_scene_df, has_scene_df, X_train, X_test, y_train, y_test, train, test

def sentiment_categorizer(sentiment_score_dictionary):
    compound_score = sentiment_score_dictionary['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score > -0.05 and compound_score < 0.05:
        return 'neutral'
    elif compound_score <= -0.05:
        return 'negative'

In [3]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...


In [4]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isna().sum()

title                      0
tweet                      0
trigger_scene              0
cleaned_text               0
stemmed_text               0
lemmatized_text            0
lemmatized_no_stopwords    0
stemmed_no_stopwords       0
dtype: int64

## Get Sentiment Score Using VADER

In [6]:
analyzer = SentimentIntensityAnalyzer()

In [8]:
df['sentiment_scores'] = df.lemmatized_text.apply(analyzer.polarity_scores)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords,sentiment_scores
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...,"{'neg': 0.173, 'neu': 0.781, 'pos': 0.046, 'co..."
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see,"{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'comp..."
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...,"{'neg': 0.062, 'neu': 0.886, 'pos': 0.052, 'co..."
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp..."
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...,"{'neg': 0.19, 'neu': 0.81, 'pos': 0.0, 'compou..."


In [10]:
df['compound_score'] = df.sentiment_scores.apply(sentiment_categorizer)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords,sentiment_scores,compound_score
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...,"{'neg': 0.173, 'neu': 0.781, 'pos': 0.046, 'co...",negative
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see,"{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'comp...",negative
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...,"{'neg': 0.062, 'neu': 0.886, 'pos': 0.052, 'co...",negative
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...",positive
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...,"{'neg': 0.19, 'neu': 0.81, 'pos': 0.0, 'compou...",negative


##### Encoding the sentiment score categories.
0 = negative
1 = neutral
2 = positive

In [30]:
encoder = LabelEncoder()
encoder.fit(df[['compound_score']])
df.compound_score = encoder.transform(df[['compound_score']])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [31]:
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords,sentiment_scores,compound_score
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...,"{'neg': 0.173, 'neu': 0.781, 'pos': 0.046, 'co...",0
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see,"{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'comp...",0
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...,"{'neg': 0.062, 'neu': 0.886, 'pos': 0.052, 'co...",0
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...",2
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...,"{'neg': 0.19, 'neu': 0.81, 'pos': 0.0, 'compou...",0


## Create Bag of Words Sparse Matrix

In [32]:
bag_of_bigrams = CountVectorizer(ngram_range=(2,2))
X = bag_of_bigrams.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [33]:
has_scene_df, no_scene_df, X_train, X_test, y_train, y_test, train, test = custom_train_test_split(df, bag_of_bigrams, X, y)

Bigram? (Y/N)Y
Bigram specified
Sentiment score? (Y/N)Y
Has sentiment score.
Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up


### Any insights from sentiment score?

In [61]:
pd.crosstab(df.trigger_scene, df.compound_score, normalize='index', margins=True)

compound_score,0,1,2
trigger_scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.44391,0.169071,0.387019
True,0.42263,0.172477,0.404893
All,0.431842,0.171002,0.397156


The distribution of the scores across the different classes seems to be the about the same. Don't expect accuracy to increase much of at all.

## Modeling
I will fit different models using different algorithms. I will also tune hyperparameters using randomized search cross validation.

### Logistic Regression

In [34]:
lr_bigrams = LogisticRegression(random_state=123)
parameters = {'C':sp.reciprocal(0.0001, 10000),
              'solver':['newton-cg', 'lbfgs', 'saga', 'liblinear']}

lr_bigrams_rs = RandomizedSearchCV(estimator=lr_bigrams, param_distributions=parameters, 
                                   n_jobs=4, random_state=123, verbose=3, n_iter=25)
lr_bigrams_rs.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  6.9min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B992ED01D0>, 'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear']},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [35]:
print(lr_bigrams_rs.best_params_)
print(lr_bigrams_rs.best_score_)

{'C': 4.3406460764187536, 'solver': 'liblinear'}
0.5897688145597639


### Decision Tree

In [36]:
dt_bigrams = DecisionTreeClassifier(random_state=123)
parameters = {'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 25, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2,50, 2)],
              'min_samples_leaf':[1, 2, 3, 4, 5]}

dt_bigrams_rs = RandomizedSearchCV(estimator=dt_bigrams, param_distributions=parameters, 
                                   n_jobs=4, n_iter=25, random_state=123, verbose=3)
dt_bigrams_rs.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   32.7s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  1.7min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best'),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'criterion': ['gini', 'entropy'], 'max_depth': [3, 14, 25], 'min_samples_split': [2, 50], 'min_samples_leaf': [1, 2, 3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [37]:
print(dt_bigrams_rs.best_params_)
print(dt_bigrams_rs.best_score_)

{'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 25, 'criterion': 'gini'}
0.5656665027053616


### Random Forest

In [38]:
rf_bigrams = RandomForestClassifier(random_state=123)
parameters = {'n_estimators':[int(x) for x in np.linspace(5, 50, 5)],
              'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 30, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2, 20, 2)],
              'min_samples_leaf':[int(x) for x in np.linspace(1, 3, 1)]}

rf_bigrams_rs = RandomizedSearchCV(estimator=rf_bigrams, param_distributions=parameters, n_jobs=4, 
                           n_iter=50, random_state=123, verbose=3)
rf_bigrams_rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   21.7s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:  2.1min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=4,
          param_distributions={'n_estimators': [5, 16, 27, 38, 50], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 16, 30], 'min_samples_split': [2, 20], 'min_samples_leaf': [1]},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [39]:
print(rf_bigrams_rs.best_params_)
print(rf_bigrams_rs.best_score_)

{'n_estimators': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'entropy'}
0.5725528775209051


### KNN

In [41]:
knneighbors_bigrams = KNeighborsClassifier()
parameters = {'n_neighbors':[3, 5, 7, 9, 13],
              'weights':['uniform', 'distance'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'],
              'p':[1, 2],
              'metric':['minkowski', 'euclidean', 'manhattan']}

knneighbors_bigrams_rs = RandomizedSearchCV(estimator=knneighbors_bigrams, param_distributions=parameters, n_jobs=4, 
                                    n_iter=10, verbose=3, random_state=123)
knneighbors_bigrams_rs.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 24.9min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=10, n_jobs=4,
          param_distributions={'n_neighbors': [3, 5, 7, 9, 13], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'p': [1, 2], 'metric': ['minkowski', 'euclidean', 'manhattan']},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [42]:
print(knneighbors_bigrams_rs.best_params_)
print(knneighbors_bigrams_rs.best_score_)

{'weights': 'uniform', 'p': 1, 'n_neighbors': 13, 'metric': 'manhattan', 'algorithm': 'brute'}
0.5459911460895229


### XGBoost

In [43]:
xgbc_bigrams = xgb.sklearn.XGBClassifier(random_state=123)
parameters = {'max_depth':[int(x) for x in np.linspace(3, 13, 2)],
              'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              'n_estimators':[int(x) for x in np.linspace(3, 15, 2)],
              'gamma':[int(x) for x in np.linspace(1, 11, 1)]}

xgbc_bigrams_rs = RandomizedSearchCV(estimator=xgbc_bigrams, param_distributions=parameters, n_jobs=4, n_iter=25,
                             verbose=3, random_state=123)
xgbc_bigrams_rs.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed: 17.6min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=123, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'max_depth': [3, 13], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [3, 15], 'gamma': [1]},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [44]:
print(xgbc_bigrams_rs.best_params_)
print(xgbc_bigrams_rs.best_score_)

{'n_estimators': 15, 'max_depth': 13, 'learning_rate': 0.2, 'gamma': 1}
0.559272011805214


### Logistic Regression performed the best.
0.58 accuracy score.

In [45]:
train.head()

Unnamed: 0,actual,title,score
0,False,spiderman_far_from_home,0.0
1,False,spiderman_far_from_home,0.0
2,False,spiderman_far_from_home,0.0
3,False,spiderman_far_from_home,2.0
4,False,spiderman_far_from_home,0.0


### Overall accuracy on train set.

In [47]:
accuracy_score(train.actual, lr_bigrams_rs.predict(X_train))

0.9872110181997049

In [48]:
train['predictions'] = lr_bigrams_rs.predict(X_train)
train.head()

Unnamed: 0,actual,title,score,predictions
0,False,spiderman_far_from_home,0.0,False
1,False,spiderman_far_from_home,0.0,False
2,False,spiderman_far_from_home,0.0,False
3,False,spiderman_far_from_home,2.0,False
4,False,spiderman_far_from_home,0.0,False


### Evaluate model on test set.

In [62]:
test['predictions'] = lr_bigrams_rs.predict(X_test)
accuracy_score(test.actual, test.predictions)

0.5870588235294117

## Addition of sentiment score did not improve accuracy.
It actually hurt accuracy.