# Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp

In [95]:
def classify_with_new_decision_threshold(probability, new_threshold):
    if probability > new_threshold:
        return True
    else:
        return False
    
def custom_train_test_split(df, vectorizer, X, y):
    '''Takes in original dataframe, vectorizer, sparse matrix of X values, and y values in a Series. Asks if vectorizer 
        is bigram or not and if there are sentiment scores. Sentiment scores should be stored in the original dataframe as
        "compound score".
        Returns two dataframes for each class (has_scene, no_scene), 
        train and test for X and y (X_train, X_test, y_train, y_test), 
        and the train and test dataframes for predictions (train, test).'''
    
    is_bigram = str(input('Bigram? (Y/N)'))
    if is_bigram == 'Y':
        pre_split = pd.DataFrame(X.todense(), 
                                 columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title)
        pre_split.rename(columns={'title':'title_of_movie'}, inplace=True)
        print('Bigram specified')
    else:
        print('Non-bigram')
        pre_split = pd.DataFrame(X.todense(), 
                             columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title, 
                                                                                                  rsuffix='_of_movie')
        print('Non-bigram')
    
    has_compound_score = str(input('Sentiment score? (Y/N)'))
    if has_compound_score == 'Y':
        print('Has sentiment score.')
        pre_split = pre_split.join(df.compound_score)
        
    pre_split = pre_split.fillna(0)
    no_scene_df = pre_split[pre_split.trigger_scene == False]
    has_scene_df = pre_split[pre_split.trigger_scene == True]
    
    n_self_harm = len(has_scene_df.title_of_movie.unique())
    n_no_self_harm = len(no_scene_df.title_of_movie.unique())
    percent_in_train = 0.7

    print(f'Number of movies with self-harm scenes: {n_self_harm}')
    print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

    print('----------------------------------------------------------')

    n_self_harm_in_train = round(n_self_harm * percent_in_train)
    n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

    print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
    print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')
    
    # X variables
    
    last_has_scene_movie_in_train = has_scene_df.title_of_movie.unique()[:n_self_harm_in_train][-1]
    index_of_last_has_scene_movie_in_train = (has_scene_df[has_scene_df.title_of_movie == last_has_scene_movie_in_train]
                                              .index[-1])

    has_scene_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,:-2]
    has_scene_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,:-2]

    last_no_scene_movie_in_train = no_scene_df.title_of_movie.unique()[:n_no_self_harm_in_train][-1]
    index_of_last_no_scene_movie_in_train = (no_scene_df[no_scene_df.title_of_movie == last_no_scene_movie_in_train]
                                              .index[-1])

    no_scene_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,:-2]
    no_scene_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,:-2]
    
    X_train = pd.concat([no_scene_rows_to_include_in_train, has_scene_rows_to_include_in_train])
    X_test = pd.concat([no_scene_rows_to_include_in_test, has_scene_rows_to_include_in_test])

    print(f'Number of rows in train: {len(X_train)}')
    print(f'Number of rows in test: {len(X_test)}')
    
    if (len(X_train) + len(X_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
    
    # y variable
    
    has_scene_class_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,-2]
    has_scene_class_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,-2]

    no_scene_class_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,-2]
    no_scene_class_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,-2]
    
    y_train = pd.concat([no_scene_class_rows_to_include_in_train, has_scene_class_rows_to_include_in_train])
    y_test = pd.concat([no_scene_class_rows_to_include_in_test, has_scene_class_rows_to_include_in_test])
    
    print(f'Number of rows in train: {len(y_train)}')
    print(f'Number of rows in test: {len(y_test)}')
    
    
    if (len(y_train) + len(y_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
        
    y_train = y_train.astype('bool')
    y_test = y_test.astype('bool')
    
    # movie titles
    
    has_scene_titles_in_train = has_scene_df.title_of_movie.loc[:index_of_last_has_scene_movie_in_train]
    has_scene_titles_in_test = has_scene_df.title_of_movie.loc[index_of_last_has_scene_movie_in_train + 1:]
    
    no_scene_titles_in_train = no_scene_df.title_of_movie.loc[:index_of_last_no_scene_movie_in_train]
    no_scene_titles_in_test = no_scene_df.title_of_movie.loc[index_of_last_no_scene_movie_in_train + 1:]
    
    titles_train = pd.concat([no_scene_titles_in_train, has_scene_titles_in_train])
    titles_test = pd.concat([no_scene_titles_in_test, has_scene_titles_in_test])
        
    # sentiment scores
    
    if has_compound_score == 'Y':
        has_scene_scores_in_train = has_scene_df.compound_score.loc[:index_of_last_has_scene_movie_in_train]
        has_scene_scores_in_test = has_scene_df.compound_score.loc[index_of_last_has_scene_movie_in_train + 1:]

        no_scene_scores_in_train = no_scene_df.compound_score.loc[:index_of_last_no_scene_movie_in_train]
        no_scene_scores_in_test = no_scene_df.compound_score.loc[index_of_last_no_scene_movie_in_train + 1:]

        scores_train = pd.concat([no_scene_scores_in_train, has_scene_scores_in_train])
        scores_test = pd.concat([no_scene_scores_in_test, has_scene_scores_in_test])
        
        X_train = X_train.join(scores_train)
        X_test = X_test.join(scores_test)
        
    # train and test prediction dataframes
    
    if has_compound_score == 'Y':
        train = pd.DataFrame(dict(actual=y_train, title=titles_train, score=scores_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test, score=scores_test))
    else:
        train = pd.DataFrame(dict(actual=y_train, title=titles_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test))
    
    return no_scene_df, has_scene_df, X_train, X_test, y_train, y_test, train, test

In [3]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...


In [4]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isna().sum()

title                      0
tweet                      0
trigger_scene              0
cleaned_text               0
stemmed_text               0
lemmatized_text            0
lemmatized_no_stopwords    0
stemmed_no_stopwords       0
dtype: int64

### Create Bag of Words

In [6]:
bag_of_words = CountVectorizer()
X = bag_of_words.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [7]:
has_scene_df, no_scene_df, X_train, X_test, y_train, y_test, train, test = custom_train_test_split(df, bag_of_words, X, y)

Bigram? (Y/N)N
Non-bigram
Non-bigram
Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up


### Modeling

### Logistic Regression

In [8]:
lr = LogisticRegression(random_state=123)
parameters = {'C':sp.reciprocal(0.0001, 10000),
              'solver':['newton-cg', 'lbfgs', 'saga', 'liblinear']}

lr_rs = RandomizedSearchCV(estimator=lr, param_distributions=parameters, n_jobs=4, random_state=123)
lr_rs.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
print(lr_rs.best_params_)
print(lr_rs.best_score_)

### Decision Tree

In [None]:
dt = DecisionTreeClassifier(random_state=123)
parameters = {'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 25, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2,50, 2)],
              'min_samples_leaf':[1, 2, 3, 4, 5]}

dt_rs = RandomizedSearchCV(estimator=dt, param_distributions=parameters, n_jobs=4, n_iter=25, random_state=123)
dt_rs.fit(X_train, y_train)

In [None]:
print(dt_rs.best_params_)
print(dt_rs.best_score_)

### Random Forest

In [9]:
rf = RandomForestClassifier(random_state=123)
parameters = {'n_estimators':[int(x) for x in np.linspace(5, 50, 5)],
              'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 30, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2, 20, 2)],
              'min_samples_leaf':[int(x) for x in np.linspace(1, 3, 1)]}

rf_rs = RandomizedSearchCV(estimator=rf, param_distributions=parameters, n_jobs=4, n_iter=50, random_state=123)
rf_rs.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=4,
          param_distributions={'n_estimators': [5, 16, 27, 38, 50], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 16, 30], 'min_samples_split': [2, 20], 'min_samples_leaf': [1]},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [10]:
print(rf_rs.best_params_)
print(rf_rs.best_score_)

{'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'gini'}
0.5804230201672406


### K-Nearest Neighbors

In [56]:
knneighbors = KNeighborsClassifier()
parameters = {'n_neighbors':[3, 5, 7, 9, 13],
              'weights':['uniform', 'distance'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'],
              'p':[1, 2],
              'metric':['minkowski', 'euclidean', 'manhattan']}

knneighbors_rs = RandomizedSearchCV(estimator=knneighbors, param_distributions=parameters, n_jobs=4, 
                                    n_iter=10, verbose=10, random_state=123)
knneighbors_rs.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  6.7min
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed: 10.4min remaining:  1.2min
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 11.4min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=10, n_jobs=4,
          param_distributions={'n_neighbors': [3, 5, 7, 9, 13], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'p': [1, 2], 'metric': ['minkowski', 'euclidean', 'manhattan']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [57]:
print(knneighbors_rs.best_params_)
print(knneighbors_rs.best_score_)

{'weights': 'distance', 'p': 1, 'n_neighbors': 13, 'metric': 'euclidean', 'algorithm': 'brute'}
0.5425479586817511


### XGBoost

In [59]:
xgbc = xgb.sklearn.XGBClassifier(random_state=123)
parameters = {'max_depth':[int(x) for x in np.linspace(3, 13, 2)],
              'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              'n_estimators':[int(x) for x in np.linspace(3, 15, 2)],
              'gamma':[int(x) for x in np.linspace(1, 11, 1)]}

xgbc_rs = RandomizedSearchCV(estimator=xgbc, param_distributions=parameters, n_jobs=4, n_iter=25,
                             verbose=3, random_state=123)
xgbc_rs.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:  4.1min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=123, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'max_depth': [3, 13], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [3, 15], 'gamma': [1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [62]:
print(xgbc_rs.best_params_)
print(xgbc_rs.best_score_)

{'n_estimators': 3, 'max_depth': 13, 'learning_rate': 0.1, 'gamma': 1}
0.5779636005902608


### Random Forest was the best with an accuracy of 0.58
Moving forward to decision threshold adjustment

In [11]:
train['rf_predictions'] = rf_rs.predict(X_train)
train.head()

Unnamed: 0,actual,title,rf_predictions
0,False,spiderman_far_from_home,False
1,False,spiderman_far_from_home,True
2,False,spiderman_far_from_home,False
3,False,spiderman_far_from_home,True
4,False,spiderman_far_from_home,True


In [12]:
pd.crosstab(train.actual, train.rf_predictions)

rf_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,347,553
True,1,1132


In [13]:
train['rf_probabilities'] = rf_rs.predict_proba(X_train)[:,1]
train.head()

Unnamed: 0,actual,title,rf_predictions,rf_probabilities
0,False,spiderman_far_from_home,False,0.448312
1,False,spiderman_far_from_home,True,0.548708
2,False,spiderman_far_from_home,False,0.31298
3,False,spiderman_far_from_home,True,0.547394
4,False,spiderman_far_from_home,True,0.595494


In [14]:
train['rf_predictions_at_0_57'] = train.rf_probabilities.apply(classify_with_new_decision_threshold, new_threshold=0.57)
train.head()

Unnamed: 0,actual,title,rf_predictions,rf_probabilities,rf_predictions_at_0_57
0,False,spiderman_far_from_home,False,0.448312,False
1,False,spiderman_far_from_home,True,0.548708,False
2,False,spiderman_far_from_home,False,0.31298,False
3,False,spiderman_far_from_home,True,0.547394,False
4,False,spiderman_far_from_home,True,0.595494,True


In [15]:
accuracy_score(train.actual, train.rf_predictions_at_0_57)

0.8652238071815052

##### Accuracy prior to tally is 0.85.
Now I will tally up the votes and check accuracy again.

In [16]:
counts_rf = train.groupby(['title', 'rf_predictions_at_0_57']).title.count().unstack().fillna(0)
counts_rf.head()

rf_predictions_at_0_57,False,True
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6.0,0.0
1408,1.0,16.0
237,0.0,20.0
28_days_later,0.0,1.0
3_idiots,0.0,4.0


In [17]:
counts_rf['final_prediction'] = counts_rf.idxmax(axis=1)
counts_rf.head()

rf_predictions_at_0_57,False,True,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6.0,0.0,False
1408,1.0,16.0,True
237,0.0,20.0,True
28_days_later,0.0,1.0,True
3_idiots,0.0,4.0,True


In [18]:
results_df = pd.concat([train.groupby('title').actual.max(), counts_rf.final_prediction], axis=1)
results_df.head()

Unnamed: 0_level_0,actual,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,False,False
1408,True,True
237,True,True
28_days_later,True,True
3_idiots,True,True


In [19]:
accuracy_score(results_df.actual, results_df.final_prediction)

0.9795918367346939

##### Final accuracy on train set: 0.98
Now to evaluate the model on the test set.

First, I'll take a look at the test dataframe.

In [20]:
test.head()

Unnamed: 0,actual,title
900,False,the_neverending_story
901,False,the_neverending_story
902,False,the_neverending_story
903,False,the_neverending_story
904,False,the_neverending_story


In [21]:
test['rf_predictions'] = rf_rs.predict(X_test)
accuracy_score(test.actual, test.rf_predictions)

0.5470588235294118

##### Initial accuracy before decision threshold adjustment and before tally is 54%

In [22]:
test['rf_probabilities'] = rf_rs.predict_proba(X_test)[:,1]
test.head()

Unnamed: 0,actual,title,rf_predictions,rf_probabilities
900,False,the_neverending_story,True,0.520333
901,False,the_neverending_story,True,0.603513
902,False,the_neverending_story,True,0.622324
903,False,the_neverending_story,False,0.437431
904,False,the_neverending_story,True,0.635616


In [23]:
test['rf_predictions_at_0_57'] = test.rf_probabilities.apply(classify_with_new_decision_threshold, new_threshold=0.57)
accuracy_score(test.actual, test.rf_predictions_at_0_57)

0.5152941176470588

##### Accuracy actually goes down when adjusting the decision threshold.

In [24]:
counts_rf_test = test.groupby(['title', 'rf_predictions_at_0_57']).title.count().unstack().fillna(0)
counts_rf_test.head()

rf_predictions_at_0_57,False,True
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.0
always_be_my_maybe,0.0,4.0
bad_times_at_the_el_royale,0.0,2.0
booksmart,1.0,5.0
crazy_rich_asians,6.0,11.0


In [25]:
counts_rf_test['final_prediction'] = counts_rf_test.idxmax(axis=1)
counts_rf_test.head()

rf_predictions_at_0_57,False,True,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.0,False
always_be_my_maybe,0.0,4.0,True
bad_times_at_the_el_royale,0.0,2.0,True
booksmart,1.0,5.0,True
crazy_rich_asians,6.0,11.0,True


In [26]:
results_df_test = pd.concat([test.groupby('title').actual.max(), counts_rf_test.final_prediction], axis=1)
results_df_test.head()

Unnamed: 0_level_0,actual,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,False,False
always_be_my_maybe,False,True
bad_times_at_the_el_royale,False,True
booksmart,False,True
crazy_rich_asians,False,True


In [27]:
accuracy_score(results_df_test.actual, results_df_test.final_prediction)

0.5555555555555556

##### Final Accuracy: 0.55
Not good. I will try with bigrams now.

## Create Bag of Bigrams

In [28]:
bag_of_bigrams = CountVectorizer(ngram_range=(2,2))
X = bag_of_bigrams.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

### Split into train and test sets
Along with creating a train and test dataframe.

In [29]:
has_scene_df, no_scene_df, X_train, X_test, y_train, y_test, train, test = custom_train_test_split(df, bag_of_bigrams, X, y)

Bigram? (Y/N)Y
Bigram specified
Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up


### Modeling
### Logistic regression

In [30]:
lr_bigrams = LogisticRegression(random_state=123)
parameters = {'C':sp.reciprocal(0.0001, 10000),
              'solver':['newton-cg', 'lbfgs', 'saga', 'liblinear']}

lr_bigrams_rs = RandomizedSearchCV(estimator=lr_bigrams, param_distributions=parameters, 
                                   n_jobs=4, random_state=123, verbose=3, n_iter=25)
lr_bigrams_rs.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  7.4min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FA5ED6D748>, 'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear']},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring=None, verbose=3)

In [31]:
print(lr_bigrams_rs.best_params_)
print(lr_bigrams_rs.best_score_)

{'C': 0.26777338551129726, 'solver': 'saga'}
0.5927201180521396


### Decision Tree

In [None]:
dt_bigrams = DecisionTreeClassifier(random_state=123)
parameters = {'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 25, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2,50, 2)],
              'min_samples_leaf':[1, 2, 3, 4, 5]}

dt_bigrams_rs = RandomizedSearchCV(estimator=dt_bigrams, param_distributions=parameters, 
                                   n_jobs=4, n_iter=25, random_state=123, verbose=3)
dt_bigrams_rs.fit(X_train, y_train)

In [None]:
print(dt_bigrams_rs.best_params_)
print(dt_bigrams_rs.best_score_)

### Random Forest

In [None]:
rf_bigrams = RandomForestClassifier(random_state=123)
parameters = {'n_estimators':[int(x) for x in np.linspace(5, 50, 5)],
              'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 30, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2, 20, 2)],
              'min_samples_leaf':[int(x) for x in np.linspace(1, 3, 1)]}

rf_bigrams_rs = RandomizedSearchCV(estimator=rf_bigrams, param_distributions=parameters, n_jobs=4, 
                           n_iter=50, random_state=123, verbose=3)
rf_bigrams_rs.fit(X_train, y_train)

In [None]:
print(rf_bigrams_rs.best_params_)
print(rf_bigrams_rs.best_score_)

### K-Nearest Neighbors

In [None]:
knneighbors_bigrams = KNeighborsClassifier()
parameters = {'n_neighbors':[3, 5, 7, 9, 13],
              'weights':['uniform', 'distance'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'],
              'p':[1, 2],
              'metric':['minkowski', 'euclidean', 'manhattan']}

knneighbors_bigrams_rs = RandomizedSearchCV(estimator=knneighbors_bigrams, param_distributions=parameters, n_jobs=4, 
                                    n_iter=10, verbose=3, random_state=123)
knneighbors_bigrams_rs.fit(X_train, y_train)

In [None]:
print(knneighbors_bigrams_rs.best_params_)
print(knneighbors_bigrams_rs.best_score_)

### XGBoost

In [None]:
xgbc_bigrams = xgb.sklearn.XGBClassifier(random_state=123)
parameters = {'max_depth':[int(x) for x in np.linspace(3, 13, 2)],
              'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              'n_estimators':[int(x) for x in np.linspace(3, 15, 2)],
              'gamma':[int(x) for x in np.linspace(1, 11, 1)]}

xgbc_bigrams_rs = RandomizedSearchCV(estimator=xgbc_bigrams, param_distributions=parameters, n_jobs=4, n_iter=25,
                             verbose=3, random_state=123)
xgbc_bigrams_rs.fit(X_train, y_train)

In [None]:
print(xgbc_bigrams_rs.best_params_)
print(xgbc_bigrams_rs.best_score_)

### Logistic Regression performed the best with 0.59 accuracy.

In [32]:
train.head()

Unnamed: 0,actual,title
0,False,spiderman_far_from_home
1,False,spiderman_far_from_home
2,False,spiderman_far_from_home
3,False,spiderman_far_from_home
4,False,spiderman_far_from_home


In [33]:
train['lr_bigrams_predictions'] = lr_bigrams_rs.predict(X_train)
train.head()

Unnamed: 0,actual,title,lr_bigrams_predictions
0,False,spiderman_far_from_home,False
1,False,spiderman_far_from_home,True
2,False,spiderman_far_from_home,False
3,False,spiderman_far_from_home,False
4,False,spiderman_far_from_home,False


In [34]:
accuracy_score(train.actual, train.lr_bigrams_predictions)

0.9650762420068864

In [35]:
pd.crosstab(train.actual, train.lr_bigrams_predictions)

lr_bigrams_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,832,68
True,3,1130


##### Apparently, this model doesn't need the decision threshold to be adjusted.
I had originally adjusted the threshold because majority of the predictions were True, which wasn't very useful. Here though, there is a better split in predicitons between the two classes. I will move on without making an adjustment.

In [36]:
counts_df_bigrams_train = train.groupby(['title', 'lr_bigrams_predictions']).title.count().unstack().fillna(0)
counts_df_bigrams_train['final_prediction'] = counts_df_bigrams_train.idxmax(axis=1)
counts_df_bigrams_train.head()

lr_bigrams_predictions,False,True,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6.0,0.0,False
1408,0.0,17.0,True
237,0.0,20.0,True
28_days_later,0.0,1.0,True
3_idiots,0.0,4.0,True


In [37]:
results_df_bigrams_train = pd.concat([train.groupby('title').actual.max(), counts_df_bigrams_train.final_prediction],
                                     axis=1)
results_df_bigrams_train.head()

Unnamed: 0_level_0,actual,final_prediction
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,False,False
1408,True,True
237,True,True
28_days_later,True,True
3_idiots,True,True


In [38]:
accuracy_score(results_df_bigrams_train.actual, results_df_bigrams_train.final_prediction)

0.9931972789115646

### Final accuracy on train set is 0.99
### Evaluate model performance on test set.

In [39]:
test.head()

Unnamed: 0,actual,title
900,False,the_neverending_story
901,False,the_neverending_story
902,False,the_neverending_story
903,False,the_neverending_story
904,False,the_neverending_story


In [40]:
test['lr_bigrams_predictions'] = lr_bigrams_rs.predict(X_test)
test.head()

Unnamed: 0,actual,title,lr_bigrams_predictions
900,False,the_neverending_story,True
901,False,the_neverending_story,True
902,False,the_neverending_story,True
903,False,the_neverending_story,True
904,False,the_neverending_story,True


In [41]:
counts_df_bigrams_test = test.groupby(['title', 'lr_bigrams_predictions']).title.count().unstack().fillna(0)
counts_df_bigrams_test['final_prediciton'] = counts_df_bigrams_test.idxmax(axis=1)
counts_df_bigrams_test.head()

lr_bigrams_predictions,False,True,final_prediciton
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.0,False
always_be_my_maybe,0.0,4.0,True
bad_times_at_the_el_royale,0.0,2.0,True
booksmart,0.0,6.0,True
crazy_rich_asians,1.0,16.0,True


In [42]:
results_df_bigrams_test = pd.concat([test.groupby('title').actual.max(), counts_df_bigrams_test.final_prediciton], axis=1)
results_df_bigrams_test.head()

Unnamed: 0_level_0,actual,final_prediciton
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,False,False
always_be_my_maybe,False,True
bad_times_at_the_el_royale,False,True
booksmart,False,True
crazy_rich_asians,False,True


In [43]:
accuracy_score(results_df_bigrams_test.actual, results_df_bigrams_test.final_prediciton)

0.6349206349206349

In [44]:
pd.crosstab(test.actual, test.lr_bigrams_predictions)

lr_bigrams_predictions,False,True
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
False,16,339
True,13,482


### Final accuracy on test set: 0.63
##### When evaluating on the test set, I can see that the model is predicting True most of the time.
Threshold adjustment is required after all.

In [45]:
test.head()

Unnamed: 0,actual,title,lr_bigrams_predictions
900,False,the_neverending_story,True
901,False,the_neverending_story,True
902,False,the_neverending_story,True
903,False,the_neverending_story,True
904,False,the_neverending_story,True


In [46]:
test['lr_probabilities'] = lr_bigrams_rs.predict_proba(X_test)[:,1]
test.head()

Unnamed: 0,actual,title,lr_bigrams_predictions,lr_probabilities
900,False,the_neverending_story,True,0.58423
901,False,the_neverending_story,True,0.58423
902,False,the_neverending_story,True,0.605539
903,False,the_neverending_story,True,0.573666
904,False,the_neverending_story,True,0.537602


In [47]:
test['predictions_at_0_57'] = test.lr_probabilities.apply(classify_with_new_decision_threshold, new_threshold=0.57)
test.head()

Unnamed: 0,actual,title,lr_bigrams_predictions,lr_probabilities,predictions_at_0_57
900,False,the_neverending_story,True,0.58423,True
901,False,the_neverending_story,True,0.58423,True
902,False,the_neverending_story,True,0.605539,True
903,False,the_neverending_story,True,0.573666,True
904,False,the_neverending_story,True,0.537602,False


In [48]:
counts_df_bigrams_test = test.groupby(['title', 'predictions_at_0_57']).title.count().unstack().fillna(0)
counts_df_bigrams_test['final_prediciton'] = counts_df_bigrams_test.idxmax(axis=1)
counts_df_bigrams_test.head()

predictions_at_0_57,False,True,final_prediciton
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.0,False
always_be_my_maybe,0.0,4.0,True
bad_times_at_the_el_royale,0.0,2.0,True
booksmart,1.0,5.0,True
crazy_rich_asians,4.0,13.0,True


In [49]:
results_df_bigrams_test = pd.concat([test.groupby('title').actual.max(), counts_df_bigrams_test.final_prediciton], axis=1)
results_df_bigrams_test.head()

Unnamed: 0_level_0,actual,final_prediciton
title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,False,False
always_be_my_maybe,False,True
bad_times_at_the_el_royale,False,True
booksmart,False,True
crazy_rich_asians,False,True


In [50]:
accuracy_score(results_df_bigrams_test.actual, results_df_bigrams_test.final_prediciton)

0.6190476190476191

##### Adjusting the decision threshold actually leads to decreased accuracy.
I'll leave it prior to decision threshold adjustment.

### This Bag of Words model performed the same as the best TFIDF model.
I will use this Bag of Words model since it doesn't require a threshold adjustment. I will try to incorporate sentiment analysis into the model, perhaps that will improve the accuracy.