# Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [18]:
def classify_with_new_decision_threshold(probability, new_threshold):
    if probability > new_threshold:
        return True
    else:
        return False
    
def custom_train_test_split(df, vectorizer, X, y):
    '''Takes in original dataframe, vectorizer, sparse matrix of X values, and y values in a Series. Asks if vectorizer 
        is bigram or not and if there are sentiment scores. Sentiment scores should be stored in the original dataframe as
        "compound score".
        Returns two dataframes for each class (has_scene, no_scene), 
        train and test for X and y (X_train, X_test, y_train, y_test), 
        and the train and test dataframes for predictions (train, test).'''
    
    is_bigram = str(input('Bigram? (Y/N)'))
    if is_bigram == 'Y':
        pre_split = pd.DataFrame(X.todense(), 
                                 columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title)
        pre_split.rename(columns={'title':'title_of_movie'}, inplace=True)
        print('Bigram specified')
    else:
        print('Non-bigram')
        pre_split = pd.DataFrame(X.todense(), 
                             columns=vectorizer.get_feature_names()).join(pd.DataFrame(y)).join(df.title, 
                                                                                                  rsuffix='_of_movie')
        print('Non-bigram')
    
    has_compound_score = str(input('Sentiment score? (Y/N)'))
    if has_compound_score == 'Y':
        print('Has sentiment score.')
        pre_split = pre_split.join(df.compound_score)
        
    pre_split = pre_split.fillna(0)
    no_scene_df = pre_split[pre_split.trigger_scene == False]
    has_scene_df = pre_split[pre_split.trigger_scene == True]
    
    n_self_harm = len(has_scene_df.title_of_movie.unique())
    n_no_self_harm = len(no_scene_df.title_of_movie.unique())
    percent_in_train = 0.7

    print(f'Number of movies with self-harm scenes: {n_self_harm}')
    print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

    print('----------------------------------------------------------')

    n_self_harm_in_train = round(n_self_harm * percent_in_train)
    n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

    print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
    print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')
    
    # X variables
    
    last_has_scene_movie_in_train = has_scene_df.title_of_movie.unique()[:n_self_harm_in_train][-1]
    index_of_last_has_scene_movie_in_train = (has_scene_df[has_scene_df.title_of_movie == last_has_scene_movie_in_train]
                                              .index[-1])

    has_scene_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,:-3]
    has_scene_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,:-3]

    last_no_scene_movie_in_train = no_scene_df.title_of_movie.unique()[:n_no_self_harm_in_train][-1]
    index_of_last_no_scene_movie_in_train = (no_scene_df[no_scene_df.title_of_movie == last_no_scene_movie_in_train]
                                              .index[-1])

    no_scene_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,:-3]
    no_scene_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,:-3]
    
    X_train = pd.concat([no_scene_rows_to_include_in_train, has_scene_rows_to_include_in_train])
    X_test = pd.concat([no_scene_rows_to_include_in_test, has_scene_rows_to_include_in_test])

    print(f'Number of rows in train: {len(X_train)}')
    print(f'Number of rows in test: {len(X_test)}')
    
    if (len(X_train) + len(X_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
    
    # y variable
    
    has_scene_class_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,-3]
    has_scene_class_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,-3]

    no_scene_class_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,-3]
    no_scene_class_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,-3]
    
    y_train = pd.concat([no_scene_class_rows_to_include_in_train, has_scene_class_rows_to_include_in_train])
    y_test = pd.concat([no_scene_class_rows_to_include_in_test, has_scene_class_rows_to_include_in_test])
    
    print(f'Number of rows in train: {len(y_train)}')
    print(f'Number of rows in test: {len(y_test)}')
    
    
    if (len(y_train) + len(y_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
        
    y_train = y_train.astype('bool')
    y_test = y_test.astype('bool')
    
    # movie titles
    
    has_scene_titles_in_train = has_scene_df.title_of_movie.loc[:index_of_last_has_scene_movie_in_train]
    has_scene_titles_in_test = has_scene_df.title_of_movie.loc[index_of_last_has_scene_movie_in_train + 1:]
    
    no_scene_titles_in_train = no_scene_df.title_of_movie.loc[:index_of_last_no_scene_movie_in_train]
    no_scene_titles_in_test = no_scene_df.title_of_movie.loc[index_of_last_no_scene_movie_in_train + 1:]
    
    titles_train = pd.concat([no_scene_titles_in_train, has_scene_titles_in_train])
    titles_test = pd.concat([no_scene_titles_in_test, has_scene_titles_in_test])
        
    # sentiment scores
    
    if has_compound_score == 'Y':
        has_scene_scores_in_train = has_scene_df.compound_score.loc[:index_of_last_has_scene_movie_in_train]
        has_scene_scores_in_test = has_scene_df.compound_score.loc[index_of_last_has_scene_movie_in_train + 1:]

        no_scene_scores_in_train = no_scene_df.compound_score.loc[:index_of_last_no_scene_movie_in_train]
        no_scene_scores_in_test = no_scene_df.compound_score.loc[index_of_last_no_scene_movie_in_train + 1:]

        scores_train = pd.concat([no_scene_scores_in_train, has_scene_scores_in_train])
        scores_test = pd.concat([no_scene_scores_in_test, has_scene_scores_in_test])
        
        X_train = X_train.join(scores_train)
        X_test = X_test.join(scores_test)
        
    # train and test prediction dataframes
    
    if has_compound_score == 'Y':
        train = pd.DataFrame(dict(actual=y_train, title=titles_train, score=scores_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test, score=scores_test))
    else:
        train = pd.DataFrame(dict(actual=y_train, title=titles_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test))
    
    return no_scene_df, has_scene_df, X_train, X_test, y_train, y_test, train, test

def sentiment_categorizer(sentiment_score_dictionary):
    compound_score = sentiment_score_dictionary['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score > -0.05 and compound_score < 0.05:
        return 'neutral'
    elif compound_score <= -0.05:
        return 'negative'

In [3]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...


In [4]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isna().sum()

title                      0
tweet                      0
trigger_scene              0
cleaned_text               0
stemmed_text               0
lemmatized_text            0
lemmatized_no_stopwords    0
stemmed_no_stopwords       0
dtype: int64

## Get Sentiment Score Using VADER

In [6]:
analyzer = SentimentIntensityAnalyzer()

In [8]:
df['sentiment_scores'] = df.lemmatized_text.apply(analyzer.polarity_scores)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords,sentiment_scores
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...,"{'neg': 0.173, 'neu': 0.781, 'pos': 0.046, 'co..."
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see,"{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'comp..."
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...,"{'neg': 0.062, 'neu': 0.886, 'pos': 0.052, 'co..."
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp..."
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...,"{'neg': 0.19, 'neu': 0.81, 'pos': 0.0, 'compou..."


In [10]:
df['compound_score'] = df.sentiment_scores.apply(sentiment_categorizer)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords,sentiment_scores,compound_score
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...,"{'neg': 0.173, 'neu': 0.781, 'pos': 0.046, 'co...",negative
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see,"{'neg': 0.194, 'neu': 0.806, 'pos': 0.0, 'comp...",negative
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...,"{'neg': 0.062, 'neu': 0.886, 'pos': 0.052, 'co...",negative
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...",positive
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...,"{'neg': 0.19, 'neu': 0.81, 'pos': 0.0, 'compou...",negative


## Create Bag of Words Sparse Matrix

In [11]:
bag_of_bigrams = CountVectorizer(ngram_range=(2,2))
X = bag_of_bigrams.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [19]:
has_scene_df, no_scene_df, X_train, X_test, y_train, y_test, train, test = custom_train_test_split(df, bag_of_bigrams, X, y)

Bigram? (Y/N)Y
Bigram specified
Sentiment score? (Y/N)Y
Has sentiment score.
Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up


In [20]:
X_test

Unnamed: 0,007 novel,00s version,010 recommend,02 second,03 2018,03 snes,0539 debbie,06 tony,08 back,0806 email,...,zone right,zone start,zooey deschanel,zoolander acenterforants,zootopia going,zorx gamepad,zu belegen,zu perk,zune glengarry,compound_score
900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
901,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
902,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative
903,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative
905,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neutral
906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
907,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative
908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neutral
