# Imports

In [8]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
def classify_with_new_decision_threshold(probability, new_threshold):
    if probability > new_threshold:
        return True
    else:
        return False
    
def custom_train_test_split(df, vectorizer, X, y):
    '''Takes in original dataframe, vectorizer, sparse matrix of X values, and y values in a Series. 
        Returns two dataframes for each class (has_scene, no_scene), 
        train and test for X and y (X_train, X_test, y_train, y_test), 
        and the train and test dataframes for predictions (train, test).'''
    pre_split = pd.DataFrame(X.todense(), 
                             columns=bag_of_words.get_feature_names()).join(pd.DataFrame(y)).join(df.title, 
                                                                                                  rsuffix='_of_movie')
    pre_split = pre_split.fillna(0)
    no_scene_df = pre_split[pre_split.trigger_scene == False]
    has_scene_df = pre_split[pre_split.trigger_scene == True]
    
    n_self_harm = len(has_scene_df.title_of_movie.unique())
    n_no_self_harm = len(no_scene_df.title_of_movie.unique())
    percent_in_train = 0.7

    print(f'Number of movies with self-harm scenes: {n_self_harm}')
    print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

    print('----------------------------------------------------------')

    n_self_harm_in_train = round(n_self_harm * percent_in_train)
    n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

    print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
    print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')
    
    # X variables
    
    last_has_scene_movie_in_train = has_scene_df.title_of_movie.unique()[:n_self_harm_in_train][-1]
    index_of_last_has_scene_movie_in_train = (has_scene_df[has_scene_df.title_of_movie == last_has_scene_movie_in_train]
                                              .index[-1])

    has_scene_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,:-2]
    has_scene_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,:-2]

    last_no_scene_movie_in_train = no_scene_df.title_of_movie.unique()[:n_no_self_harm_in_train][-1]
    index_of_last_no_scene_movie_in_train = (no_scene_df[no_scene_df.title_of_movie == last_no_scene_movie_in_train]
                                              .index[-1])

    no_scene_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,:-2]
    no_scene_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,:-2]
    
    X_train = pd.concat([no_scene_rows_to_include_in_train, has_scene_rows_to_include_in_train])
    X_test = pd.concat([no_scene_rows_to_include_in_test, has_scene_rows_to_include_in_test])

    print(f'Number of rows in train: {len(X_train)}')
    print(f'Number of rows in test: {len(X_test)}')
    
    if (len(X_train) + len(X_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
    
    # y variable
    
    has_scene_class_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,-2]
    has_scene_class_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,-2]

    no_scene_class_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,-2]
    no_scene_class_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,-2]
    
    y_train = pd.concat([no_scene_class_rows_to_include_in_train, has_scene_class_rows_to_include_in_train])
    y_test = pd.concat([no_scene_class_rows_to_include_in_test, has_scene_class_rows_to_include_in_test])

    print(f'Number of rows in train: {len(y_train)}')
    print(f'Number of rows in test: {len(y_test)}')
    
    y_train = y_train.astype('bool')
    y_test = y_test.astype('bool')
    
    if (len(y_train) + len(y_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')
        
    # train and test prediction dataframes
        
    train = pd.DataFrame(dict(actual=y_train))
    test = pd.DataFrame(dict(actual=y_test))
    
    return no_scene_df, has_scene_df, X_train, X_test, y_train, y_test, train, test

In [5]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)
df.head()

Unnamed: 0,title,tweet,trigger_scene,cleaned_text,stemmed_text,lemmatized_text,lemmatized_no_stopwords,stemmed_no_stopwords
0,spiderman_far_from_home,spiderman far from home had a joke where peter...,False,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,spiderman far from home had a joke where peter...,joke peter mistake acdc led zeppelin triggered...,joke peter mistak acdc led zeppelin ptsd becau...
1,spiderman_far_from_home,Trigger warning for all photographers before s...,False,trigger warning for all photographers before s...,trigger warn for all photograph befor see spid...,trigger warning for all photographer before se...,warning photographer seeing,warn photograph befor see
2,spiderman_far_from_home,so i just finished watching spiderman far from...,False,so i just finished watching spiderman far from...,so i just finish watch spiderman far from home...,so i just finished watching spiderman far from...,finished loved im car hearing fever got confus...,finish watch im car hear fever got confus bc t...
3,spiderman_far_from_home,Spiderman: Far From Home was a gaslighting tri...,False,spiderman far from home was a gaslighting trig...,spiderman far from home wa a gaslight trigger ...,spiderman far from home wa a gaslighting trigg...,wa gaslighting half aint nobody warned,wa gaslight half aint nobodi warn
4,spiderman_far_from_home,it trigger me every time there's a spiderman f...,False,it trigger me every time theres a spiderman fa...,it trigger me everi time there a spiderman far...,it trigger me every time there a spiderman far...,every trailer tv start nowhere playing tom hol...,everi trailer tv start nowher tom holland cri ...


In [6]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

In [7]:
df.dropna(inplace=True)
df.isna().sum()

title                      0
tweet                      0
trigger_scene              0
cleaned_text               0
stemmed_text               0
lemmatized_text            0
lemmatized_no_stopwords    0
stemmed_no_stopwords       0
dtype: int64

### Create Bag of Words

In [9]:
bag_of_words = CountVectorizer()
X = bag_of_words.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [48]:
has_scene_df, no_scene_df, X_train, X_test, y_train, y_test, train, test = custom_train_test_split(df, bag_of_words, X, y)

Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up
Number of rows in train: 2033
Number of rows in test: 850
Number of rows match up


### Modeling

In [49]:
lr = LogisticRegression(random_state=123)
parameters = {'C':sp.reciprocal(0.0001, 10000),
              'solver':['newton-cg', 'lbfgs', 'saga', 'liblinear']}

lr_rs = RandomizedSearchCV(estimator=lr, param_distributions=parameters, n_jobs=4)
lr_rs.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=4,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001AD19DF87F0>, 'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [51]:
print(lr_rs.best_params_)
print(lr_rs.best_score_)

{'C': 0.010532190137107452, 'solver': 'newton-cg'}
0.5656665027053616


In [52]:
dt = DecisionTreeClassifier(random_state=123)
parameters = {'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 25, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2,50, 2)],
              'min_samples_leaf':[1, 2, 3, 4, 5]}

dt_rs = RandomizedSearchCV(estimator=dt, param_distributions=parameters, n_jobs=4, n_iter=25)
dt_rs.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best'),
          fit_params=None, iid='warn', n_iter=25, n_jobs=4,
          param_distributions={'criterion': ['gini', 'entropy'], 'max_depth': [3, 14, 25], 'min_samples_split': [2, 50], 'min_samples_leaf': [1, 2, 3, 4, 5]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [53]:
print(dt_rs.best_params_)
print(dt_rs.best_score_)

{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 25, 'criterion': 'entropy'}
0.5764879488440728


In [None]:
rf = RandomForestClassifier(random_state=123)
parameters = {'n_estimators':[int(x) for x in np.linspace(5, 50, 5)],
              'criterion':['gini', 'entropy'],
              'max_depth':[int(x) for x in np.linspace(3, 30, 3)],
              'min_samples_split':[int(x) for x in np.linspace(2, 20, 2)],
              'min_samples_leaf':[int(x) for x in np.linspace(1, 3, 1)]}

rf_rs = RandomizedSearchCV(estimator=rf, param_distributions=parameters, n_jobs=4, n_iter=50)
rf_rs.fit(X_train, y_train)



In [None]:
print(rf_rs.best_params_)
print(rf_rs.best_score_)