# Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp

# Read in the data

In [None]:
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)
df.head()

### There are nulls
Drop rows with null values

In [3]:
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

In [6]:
df.dropna(inplace=True)
df.isna().sum()

title                      0
tweet                      0
trigger_scene              0
cleaned_text               0
stemmed_text               0
lemmatized_text            0
lemmatized_no_stopwords    0
stemmed_no_stopwords       0
dtype: int64

### Creating the matrix of tfidf values for bigrams

In [None]:
tfidf = TfidfVectorizer(ngram_range=(2,2))
X = tfidf.fit_transform(df.lemmatized_no_stopwords)
y = df.trigger_scene

In [10]:
pd.DataFrame(X.todense(), columns=tfidf.get_feature_names()).head(10)

Unnamed: 0,007 novel,00s version,010 recommend,02 second,03 2018,03 snes,0539 debbie,06 tony,08 back,0806 email,...,zone pretty,zone right,zone start,zooey deschanel,zoolander acenterforants,zootopia going,zorx gamepad,zu belegen,zu perk,zune glengarry
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Splitting into train-test set
Making sure that tweets for the same movie are in the same data set. This is important so we can tally up the classifications for a final prediction later.

In [35]:
pre_split = pd.DataFrame(X.todense(), columns=tfidf.get_feature_names()).join(pd.DataFrame(y)).join(df.title)
pre_split = pre_split.fillna(0)

no_scene_df = pre_split[pre_split.trigger_scene == False]

has_scene_df = pre_split[pre_split.trigger_scene == True]

2883
2883


In [36]:
n_self_harm = len(has_scene_df.title.unique())
n_no_self_harm = len(no_scene_df.title.unique())
percent_in_train = 0.7

print(f'Number of movies with self-harm scenes: {n_self_harm}')
print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

print('----------------------------------------------------------')

n_self_harm_in_train = round(n_self_harm * percent_in_train)
n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')

Number of movies with self-harm scenes: 129
Number of movies with no self-harm scenes: 83
----------------------------------------------------------
Number of self-harm movies to put into the train set: 90
Number of no self-harm movies to put into the train set: 58


##### Train and test X sets

In [37]:
last_has_scene_movie_in_train = has_scene_df.title.unique()[:n_self_harm_in_train][-1]
index_of_last_has_scene_movie_in_train = (has_scene_df[has_scene_df.title == last_has_scene_movie_in_train]
                                          .index[-1])

has_scene_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,:-2]
has_scene_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,:-2]

last_no_scene_movie_in_train = no_scene_df.title.unique()[:n_no_self_harm_in_train][-1]
index_of_last_no_scene_movie_in_train = (no_scene_df[no_scene_df.title == last_no_scene_movie_in_train]
                                          .index[-1])

no_scene_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,:-2]
no_scene_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,:-2]

In [38]:
X_train = pd.concat([no_scene_rows_to_include_in_train, has_scene_rows_to_include_in_train])
X_test = pd.concat([no_scene_rows_to_include_in_test, has_scene_rows_to_include_in_test])

print(f'Number of rows in train: {len(X_train)}')
print(f'Number of rows in test: {len(X_test)}')

Number of rows in train: 2033
Number of rows in test: 850


In [39]:
if (len(X_train) + len(X_test)) == df.shape[0]:
    print('Number of rows match up')
else:
    print('Number of rows do not match up')

Number of rows match up


##### Train and test y sets

In [40]:
has_scene_class_rows_to_include_in_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train].iloc[:,-2]
has_scene_class_rows_to_include_in_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:].iloc[:,-2]

no_scene_class_rows_to_include_in_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train].iloc[:,-2]
no_scene_class_rows_to_include_in_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:].iloc[:,-2]

In [41]:
y_train = pd.concat([no_scene_class_rows_to_include_in_train, has_scene_class_rows_to_include_in_train])
y_test = pd.concat([no_scene_class_rows_to_include_in_test, has_scene_class_rows_to_include_in_test])

print(f'Number of rows in train: {len(y_train)}')
print(f'Number of rows in test: {len(y_test)}')

Number of rows in train: 2033
Number of rows in test: 850


In [42]:
if (len(y_train) + len(y_test)) == df.shape[0]:
    print('Number of rows match up')
else:
    print('Number of rows do not match up')

Number of rows match up


In [43]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

# Modeling