# Imports

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
from time import sleep
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import json
from wordcloud import WordCloud
import numpy as np
import pprint as pprint
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import scipy.stats as sp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [8]:
def classify_with_new_decision_threshold(probability, new_threshold):
    '''Return True when `probability` is strictly greater than `new_threshold`, otherwise False.

    A probability exactly equal to the threshold is classified as False.'''
    exceeds_threshold = probability > new_threshold
    return True if exceeds_threshold else False
    
def custom_train_test_split(df, vectorizer, X, y, is_bigram=None, has_compound_score=None):
    '''Split the data into train and test sets movie by movie, separately for each class.

    Roughly 70% of the distinct movies of each class (in order of first appearance) go into
    the train set and the remaining movies go into the test set.

    Parameters:
        df: original dataframe. Must have a "title" column and, when sentiment scores are
            used, a "compound_score" column.
        vectorizer: fitted vectorizer that produced X; only its feature names are used.
        X: sparse matrix of feature values (anything exposing .todense()).
        y: Series of class labels; it must be named "trigger_scene".
        is_bigram: 'Y' or anything else. When None (default) the user is prompted.
            It only affects the message that is printed.
        has_compound_score: 'Y' to append df.compound_score as a feature. When None
            (default) the user is prompted.

    Returns:
        no_scene_df, has_scene_df: the rows of each class (features plus label, title and,
            if requested, sentiment score),
        X_train, X_test, y_train, y_test: train and test for X and y,
        train, test: dataframes holding the actual label, title (and score) for predictions.

    Raises:
        IndexError: if a class ends up with zero movies in the train set.

    NOTE(review): rows are joined on index labels and the test slices start at
    "last train index + 1", so this presumably expects df and y to carry an integer index
    that lines up with the row positions of X and to have each movie's rows stored
    contiguously -- confirm against the caller.'''

    if is_bigram is None:
        is_bigram = str(input('Bigram? (Y/N)'))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = list(get_names())

    # Rename the title Series *before* joining so the column is always called
    # "title_of_movie", whether or not the vocabulary contains a "title" feature.
    pre_split = (pd.DataFrame(X.todense(), columns=feature_names)
                 .join(pd.DataFrame(y))
                 .join(df['title'].rename('title_of_movie')))
    if is_bigram == 'Y':
        print('Bigram specified')
    else:
        print('Non-bigram')

    if has_compound_score is None:
        has_compound_score = str(input('Sentiment score? (Y/N)'))
    with_scores = has_compound_score == 'Y'
    if with_scores:
        print('Has sentiment score.')
        pre_split = pre_split.join(df.compound_score)

    # Trailing non-feature columns: trigger_scene, title_of_movie and, optionally,
    # compound_score. The label column is always the first of them.
    n_meta_cols = 3 if with_scores else 2

    pre_split = pre_split.fillna(0)
    no_scene_df = pre_split[pre_split.trigger_scene == False]
    has_scene_df = pre_split[pre_split.trigger_scene == True]

    n_self_harm = len(has_scene_df.title_of_movie.unique())
    n_no_self_harm = len(no_scene_df.title_of_movie.unique())
    percent_in_train = 0.7

    print(f'Number of movies with self-harm scenes: {n_self_harm}')
    print(f'Number of movies with no self-harm scenes: {n_no_self_harm}')

    print('----------------------------------------------------------')

    n_self_harm_in_train = round(n_self_harm * percent_in_train)
    n_no_self_harm_in_train = round(n_no_self_harm * percent_in_train)

    print(f'Number of self-harm movies to put into the train set: {n_self_harm_in_train}')
    print(f'Number of no self-harm movies to put into the train set: {n_no_self_harm_in_train}')

    # Per class: find the last movie that belongs in train and cut at its last row.

    last_has_scene_movie_in_train = has_scene_df.title_of_movie.unique()[:n_self_harm_in_train][-1]
    index_of_last_has_scene_movie_in_train = (
        has_scene_df[has_scene_df.title_of_movie == last_has_scene_movie_in_train].index[-1])
    has_scene_train = has_scene_df.loc[:index_of_last_has_scene_movie_in_train]
    has_scene_test = has_scene_df.loc[index_of_last_has_scene_movie_in_train + 1:]

    last_no_scene_movie_in_train = no_scene_df.title_of_movie.unique()[:n_no_self_harm_in_train][-1]
    index_of_last_no_scene_movie_in_train = (
        no_scene_df[no_scene_df.title_of_movie == last_no_scene_movie_in_train].index[-1])
    no_scene_train = no_scene_df.loc[:index_of_last_no_scene_movie_in_train]
    no_scene_test = no_scene_df.loc[index_of_last_no_scene_movie_in_train + 1:]

    # X variables (everything before the trailing metadata columns)

    X_train = pd.concat([no_scene_train.iloc[:, :-n_meta_cols], has_scene_train.iloc[:, :-n_meta_cols]])
    X_test = pd.concat([no_scene_test.iloc[:, :-n_meta_cols], has_scene_test.iloc[:, :-n_meta_cols]])

    print(f'Number of rows in train: {len(X_train)}')
    print(f'Number of rows in test: {len(X_test)}')

    if (len(X_train) + len(X_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')

    # y variable (the trigger_scene column)

    y_train = pd.concat([no_scene_train.iloc[:, -n_meta_cols], has_scene_train.iloc[:, -n_meta_cols]])
    y_test = pd.concat([no_scene_test.iloc[:, -n_meta_cols], has_scene_test.iloc[:, -n_meta_cols]])

    print(f'Number of rows in train: {len(y_train)}')
    print(f'Number of rows in test: {len(y_test)}')

    if (len(y_train) + len(y_test)) == df.shape[0]:
        print('Number of rows match up')
    else:
        print('Number of rows do not match up')

    y_train = y_train.astype('bool')
    y_test = y_test.astype('bool')

    # movie titles

    titles_train = pd.concat([no_scene_train.title_of_movie, has_scene_train.title_of_movie])
    titles_test = pd.concat([no_scene_test.title_of_movie, has_scene_test.title_of_movie])

    # sentiment scores and the train / test prediction dataframes

    if with_scores:
        scores_train = pd.concat([no_scene_train.compound_score, has_scene_train.compound_score])
        scores_test = pd.concat([no_scene_test.compound_score, has_scene_test.compound_score])

        # The score is used as one more feature, appended after the vectorizer columns.
        X_train = X_train.join(scores_train)
        X_test = X_test.join(scores_test)

        train = pd.DataFrame(dict(actual=y_train, title=titles_train, score=scores_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test, score=scores_test))
    else:
        train = pd.DataFrame(dict(actual=y_train, title=titles_train))
        test = pd.DataFrame(dict(actual=y_test, title=titles_test))

    return no_scene_df, has_scene_df, X_train, X_test, y_train, y_test, train, test

def sentiment_categorizer(sentiment_score_dictionary):
    '''Map the "compound" value of a sentiment score dictionary to a label.

    Returns 'positive' for scores >= 0.05, 'negative' for scores <= -0.05 and 'neutral'
    for anything strictly in between. A score that satisfies none of the comparisons
    (e.g. NaN) yields None.'''
    score = sentiment_score_dictionary['compound']
    if score >= 0.05:
        return 'positive'
    if score <= -0.05:
        return 'negative'
    if -0.05 < score < 0.05:
        return 'neutral'
    return None

## Acquisition

In [5]:
# Load the tweet dataset from CSV, using the file's first column as the index.
df = pd.read_csv('trigger_warning_tweets.csv', index_col=0)

In [6]:
# Count the missing values in each column.
df.isna().sum()

title                      0
tweet                      1
trigger_scene              0
cleaned_text               3
stemmed_text               3
lemmatized_text            3
lemmatized_no_stopwords    7
stemmed_no_stopwords       5
dtype: int64

# Drop every row that has a missing value in any column (modifies df in place),
# then re-count the missing values per column.
# NOTE(review): dropna keeps the original index labels, so the index may now have gaps.
df.dropna(inplace=True)
df.isna().sum()

## Creating Bag of Words sparse matrix

## Combining features into a tuple