In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Concatenate, Input, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence

import tensorflow as tf
import tensorflow_hub as hub
import logging

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

logging.basicConfig(level=logging.INFO)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Load Data**

In [2]:
# Mount Google Drive at /content/drive so that files stored there (e.g. the
# pickled baseline scores read near the end of this notebook) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the cleaned tweet dataset; the first spreadsheet column becomes the index.
full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [6]:
# Names of the six label columns of full_clean_df; these are selected as the
# prediction targets when the data is split.
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

In [8]:
# Preview the first five rows of the loaded data.
full_clean_df.head(5)

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage


In [7]:
# Length of the longest tweet, counted in space-separated tokens
max(len(tweet.split(' ')) for tweet in full_clean_df['tweets_train'])

24

## Sentiments

In [13]:
# NLTK sentiment analyzer; the 'vader_lexicon' resource it relies on is
# downloaded in the import cell.
sia = SentimentIntensityAnalyzer()

In [14]:
# Score every cleaned tweet and keep only the 'compound' value as a new
# 'sentiment' feature column.
full_clean_df['sentiment'] = full_clean_df['tweets_train'].map(
    lambda tweet: sia.polarity_scores(tweet).get('compound')
)

In [15]:
# Check that the new 'sentiment' column was added.
full_clean_df.head()

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train,sentiment
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit,-0.5574
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today,0.0
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger,-0.6486
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta,-0.8074
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,-0.7906


**Split Data**

In [16]:
# Feature columns fed to the model: the cleaned tweet text and its sentiment score.
train_cols = ['tweets_train', 'sentiment']

In [76]:
# Hold out 33% of the rows as a test set; random_state is fixed so the split
# is reproducible. Features are train_cols, targets are the six label columns.
X_train, X_test, y_train, y_test =  train_test_split(full_clean_df[train_cols], full_clean_df[labels_name_list], test_size=0.33, random_state=12)

In [21]:
# Sanity check: (number of training rows, number of feature columns).
X_train.shape

(95995, 2)

Define a helper function to plot the training history (loss and AUC per epoch)

In [22]:
def _metric_key(history_dict, base):
    """Return the key under which metric `base` was logged in `history_dict`.

    Accepts the plain name (e.g. ``'auc'``) as well as a numerically suffixed
    variant (``'auc_1'``, ``'auc_2'``, ...). Raises KeyError if neither exists.
    """
    if base in history_dict:
        return base
    prefix = base + '_'
    for key in history_dict:
        if key.startswith(prefix) and key[len(prefix):].isdigit():
            return key
    raise KeyError(base)


def _plot_curves(epochs, val_values, train_values, ylabel):
    """Draw one validation-vs-training curve figure and show it."""
    plt.figure(figsize=(10,5))
    plt.plot(epochs, val_values, 'g-', label='Validation data')
    plt.plot(epochs, train_values, 'r--', label='Training data')
    plt.grid(True)
    plt.xlabel('Number of epochs')
    plt.ylabel(ylabel)
    # Place the legend outside the axes, to the right of the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()


def plot_history(history):
    """Plot loss and AUC for training and validation data, one figure each.

    Parameters
    ----------
    history : object with ``epoch`` (x-axis values) and ``history`` (dict of
        per-epoch metric lists), such as the object returned by ``model.fit``.
        The dict must contain 'loss', 'val_loss' and an AUC metric with its
        'val_'-prefixed counterpart.

    Raises
    ------
    KeyError
        If a required metric is missing from ``history.history``.

    Notes
    -----
    The AUC metric is looked up as 'auc' or as 'auc_<n>': some Keras versions
    append a numeric suffix to metric names when several models are built in
    the same session, which would break a hard-coded 'auc' lookup.
    """
    logs = history.history

    _plot_curves(history.epoch, logs['val_loss'], logs['loss'],
                 'Loss on training/validation data')

    auc_key = _metric_key(logs, 'auc')
    _plot_curves(history.epoch, logs['val_' + auc_key], logs[auc_key],
                 'AUC on training/validation data')

## Pre-Process Data

Tokenize and pad the tweet text only (the sentiment score is passed to the model as a separate input)

In [71]:
# Cleaned tweet text for the train and test partitions.
# No separate validation frame is created in this notebook (train_test_split
# only yields train/test); validation folds are taken from the training data
# by KFold in the cross-validation section, so nothing is derived from X_val.
X_train_tweets = X_train['tweets_train']
X_test_tweets = X_test['tweets_train']

In [77]:
# Sentiment scores as NumPy arrays so they can be indexed with the integer
# index arrays produced by KFold.
# No separate validation frame is created in this notebook; validation folds
# are sliced out of train_sa during cross-validation, so nothing is derived
# from X_val.
train_sa = np.array(X_train['sentiment'])
test_sa = np.array(X_test['sentiment'])

In [78]:
# Training labels as a NumPy array so that folds can be selected with the
# integer index arrays produced by KFold.
y = np.array(y_train)

In [79]:
# Limits used throughout the text pipeline
max_features = 10000  # num_words cap handed to the tokenizer
maxlen = 25           # every encoded tweet is padded/truncated to this length

# Tokenize text (represent each word by a number); fitted on training tweets only
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_tweets)

# Words -> integer sequences
train_encoded = tokenizer.texts_to_sequences(X_train_tweets)
test_encoded = tokenizer.texts_to_sequences(X_test_tweets)

# Integer sequences -> fixed-length arrays
train_padded = sequence.pad_sequences(train_encoded, maxlen=maxlen)
test_padded = sequence.pad_sequences(test_encoded, maxlen=maxlen)

## Load GloVe

In [27]:
# Download and unpack the GloVe Twitter vectors. The flags make the cell safe
# to re-run: -nc skips the download when the archive is already present,
# -y answers the apt confirmation prompt, -n never overwrites extracted files.
!wget -nc http://nlp.stanford.edu/data/glove.twitter.27B.zip
!sudo apt install -y unzip
!unzip -n glove.twitter.27B.zip

--2021-04-22 05:24:28--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2021-04-22 05:24:28--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2021-04-22 05:24:28--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [appli

In [28]:
# Load the embedding file
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    `word` is returned unchanged; the remaining positional arguments are the
    vector components (e.g. the string fields of one embedding-file line) and
    are converted to a float32 NumPy array.

    Returns a ``(word, vector)`` tuple.
    """
    vector = np.array(arr, dtype='float32')
    return (word, vector)

EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
# Map each word to its word vector. Each line is "<word> <v1> <v2> ...";
# the with-block guarantees the file handle is closed once the dict is built.
with open(EMBEDDING_FILE, 'r', encoding='UTF-8') as embedding_fh:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_fh)

In [29]:
# Stack all pretrained vectors to get their global mean/std and dimensionality.
# np.stack requires a real sequence: passing dict.values() directly triggers a
# deprecation warning on older NumPy and is rejected by newer releases.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index)+1)

# Start from random rows drawn with the pretrained vectors' statistics, so
# words without a pretrained vector still get a plausible embedding.
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose tokenizer index falls outside the matrix
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


# 5-Fold Cross-Validation

In [30]:
# Training hyperparameters shared by every cross-validation fold.
batch_size = 256
epochs = 5
# Embedding dimensionality; overrides the value derived from the loaded
# vectors (presumably identical, given the 100d GloVe file - verify).
embed_size = 100
# Stop training when the validation loss stops decreasing; no patience
# argument is given, so the callback's default applies.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

### Define Model

In [80]:
def compile_model(max_features=max_features, embed_size=embed_size, embedding_matrix=embedding_matrix, maxlen=maxlen):
  """Build and compile the two-input multi-label tweet classifier.

  Inputs of the returned model:
    * 'nlp_input'  - integer token sequence of length `maxlen`
    * 'meta_input' - a single sentiment value per sample

  The token sequence goes through a frozen embedding (initialised from
  `embedding_matrix`) and a bidirectional LSTM; the result is concatenated
  with the sentiment value and passed through a 64-unit ReLU layer to six
  sigmoid outputs.

  Note: the default argument values are captured from the module-level
  variables at the time this function is defined.

  Returns the compiled `keras.Model` (binary cross-entropy loss, Adam
  optimizer; precision, recall and AUC metrics).
  """
  # Model inputs: padded tweets and the sentiment-analysis score
  tokens_in = Input(shape=(maxlen,), name='nlp_input')
  sentiment_in = Input(shape=(1,), name='meta_input')

  # Pretrained, non-trainable embedding lookup
  embedding_layer = Embedding(
      output_dim=embed_size,
      input_dim=max_features,
      weights=[embedding_matrix],
      input_length=maxlen,
      trainable=False,
  )
  embedded = embedding_layer(tokens_in)

  # Sequence encoder
  recurrent_layer = Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
  encoded = recurrent_layer(embedded)

  # Merge text encoding with the sentiment feature, then classify
  merged = Concatenate()([encoded, sentiment_in])
  hidden = Dense(64, activation='relu')(merged)
  probabilities = Dense(6, activation='sigmoid')(hidden)

  model = keras.Model(inputs=[tokens_in, sentiment_in], outputs=probabilities)
  model.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['Precision', 'Recall', 'AUC'],
  )
  return model

In [83]:
def one_fold(X_train_padded, X_train_sa, y_train, X_val_padded, X_val_sa, y_val, batch_size, epochs, es, threshold=0.3):
  """Train a fresh model on one fold and return its macro F1 on the validation part.

  Parameters
  ----------
  X_train_padded, X_val_padded : padded token sequences for training / validation.
  X_train_sa, X_val_sa : sentiment scores aligned with the sequences.
  y_train, y_val : multi-label target arrays.
  batch_size, epochs : forwarded to ``model.fit``.
  es : callback passed to ``model.fit`` (early stopping).
  threshold : predicted probabilities greater than or equal to this value are
      turned into a positive label. Defaults to 0.3, the value previously
      hard-coded here.

  Returns
  -------
  float
      Macro-averaged F1 score of the thresholded validation predictions.
  """
  # A new model per fold so that no weights carry over between folds
  model = compile_model()

  model.fit(x=[X_train_padded, X_train_sa], y=y_train, validation_data=([X_val_padded, X_val_sa], y_val), batch_size=batch_size, epochs=epochs, callbacks=[es])

  # Probabilities -> 0/1 label matrix in one vectorized step
  y_pred = model.predict([X_val_padded,  X_val_sa])
  predictions = (np.asarray(y_pred) >= threshold).astype(int)

  return metrics.f1_score(y_val, predictions, average='macro')

In [84]:
# 5-fold cross-validation over the training partition; collects one macro-F1
# score per fold.
kf = KFold(n_splits=5)
f1_scores = []

for train_index, val_index in kf.split(train_padded):
  # Fold-local names: the loop must not rebind the global y_train (the full
  # training labels), otherwise re-running `y = np.array(y_train)` afterwards
  # would silently use a single fold's labels.
  fold_train_padded, fold_val_padded = train_padded[train_index], train_padded[val_index]
  fold_train_sa, fold_val_sa = train_sa[train_index], train_sa[val_index]
  fold_y_train, fold_y_val = y[train_index], y[val_index]

  # Use the configured `epochs` value rather than a repeated literal
  f1_scores.append(one_fold(fold_train_padded, fold_train_sa, fold_y_train, fold_val_padded, fold_val_sa, fold_y_val, batch_size=batch_size, epochs=epochs, es=es))













Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5












Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5












Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5












Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5












Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [85]:
# Macro-F1 score of each cross-validation fold.
f1_scores

[0.5477645505053437,
 0.5531480161605454,
 0.553671417445957,
 0.5541537512971975,
 0.5538317701748637]

In [86]:
# Average macro-F1 across the folds.
np.mean(f1_scores)

0.5525139011167813

In [87]:
# import baseline f1_score
# NOTE(review): despite the .txt extension the file is read as a pickle.
# pickle.load can execute arbitrary code, so only load files you created
# yourself. The absolute Drive path also ties this cell to one Colab setup.
import pickle
with open("/content/drive/My Drive/Colab Notebooks/Group Project/lstm_baseline.txt", "rb") as fp:
   lstm_baseline = pickle.load(fp)

In [88]:
# t-test
# Paired (related-samples) t-test between the baseline scores and this model's
# per-fold macro-F1 scores. Presumably lstm_baseline holds one score per fold
# from the same 5-fold split - verify, since the pairing is only meaningful then.
from scipy import stats
stats.ttest_rel(lstm_baseline, f1_scores)

Ttest_relResult(statistic=-0.7528256814627696, pvalue=0.4934360031621986)