In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import Conv1D, MaxPooling1D, Conv2D
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling2D

import tensorflow as tf
import tensorflow_hub as hub
import logging

logging.basicConfig(level=logging.INFO)

**Load Data**

In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive become readable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the dataset from Excel, using the first spreadsheet column as the DataFrame index
# (presumably the already-cleaned tweets, judging by the file name — TODO confirm)
full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [None]:
# Columns of full_clean_df that are selected together as the label matrix when the data is split
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

In [None]:
# Longest tweet in the dataset, measured in space-separated tokens
max(len(tweet.split(' ')) for tweet in full_clean_df['tweets_emoji_train'])

65

**Split Data**

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    full_clean_df['tweets_emoji_train'],
    np.array(full_clean_df[labels_name_list]),
    test_size=0.33,
    random_state=12,
)

In [None]:
# Short aliases for the training portion; the test portion stays untouched until final evaluation
X, y = X_train, y_train

In [None]:
# Sanity check: number of training tweets left after the 67/33 split
X.shape

(95995,)

**Define a function to plot the training history**

In [None]:
def _resolve_history_key(history, base):
    """Return the key under which metric ``base`` is stored in ``history.history``.

    Keras may append a numeric suffix to a metric name (``auc_1``, ``auc_2``, ...)
    when several models are built in the same session, as happens once per fold
    during cross-validation. The plain name is preferred; otherwise the first
    ``<base>_<digits>`` key is used.

    Raises:
        KeyError: if neither the plain nor a suffixed key is present.
    """
    if base in history.history:
        return base
    prefix = base + '_'
    candidates = sorted(
        key for key in history.history
        if key.startswith(prefix) and key[len(prefix):].isdigit()
    )
    if not candidates:
        raise KeyError(base)
    return candidates[0]


def _plot_train_val_curves(history, metric_key, ylabel):
    """Draw one figure with the validation and training curve of ``metric_key``."""
    plt.figure(figsize=(10, 5))
    plt.plot(history.epoch, history.history['val_' + metric_key], 'g-', label='Validation data')
    plt.plot(history.epoch, history.history[metric_key], 'r--', label='Training data')
    plt.grid(True)
    plt.xlabel('Number of epochs')
    plt.ylabel(ylabel)
    # Place the legend outside the axes so it never hides the curves
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()


def plot_history(history):
    """Plot loss and AUC over the epochs, one figure each, for training and validation data.

    Args:
        history: object exposing ``epoch`` (x values) and ``history`` (dict of
            per-epoch metric lists), e.g. the return value of ``model.fit``.
            It must contain ``loss`` / ``val_loss`` and an AUC metric
            (``auc`` or a Keras-suffixed variant such as ``auc_1``).

    Raises:
        KeyError: if one of the required metrics was not recorded.
    """
    _plot_train_val_curves(history, 'loss', 'Loss on training/validation data')
    _plot_train_val_curves(history, _resolve_history_key(history, 'auc'),
                           'AUC on training/validation data')

In [None]:
# Vocabulary cap for the tokenizer, and the fixed length every sequence is padded/truncated to
max_features = 10000
maxlen = 65

# Learn the word -> integer mapping from the training tweets
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

# Encode the training tweets as integer sequences and bring each to exactly `maxlen` entries
train_encoded = tokenizer.texts_to_sequences(X)
train_padded = sequence.pad_sequences(train_encoded, maxlen=maxlen)

# Encode the held-out tweets with the same fitted tokenizer
test_encoded = tokenizer.texts_to_sequences(X_test)
test_padded = sequence.pad_sequences(test_encoded, maxlen=maxlen)

In [None]:
# Install gloVe twitter

!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!sudo apt install unzip
!unzip glove.twitter.27B.zip

--2021-04-21 13:45:58--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2021-04-21 13:45:58--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2021-04-21 13:45:59--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [appli

In [None]:
# Load the embedding file
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, i.e. the vector components (numbers or numeric strings).

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
# Map each word to its word vector. Every line of the file is "<word> <v1> <v2> ...";
# the context manager makes sure the (large) file is closed once it has been read.
with open(EMBEDDING_FILE, 'r', encoding='UTF-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

In [None]:
# Stack all pretrained vectors into one (n_words, dim) array.
# np.stack needs a real sequence: a dict view is deprecated input and is rejected by newer NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding index), so a vocabulary of V words
# needs V + 1 rows; the matrix is capped at max_features rows either way.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random row drawn with the same mean/std
# as the GloVe vectors (swap in fixed statistics if computing them is too slow).
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Indices beyond the matrix belong to words outside the max_features vocabulary cap
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


# 5-Fold Cross-Validation



**Function to compile Model**

In [None]:
# Training hyperparameters
batch_size = 256
epochs = 5
# Embedding dimensionality; matches the 100d GloVe file loaded earlier
embed_size = 100
# Early-stopping callback watching validation loss ('min': lower is better); verbose=1 logs when it triggers
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [None]:
def compile_model(max_features=max_features, embed_size=100, embedding_matrix=embedding_matrix, maxlen=maxlen):
  """Build and compile the stacked-LSTM multi-label classifier.

  Args:
    max_features: vocabulary size, used as the embedding layer's input dimension.
    embed_size: dimensionality of each embedding vector.
    embedding_matrix: pretrained weights loaded into the embedding layer.
    maxlen: length of the padded input sequences.

  Returns:
    A compiled Keras ``Sequential`` model with six sigmoid outputs. Its summary
    is printed as a side effect.
  """
  # Non-trainable embedding layer initialised from the pretrained vectors
  frozen_embedding = Embedding(
      max_features,
      output_dim=embed_size,
      weights=[embedding_matrix],
      input_length=maxlen,
      trainable=False,
  )
  layers = [
      frozen_embedding,
      # Two stacked LSTMs; the first returns the full sequence so the second can consume it
      LSTM(128, return_sequences=True),
      Dropout(0.15),
      LSTM(64),
      Dropout(0.15),
      # One independent sigmoid per label (multi-label output)
      Dense(6, activation='sigmoid'),
  ]
  model = Sequential(layers)
  model.summary()
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision', 'Recall', 'AUC'])

  return model

In [None]:
def one_fold(X_train, y_train, X_val, y_val, batch_size, epochs, es, threshold=0.3):
  """Train a fresh model on one fold and return its macro-F1 on the validation part.

  Args:
    X_train, y_train: padded sequences and label matrix used for fitting.
    X_val, y_val: padded sequences and label matrix used for validation and scoring.
    batch_size: mini-batch size passed to ``model.fit``.
    epochs: maximum number of training epochs.
    es: Keras callback (early stopping) passed to ``model.fit``.
    threshold: probability at or above which a label counts as predicted.
      Defaults to 0.3, the value previously hard-coded here.

  Returns:
    Macro-averaged F1 score over the labels on the validation data.
  """
  model = compile_model()

  model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_val, y_val), epochs=epochs, callbacks=[es])
  y_pred = model.predict(X_val)
  # Binarise every label independently (multi-label): 1 where the probability reaches the threshold
  predictions = (np.asarray(y_pred) >= threshold).astype(int)

  return metrics.f1_score(y_val, predictions, average='macro')

In [None]:
# 5-fold cross-validation over the training split (no shuffling, so the folds are deterministic)
kf = KFold(n_splits=5)
f1_scores = []

for train_index, val_index in kf.split(train_padded):
  # Fold-specific names: reusing X_train / y_train here would overwrite the
  # train/test split created earlier and corrupt it for any cell that is re-run.
  X_fold_train, X_val = train_padded[train_index], train_padded[val_index]
  y_fold_train, y_val = y[train_index], y[val_index]
  # Use the hyperparameters from the config cell rather than repeating their literal values
  f1_scores.append(one_fold(X_fold_train, y_fold_train, X_val, y_val, batch_size=batch_size, epochs=epochs, es=es))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 65, 100)           1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 65, 128)           117248    
_________________________________________________________________
dropout (Dropout)            (None, 65, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
Total params: 1,167,046
Trainable params: 167,046
Non-trainable params: 1,000,000
________________________________________

In [None]:
# Macro-F1 of each cross-validation fold
f1_scores

[0.5431816667658715,
 0.5494429994525477,
 0.5395992321454626,
 0.5496570832621102,
 0.5513926961957929]

In [None]:
# Mean macro-F1 across the folds
np.mean(f1_scores)

0.546654735564357

In [None]:
# Load the baseline's F1 scores, previously saved with pickle, for comparison
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
# NOTE(review): absolute Colab Drive path; needs the Drive mount and this exact folder layout.
import pickle
with open("/content/drive/My Drive/BT4222/FINAL_CODES/lstm_baseline.txt", "rb") as fp:
   lstm_baseline = pickle.load(fp)

In [None]:
# Baseline scores as loaded from the pickle file
lstm_baseline

[0.5503131007793429,
 0.5536645807247731,
 0.543105274882358,
 0.5556589535657163,
 0.5508641887640409]

In [None]:
# Mean of the baseline scores
np.average(lstm_baseline)

0.5507212197432463

In [None]:
# Paired (related-samples) t-test between the baseline scores and this model's per-fold scores
# NOTE(review): pairing presumes both lists were scored on the same folds in the same order —
# confirm how lstm_baseline was produced.
from scipy import stats
stats.ttest_rel(lstm_baseline, f1_scores)

Ttest_relResult(statistic=3.0920600165451826, pvalue=0.0365012356527156)