In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import Conv1D, MaxPooling1D, Conv2D
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling2D

import tensorflow as tf
import tensorflow_hub as hub
import logging

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from collections import Counter
from imblearn.over_sampling import SMOTE

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)



**Load Data**

In [None]:
# Mount Google Drive at /content/drive so the "/content/drive/My Drive/..."
# paths used by later cells resolve.
# NOTE(review): `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset; the first spreadsheet column is used as the index.
# Later cells read the 'tweets_train' text column and the six label columns from it.
# NOTE(review): this is a relative path while every other artefact lives under
# /content/drive — confirm the working directory this notebook is run from.
full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [None]:
# Label columns, most-significant bit first. The order matters: it matches the
# 32/16/8/4/2/1 weights of the 'powerset' encoding and the 6-bit decoding cells.
labels_name_list = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

In [None]:
# Longest tweet in the corpus, counted in space-separated tokens
max(len(tweet.split(' ')) for tweet in full_clean_df['tweets_train'])

24

**Split Data**

In [None]:
# Collapse the six 0/1 label columns into one integer class id ("label powerset"):
# the first label is the most-significant bit (weight 32), the last has weight 1.
# Column arithmetic avoids a Python-level call per row, and deriving the weights
# from labels_name_list keeps this encoder in step with the bit-decoding cells.
bit_weights = [2 ** shift for shift in range(len(labels_name_list) - 1, -1, -1)]
full_clean_df['powerset'] = sum(
    weight * full_clean_df[label]
    for weight, label in zip(bit_weights, labels_name_list)
)

In [None]:
# Hold out 33% of the rows as the test set; (X, y) is the remainder that is split again below
X, X_test, y, y_test = train_test_split(
    full_clean_df['tweets_train'],
    full_clean_df['powerset'],
    test_size=0.33,
    random_state=12,
)

In [None]:
# Carve the validation set (33% of the non-test rows) out of the remainder
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=12,
)

In [None]:
# Sanity check: number of training tweets left after the two splits
X_train.shape

(64316,)

In [None]:
# Index labels of training rows whose powerset class has at most 5 samples
class_sizes = Counter(y_train)
rare_classes = [label for label, size in class_sizes.items() if size <= 5]
rare_mask = y_train.isin(rare_classes)
to_drop = y_train.index[rare_mask].tolist()
len(to_drop)

13

In [None]:
# Training-set size before the rare-class rows are removed
len(y_train)

64316

In [None]:
# Remove the rare-class rows from features and labels in lockstep, then show the new size
X_train, y_train = X_train.drop(index=to_drop), y_train.drop(index=to_drop)
len(y_train)

64303

**Define function to plot history**

In [None]:
def _plot_train_val_curve(history, key, ylabel):
    """Draw one 10x5 figure with the validation and training series stored under `key`."""
    plt.figure(figsize=(10, 5))
    plt.plot(history.epoch, history.history['val_' + key], 'g-', label='Validation data')
    plt.plot(history.epoch, history.history[key], 'r--', label='Training data')
    plt.grid(True)
    plt.xlabel('Number of epochs')
    plt.ylabel(ylabel)
    # Place the legend outside the axes, to the right of the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()


def plot_history(history, metric='auc'):
    """Plot training vs. validation curves for the loss and for one extra metric.

    Two separate figures are shown: the loss first, then `metric`.

    Args:
        history: Object exposing `epoch` (x-axis values) and `history` (dict
            mapping a metric name to its per-epoch values, with validation
            series stored under a 'val_' prefix), e.g. the return value of
            Keras `model.fit`.
        metric: Key of the second metric to plot. Defaults to 'auc'. Pass the
            actual key (e.g. 'auc_1') when the framework has suffixed the name.

    Raises:
        KeyError: If the loss or `metric` series (training or validation) is
            missing. Raised before any figure is created.
    """
    required = ['loss', 'val_loss', metric, 'val_' + metric]
    missing = [key for key in required if key not in history.history]
    if missing:
        raise KeyError(
            "history is missing %s; available keys: %s"
            % (missing, sorted(history.history))
        )

    _plot_train_val_curve(history, 'loss', 'Loss on training/validation data')
    _plot_train_val_curve(history, metric, metric.upper() + ' on training/validation data')

In [None]:
# Vocabulary cap and fixed sequence length fed to the model
max_features = 10000
maxlen = 30

# Tokenize text: build the word -> integer index from the training tweets only
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Encode every split with the vocabulary learned on the training split
tokenized_train = tokenizer.texts_to_sequences(X_train)
tokenized_test = tokenizer.texts_to_sequences(X_test)
val_encoded = tokenizer.texts_to_sequences(X_val)

# Pad / truncate every tweet to exactly `maxlen` token ids.
# From here on X_train / X_test / X_val hold integer arrays instead of raw text.
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)
X_val = sequence.pad_sequences(val_encoded, maxlen=maxlen)

**SMOTE Training Set**

In [None]:
# SMOTE oversampler using 5 nearest neighbours, all CPU cores and a fixed seed.
# NOTE(review): k_neighbors=5 is presumably why classes with <= 5 training
# samples were dropped earlier — keep the two values in sync.
oversample = SMOTE(random_state=12, n_jobs=-1, k_neighbors=5)

In [None]:
# Oversample the training split only; validation and test data are not resampled.
# NOTE(review): X_train holds padded integer token ids at this point, so SMOTE
# interpolates between token ids — confirm this is intended.
%time X_train_SMOTE, y_train_SMOTE = oversample.fit_resample(X_train, y_train)



CPU times: user 12.9 s, sys: 231 ms, total: 13.2 s
Wall time: 13.1 s


In [None]:
# Class distribution after oversampling (one entry per remaining powerset class)
Counter(y_train_SMOTE)

Counter({1: 26562,
         2: 26562,
         3: 26562,
         4: 26562,
         5: 26562,
         6: 26562,
         8: 26562,
         9: 26562,
         10: 26562,
         11: 26562,
         12: 26562,
         13: 26562,
         14: 26562,
         16: 26562,
         17: 26562,
         18: 26562,
         19: 26562,
         20: 26562,
         21: 26562,
         24: 26562,
         25: 26562,
         28: 26562,
         32: 26562,
         33: 26562,
         34: 26562,
         35: 26562,
         36: 26562,
         37: 26562,
         38: 26562,
         40: 26562,
         41: 26562,
         42: 26562,
         44: 26562,
         48: 26562,
         49: 26562,
         50: 26562,
         52: 26562,
         56: 26562})

In [None]:
# Persist the model-ready feature arrays to Drive so later sections can reload them
for npy_path, feature_array in (
    ("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_train_SMOTE_LSTM.npy", X_train_SMOTE),
    ("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_test_LSTM.npy", X_test),
    ("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_val_LSTM.npy", X_val),
):
    np.save(npy_path, feature_array)

In [None]:
# Decode each powerset code back into one 0/1 column per label (first label = most
# significant bit) and persist the result.
# Vectorised bit shifts are used because the oversampled training set has roughly
# one million rows; the bit width follows labels_name_list.
train_codes = pd.DataFrame(y_train_SMOTE, columns=["powerset"])["powerset"]
train_shifts = np.arange(len(labels_name_list) - 1, -1, -1)
train_bits = (train_codes.to_numpy()[:, None] >> train_shifts) & 1
y_train_SMOTE_df = pd.DataFrame(train_bits, index=train_codes.index, columns=labels_name_list)
y_train_SMOTE_df.to_pickle("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_train_SMOTE_LSTM.pkl")

In [None]:
# Preview the decoded one-column-per-label training targets
y_train_SMOTE_df.head()

Unnamed: 0,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,1,0,0,0,0
3,1,0,0,0,0,0
4,1,1,0,0,0,0


In [None]:
# Decode the test-set powerset codes into one 0/1 column per label (first label =
# most significant bit) and persist the result. Vectorised bit shifts avoid a
# Python-level call per row; the bit width follows labels_name_list.
test_codes = pd.DataFrame(y_test, columns=["powerset"])["powerset"]
test_shifts = np.arange(len(labels_name_list) - 1, -1, -1)
test_bits = (test_codes.to_numpy()[:, None] >> test_shifts) & 1
y_test_df = pd.DataFrame(test_bits, index=test_codes.index, columns=labels_name_list)
y_test_df.to_pickle("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_test_LSTM.pkl")

In [None]:
# Decode the validation-set powerset codes into one 0/1 column per label (first
# label = most significant bit) and persist the result. Vectorised bit shifts avoid
# a Python-level call per row; the bit width follows labels_name_list.
val_codes = pd.DataFrame(y_val, columns=["powerset"])["powerset"]
val_shifts = np.arange(len(labels_name_list) - 1, -1, -1)
val_bits = (val_codes.to_numpy()[:, None] >> val_shifts) & 1
y_val_df = pd.DataFrame(val_bits, index=val_codes.index, columns=labels_name_list)
y_val_df.to_pickle("/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_val_LSTM.pkl")

**LSTM + GloVe**

In [None]:
# Reload the saved feature arrays (oversampled train, test, validation) from Drive
X_train_SMOTE, X_test, X_val = [
    np.load(npy_path)
    for npy_path in (
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_train_SMOTE_LSTM.npy",
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_test_LSTM.npy",
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/X_val_LSTM.npy",
    )
]

In [None]:
# Reload the saved label frames. After this cell y_train_SMOTE / y_test / y_val are
# the one-column-per-label frames written above, not the powerset codes.
y_train_SMOTE, y_test, y_val = [
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_train_SMOTE_LSTM.pkl",
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_test_LSTM.pkl",
        "/content/drive/My Drive/BT4222/SMOTE/LSTM_Glove/y_val_LSTM.pkl",
    )
]

In [None]:
# Download and unpack the GloVe Twitter embeddings (~1.4 GB archive).
# The flags make this cell safe to re-run:
#   wget -c      resumes or skips an existing download instead of saving a ".1" copy
#   apt-get -y   never waits for an interactive confirmation
#   unzip -n     keeps already-extracted files instead of prompting to overwrite

!wget -c http://nlp.stanford.edu/data/glove.twitter.27B.zip
!sudo apt-get install -y unzip
!unzip -n glove.twitter.27B.zip

--2021-04-21 11:53:25--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2021-04-21 11:53:25--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2021-04-21 11:53:25--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [appli

In [None]:
# Load the embedding file
def get_coefs(word, *arr):
    """Pair a token with its embedding.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields, i.e. the vector components.

    Returns:
        Tuple `(word, vector)` where `vector` is a float32 NumPy array built from `arr`.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
# Map each word to its word vector: every line is "<word> <v1> <v2> ...".
# The context manager closes the file once the index is built.
with open(EMBEDDING_FILE, 'r', encoding='UTF-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

In [None]:
# Global mean/std of the pretrained vectors, used to initialise rows for words
# that GloVe does not contain. np.stack requires a real sequence, so the dict
# view is materialised into a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras word indices start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Change the line below if computing normal stats is too slow.
# A seeded generator keeps the random rows reproducible across runs.
init_rng = np.random.RandomState(12)
embedding_matrix = init_rng.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the matrix size itself so a row outside it is never indexed
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


In [None]:
# Training hyperparameters
batch_size = 256
# Upper bound on epochs; the EarlyStopping callback used for training may stop sooner
epochs = 5
# Embedding width; matches the 100-d GloVe file loaded above.
# NOTE(review): this re-assigns the embed_size derived from the loaded vectors — keep them in sync.
embed_size = 100

In [None]:
# Define the neural network: frozen pretrained embeddings -> two stacked LSTMs -> 6 sigmoid outputs
model = Sequential([
    # Non-trainable embedding layer initialised from embedding_matrix
    Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False),
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(128, return_sequences=True),
    Dropout(0.15),
    LSTM(64),
    Dropout(0.15),
    # One independent sigmoid unit per label (multi-label output)
    Dense(6, activation='sigmoid'),
])
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision', 'Recall', 'AUC'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 30, 128)           117248    
_________________________________________________________________
dropout (Dropout)            (None, 30, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
Total params: 1,167,046
Trainable params: 167,046
Non-trainable params: 1,000,000
________________________________________

In [None]:
# Preview the training targets, excluding a 'powerset' column if one is present.
# NOTE(review): the pickled frame was saved without 'powerset', so this filter likely keeps every column.
y_train_SMOTE.loc[:, y_train_SMOTE.columns != 'powerset'].head()

Unnamed: 0,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,1,0,0,0,0
3,1,0,0,0,0,0
4,1,1,0,0,0,0


In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
%time model.fit(X_train_SMOTE, y_train_SMOTE, batch_size = batch_size , validation_data = (X_val,y_val) , epochs = epochs, callbacks=[es])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 00003: early stopping
CPU times: user 2min 54s, sys: 18.4 s, total: 3min 13s
Wall time: 2min 18s


<tensorflow.python.keras.callbacks.History at 0x7f45cb09c390>

In [None]:
# Per-label sigmoid scores for the validation set; thresholded in the cells below
y_pred = model.predict(X_val)

In [None]:
# function to find the best threshold 
def optimal_threshold(test, predictions):
    """Tabulate macro precision / recall / F1 for decision thresholds 0.1 ... 0.9.

    Args:
        test: True labels, in any form accepted by the sklearn metric functions.
        predictions: Array of predicted scores. It is copied, never modified.

    Returns:
        DataFrame with one row per threshold (index 0..8) and the columns
        'threshold', 'prediction', 'recall' and 'f1', rounded to 4 decimals.
        NOTE(review): the 'prediction' column holds the macro *precision*; the
        name is kept unchanged for backward compatibility.
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    rows = []

    for thres in thresholds:
        # Binarise a copy: scores >= threshold become 1, the rest 0
        pred = predictions.copy()
        pred[pred >= thres] = 1
        pred[pred < thres] = 0

        precision = precision_score(test, pred, average='macro')
        recall = recall_score(test, pred, average='macro')
        f1 = f1_score(test, pred, average='macro')

        rows.append({
            'threshold': round(thres, 4),
            'prediction': round(precision, 4),
            'recall': round(recall, 4),
            'f1': round(f1, 4),
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0
    return pd.DataFrame(rows, columns=['threshold', 'prediction', 'recall', 'f1'])

In [None]:
# Sweep the decision threshold on the validation scores to choose an operating point
optimal_threshold(y_val, y_pred)

Unnamed: 0,threshold,prediction,recall,f1
0,0.1,0.2787,0.8971,0.3661
1,0.2,0.2998,0.7897,0.3906
2,0.3,0.3269,0.6793,0.4127
3,0.4,0.3614,0.5564,0.4228
4,0.5,0.4029,0.4319,0.4101
5,0.6,0.4489,0.3185,0.3674
6,0.7,0.4894,0.2208,0.2954
7,0.8,0.5187,0.1366,0.2045
8,0.9,0.5444,0.0611,0.1041


In [None]:
# optimal threshold = 0.4 (highest macro F1 in the threshold sweep above)

# Binarise every label score: 1 when the score is >= 0.4, otherwise 0
lstm_pred = [[1 if i >=0.4 else 0 for i in pred] for pred in y_pred]

In [None]:
# Per-label precision / recall / F1 on the validation set at the chosen threshold.
# NOTE(review): rows 0-5 presumably follow the label column order (NotHate ... OtherHate).
print(metrics.classification_report(y_val,  lstm_pred, digits=6))

              precision    recall  f1-score   support

           0   0.926862  0.881623  0.903677     28950
           1   0.386081  0.650808  0.484651      9837
           2   0.256599  0.505290  0.340356      4348
           3   0.289981  0.585427  0.387849      2388
           4   0.021627  0.143162  0.037577       468
           5   0.287221  0.572041  0.382427      4664

   micro avg   0.547368  0.755207  0.634706     50655
   macro avg   0.361395  0.556392  0.422756     50655
weighted avg   0.667030  0.755207  0.693638     50655
 samples avg   0.595430  0.783974  0.638389     50655



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Headline validation metrics at the chosen threshold.
# NOTE(review): with multi-label targets accuracy_score is exact-match (subset)
# accuracy, which is why it sits far below the per-label scores.
subset_accuracy = accuracy_score(y_val, lstm_pred)
macro_f1 = f1_score(y_val, lstm_pred, average="macro")
print('Accuracy: ', subset_accuracy)
print('F1 score: ', macro_f1)

Accuracy:  0.21746267243284195
F1 score:  0.4227561368871449
