In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from keras.layers import Dropout, SpatialDropout1D
from keras.layers import Conv1D, MaxPooling1D, Conv2D
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling2D

import tensorflow as tf
import tensorflow_hub as hub
import logging

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

logging.basicConfig(level=logging.INFO)

**Load Data**

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): the dataset below is read from a relative path, not from the
# mount point — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the cleaned dataset from the Excel workbook; the first column of the
# sheet becomes the DataFrame index.
full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [None]:
# Label columns selected from full_clean_df as the prediction targets
labels_name_list = [
    'NotHate',
    'Racist',
    'Sexist',
    'Homophobe',
    'Religion',
    'OtherHate',
]

In [None]:
# Longest tweet, measured as the number of space-separated tokens
max(len(tweet.split(' ')) for tweet in full_clean_df['tweets_train'])

24

**Split Data**

In [None]:
# Hold out 33% of the rows as the test set; the seed is fixed so the split
# is the same on every run.
X, X_test, y, y_test = train_test_split(
    full_clean_df['tweets_train'],
    full_clean_df[labels_name_list],
    test_size=0.33,
    random_state=12,
)

In [None]:
# Split the remaining rows again: 33% of them become the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=12,
)

In [None]:
# Size of the training split after both hold-outs
X_train.shape

(64316,)

Define function to plot history

In [None]:
def plot_history(history, metric='auc'):
    """Plot training vs. validation curves for the loss and one extra metric.

    Two separate figures are shown one after the other: the loss per epoch,
    then the chosen metric per epoch.

    Parameters
    ----------
    history : object
        Object exposing ``epoch`` (the x-axis values) and ``history`` (a
        mapping from series name to per-epoch values), e.g. the value
        returned by Keras ``model.fit``.
    metric : str, default 'auc'
        Key of the metric plotted after the loss. ``history.history`` must
        contain both ``metric`` and ``'val_' + metric``.

    Raises
    ------
    KeyError
        If one of the required series is missing from ``history.history``.
    """

    def _plot_curves(key, label):
        # One figure per quantity: validation curve in solid green,
        # training curve in dashed red.
        plt.figure(figsize=(10, 5))
        plt.plot(history.epoch, history.history['val_' + key], 'g-', label='Validation data')
        plt.plot(history.epoch, history.history[key], 'r--', label='Training data')
        plt.grid(True)
        plt.xlabel('Number of epochs')
        plt.ylabel(label + ' on training/validation data')
        # Anchor the legend just outside the right edge so it never hides the curves
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

    _plot_curves('loss', 'Loss')
    _plot_curves(metric, metric.upper())

In [None]:
# Tokenize text: every word is represented by an integer id
max_features = 10000   # passed to the tokenizer as num_words
maxlen = 30            # every tweet is padded to this many tokens

tokenizer = text.Tokenizer(num_words=max_features)
# The vocabulary is learned from the training split only
tokenizer.fit_on_texts(X_train)

# Training split: X_train is replaced by its padded integer matrix
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

# Test split: X_test is replaced the same way
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

# Validation split: stored under new names, X_val keeps the raw text
val_encoded = tokenizer.texts_to_sequences(X_val)
val_padded = sequence.pad_sequences(val_encoded, maxlen=maxlen)

In [None]:
# Download the pre-trained GloVe Twitter embeddings and unpack them.
# -nc skips the download when the archive is already present, -n makes unzip
# keep files that were already extracted, and -y keeps apt non-interactive,
# so the cell can be re-run without re-downloading or waiting on a prompt.

!wget -nc http://nlp.stanford.edu/data/glove.twitter.27B.zip
!sudo apt install -y unzip
!unzip -n glove.twitter.27B.zip

In [None]:
# Load the embedding file
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    ``word`` is returned unchanged; every remaining positional argument is
    collected and converted to a float32 NumPy array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
# Map each word to its word vector. Every line of the file is split on single
# spaces; the first field is the token and the rest are the vector components.
# The context manager closes the file handle once the dict has been built.
with open(EMBEDDING_FILE, 'r', encoding='UTF-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [None]:
# Mean / std over all pre-trained vectors: words without a pre-trained vector
# keep a random row drawn from a normal distribution with these statistics.
# np.stack needs a real sequence, so the dict view is materialised as a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding value), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows to be fully indexable.
nb_words = min(max_features, len(word_index) + 1)
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Ids beyond the matrix are words cut off by the vocabulary cap
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


In [None]:
# Hyper-parameters shared by the model definition and the training call
embed_size = 100   # output_dim of the embedding layer
batch_size = 256   # batch_size passed to model.fit
epochs = 5         # upper bound on epochs passed to model.fit

In [None]:
# Define the neural network: frozen pre-trained embeddings -> two stacked LSTMs -> sigmoid outputs
model = Sequential()
# Non-trainable embedding layer. input_dim is read from the weight matrix
# itself so the layer can never disagree with the weights it is given.
model.add(Embedding(embedding_matrix.shape[0], output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False))
# LSTM stack: the first layer returns the full sequence so the second one can consume it
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.15))
model.add(LSTM(64))
model.add(Dropout(0.15))
# Six sigmoid units trained with binary cross-entropy, one per entry of labels_name_list
model.add(Dense(6, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision', 'Recall', 'AUC'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 30, 128)           117248    
_________________________________________________________________
dropout (Dropout)            (None, 30, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
Total params: 1,167,046
Trainable params: 167,046
Non-trainable params: 1,000,000
________________________________________

In [None]:
# Early stopping watches the validation loss (lower is better) and reports when it fires
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
# Keep the returned History object so the learning curves can be drawn with
# plot_history(history) instead of being thrown away.
history = model.fit(X_train, y_train, batch_size=batch_size, validation_data=(val_padded, y_val), epochs=epochs, callbacks=[es])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f18640bdc90>

In [None]:
# Model outputs for the validation set; these scores are thresholded below
y_pred = model.predict(val_padded)

In [None]:
# function to find the best threshold
def optimal_threshold(test, predictions, thresholds=None):
    """Tabulate macro precision / recall / F1 over a range of decision thresholds.

    Parameters
    ----------
    test : array-like
        Ground-truth label indicators.
    predictions : array-like
        Predicted scores with the same layout as ``test``. The input is not
        modified.
    thresholds : iterable of float, optional
        Cut-offs to evaluate; a score ``>= threshold`` counts as a positive
        prediction. Defaults to 0.1, 0.2, ..., 0.9.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``threshold``, ``prediction``,
        ``recall`` and ``f1``, each rounded to four decimals.
    """
    if thresholds is None:
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    scores = np.asarray(predictions)
    rows = []
    for thres in thresholds:
        # Binarise into a fresh array so the caller's scores stay untouched
        pred = (scores >= thres).astype(int)

        rows.append({
            'threshold': round(thres, 4),
            # NOTE(review): this column holds the macro *precision*; the
            # 'prediction' key is kept so existing consumers keep working.
            'prediction': round(precision_score(test, pred, average='macro'), 4),
            'recall': round(recall_score(test, pred, average='macro'), 4),
            'f1': round(f1_score(test, pred, average='macro'), 4),
        })

    # Build the frame once from the collected rows (DataFrame.append no longer
    # exists in current pandas and was quadratic anyway).
    return pd.DataFrame(rows, columns=['threshold', 'prediction', 'recall', 'f1'])

In [None]:
# Macro precision / recall / F1 on the validation set for each candidate threshold
optimal_threshold(y_val, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,threshold,prediction,recall,f1
0,0.1,0.4617,0.7107,0.5147
1,0.2,0.5945,0.6478,0.5441
2,0.3,0.5927,0.6044,0.5454
3,0.4,0.6281,0.5441,0.5316
4,0.5,0.6058,0.4002,0.4589
5,0.6,0.6511,0.3567,0.4195
6,0.7,0.6867,0.3044,0.3693
7,0.8,0.7158,0.2212,0.2738
8,0.9,0.4714,0.1277,0.1432


In [None]:
# optimal threshold = 0.3 (best macro F1 in the threshold table above)

# Turn every score into a hard 0/1 decision, keeping the row-per-sample layout
lstm_pred = [[int(score >= 0.3) for score in row] for row in y_pred]

In [None]:
# Per-label and averaged precision / recall / F1 for the thresholded validation predictions
print(metrics.classification_report(y_val,  lstm_pred, digits=6))

              precision    recall  f1-score   support

           0   0.930423  0.994508  0.961398     28950
           1   0.449706  0.869879  0.592898      9837
           2   0.494405  0.558878  0.524668      4348
           3   0.693434  0.654523  0.673417      2388
           4   0.500000  0.002137  0.004255       468
           5   0.488132  0.546741  0.515777      4664

   micro avg   0.703770  0.866489  0.776698     50655
   macro avg   0.592683  0.604444  0.545402     50655
weighted avg   0.743771  0.866489  0.788901     50655
 samples avg   0.723092  0.898850  0.773167     50655



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Headline numbers for the thresholded validation predictions.
# NOTE(review): y_val has several label columns, so accuracy_score is presumably
# the exact-match (subset) accuracy here — confirm against the scikit-learn docs.
print('Accuracy: ', accuracy_score(y_val, lstm_pred))
print('F1 score: ', f1_score(y_val, lstm_pred, average="macro"))

Accuracy:  0.39133179708955457
F1 score:  0.5454021878350711
