# Full Deep Attention Model

This model pulls out all the stops:
- Bidirectional LSTM
- Character Embeddings
- GloVe + Randomly-initialized Full Vocab Word Embeddings
- Self-Attention Module

## Set up

In [2]:
# 0. Some initial set-up.
from collections import Counter
import numpy as np
import os

import pandas as pd
import random
from tf_rnn_classifier import TfRNNClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_auc_score
import tensorflow as tf
import sst
from utils import evaluate, build_rnn_dataset
import utils

  from ._conv import register_converters as _register_converters


In [3]:
# Root directory that holds the vector-space-model resources.
vsmdata_home = 'vsmdata'

# Sub-directory for the GloVe 6B vectors (presumably the unpacked release --
# TODO confirm it is actually used by the loading cell).
glove_home = os.path.join(vsmdata_home, 'glove.6B')

# Directory that contains train.csv, test.csv and test_labels.csv.
data_dir = "./data/"

# Fix the global Python and NumPy RNG seeds so that randomly initialised
# embedding rows and any shuffling done through these generators are
# repeatable across "Restart Kernel -> Run All".
# NOTE(review): TensorFlow's own graph-level seed is not set here; whether
# TfRNNClassifier exposes one is not visible from this notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Read in Data

In [4]:
# Load the three CSV files from `data_dir`. Paths are assembled with
# os.path.join so the cell keeps working if `data_dir` is ever configured
# without a trailing slash.
# Missing cells in train/test are replaced by a single space; the label file
# is loaded unmodified.
train = pd.read_csv(os.path.join(data_dir, "train.csv")).fillna(' ')
test = pd.read_csv(os.path.join(data_dir, "test.csv")).fillna(' ')
test_labels = pd.read_csv(os.path.join(data_dir, "test_labels.csv"))

In [5]:
# Load the GloVe file glove.6B.100d.txt into a lookup keyed by word (it is
# queried below with `w in glove_lookup` / `glove_lookup[w]`).
# NOTE(review): the path is built from `vsmdata_home`, not from the
# `glove_home` directory defined in the set-up cell, which is otherwise unused
# in the visible cells -- confirm where the file actually lives.
glove_lookup = utils.glove2dict(
    os.path.join(vsmdata_home, 'glove.6B.100d.txt'))

## Format for RNN

In [6]:
# Convert the training frame into RNN inputs and label arrays. Both results
# are indexed by 'train' and 'dev' in the cells below.
# 0.9 is presumably the fraction of rows kept for training -- TODO confirm
# against utils.build_rnn_dataset.
X_rnn, Y_rnn = build_rnn_dataset(train, 0.9)

In [7]:
# Collapse the per-class label matrix of each split into a single column by
# taking the row-wise maximum (assumes 0/1 indicator labels, so the result is
# 1 when any class is set -- TODO confirm), then show the train shape.
Y_rnn_binary, Y_rnn_binary_dev = (
    np.max(Y_rnn[split], axis=1).reshape((-1, 1))
    for split in ('train', 'dev')
)
print(Y_rnn_binary.shape)

(143613, 1)


In [8]:
# Deduplicate the training vocabulary and order it, so that row i of the
# embedding matrix built in the next cell lines up with full_glove_vocab[i].
full_glove_vocab = list(set(sst.get_vocab(X_rnn['train'])))
full_glove_vocab.sort()
print("Embedding matrix contains %d words." % len(full_glove_vocab))

Embedding matrix contains 495027 words.


In [9]:
# Width of the pretrained vectors, looked up once instead of once per
# out-of-GloVe word inside the comprehension below.
glove_dim = len(glove_lookup["hello"])

# One row per vocabulary word, in vocabulary order: the GloVe vector when the
# word has one, otherwise a fresh random vector of the same width.
full_glove_embedding = np.array([
    glove_lookup[w] if w in glove_lookup else utils.randvec(glove_dim)
    for w in full_glove_vocab
])

In [10]:
# Add a trailing "$UNK" entry to the vocabulary together with a matching
# random row in the embedding matrix (presumably the token the classifier
# maps unknown words to -- TODO confirm against TfRNNClassifier).
# The guard makes the cell idempotent: re-running it no longer appends a
# second "$UNK" token and an extra embedding row.
if not full_glove_vocab or full_glove_vocab[-1] != "$UNK":
    full_glove_vocab.append("$UNK")
    full_glove_embedding = np.vstack(
        (full_glove_embedding, utils.randvec(full_glove_embedding.shape[1])))

## Experiment #7: GloVe + Char Embeds + Self-Attention

In [10]:
# Experiment 7 model: bidirectional GRU classifier over the GloVe-initialised
# vocabulary, with trainable word embeddings, character embeddings and
# self-attention switched on.
# NOTE(review): the notebook intro advertises a bidirectional LSTM, but this
# cell passes GRUCell; the later experiments pass LSTMCell.
self_attn = TfRNNClassifier(
    full_glove_vocab,
    embedding=full_glove_embedding,
    embed_dim=100,
    hidden_dim=50,
    max_length=100,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.GRUCell, # GRU
    train_embedding=True,
    char_embed=True,
    char_embed_dim=20,
    max_iter=20,
    word_length=12,
    bidir_rnn=True, # Bidirectional RNN!
    self_attend=True, # Self-Attention
    eta=0.01)

In [None]:
# Train on the training split. No dev data is passed in this call, unlike the
# fit calls of the later experiments.
self_attn.fit(X_rnn['train'][:], Y_rnn['train'][:])

In [None]:
# Predict on the held-out dev split with the Experiment 7 model.
self_attn_preds = self_attn.predict(X_rnn['dev'][:])

In [38]:
# Score the dev predictions against the dev labels using the project's
# `evaluate` helper (imported from utils).
evaluate(Y_rnn['dev'][:], self_attn_preds)

CLASS: toxic
p, r, f1: 0.6391, 0.7279, 0.6806

CLASS: severe_toxic
p, r, f1: 0.4198, 0.3481, 0.3806

CLASS: obscene
p, r, f1: 0.7969, 0.7590, 0.7775

CLASS: threat
p, r, f1: 0.4889, 0.3860, 0.4314

CLASS: insult
p, r, f1: 0.6716, 0.6104, 0.6395

CLASS: identity_hate
p, r, f1: 0.4571, 0.2286, 0.3048

average F1 score: 0.535737
macro-averaged ROC-AUC score: 0.948763


## Experiment #8: GloVe + More Char Embeds + Self-Attention

In [29]:
# Experiment 8 model: bidirectional LSTM with character embeddings,
# self-attention and dropout=0.2, limited to max_iter=3.
# NOTE(review): despite the `_nochar` suffix, char_embed=True is passed here,
# so the variable name looks stale.
# char_embed_dim is not passed (Experiment 7 passes 20), so whatever default
# TfRNNClassifier defines applies -- not visible from this notebook.
self_attn_nochar = TfRNNClassifier(
    full_glove_vocab,
    embedding=full_glove_embedding,
    embed_dim=100,
    hidden_dim=50,
    max_length=100,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.LSTMCell, # LSTM
    train_embedding=True,
    max_iter=3,
    word_length=12,
    bidir_rnn=True, # Bidirectional RNN!
    char_embed=True,
    self_attend=True, # Self-Attention
    dropout=0.2,
    eta=0.01,
    experiment_name="multilabel_self_ce_glove_longer")

In [30]:
# Fit the Experiment 8 model on the multi-label training targets, handing the
# dev split to fit() as X_dev / y_dev.
self_attn_nochar.fit(
    X_rnn['train'][:],
    Y_rnn['train'][:],
    X_dev=X_rnn['dev'],
    y_dev=Y_rnn['dev'],
)

Tensor("concat_2:0", shape=(?, 200, 50), dtype=float32)
./logs/multilabel_self_ce_glove_longer
epoch 1, iter 1, loss 0.71997, batch_time 1.323
epoch 1, iter 2, loss 0.29332, batch_time 1.220
epoch 1, iter 3, loss 0.16521, batch_time 1.249
epoch 1, iter 4, loss 0.14916, batch_time 1.207
epoch 1, iter 5, loss 0.16207, batch_time 1.208
epoch 1, iter 6, loss 0.14912, batch_time 1.234
epoch 1, iter 7, loss 0.13836, batch_time 1.220
epoch 1, iter 8, loss 0.14833, batch_time 1.179
epoch 1, iter 9, loss 0.14756, batch_time 1.239
epoch 1, iter 10, loss 0.13358, batch_time 1.236
epoch 1, iter 11, loss 0.15011, batch_time 1.218
epoch 1, iter 12, loss 0.14306, batch_time 1.206
epoch 1, iter 13, loss 0.14972, batch_time 1.204
epoch 1, iter 14, loss 0.15284, batch_time 1.210
epoch 1, iter 15, loss 0.14087, batch_time 1.193
epoch 1, iter 16, loss 0.16691, batch_time 1.178
epoch 1, iter 17, loss 0.13420, batch_time 1.213
epoch 1, iter 18, loss 0.13864, batch_time 1.192
epoch 1, iter 19, loss 0.13674, 

epoch 2, iter 164, loss 0.05833, batch_time 1.173
epoch 2, iter 165, loss 0.07066, batch_time 1.165
epoch 2, iter 166, loss 0.07019, batch_time 1.188
epoch 2, iter 167, loss 0.06135, batch_time 1.166
epoch 2, iter 168, loss 0.06821, batch_time 1.156
epoch 2, iter 169, loss 0.06371, batch_time 1.150
epoch 2, iter 170, loss 0.06267, batch_time 1.153
epoch 2, iter 171, loss 0.04853, batch_time 1.163
epoch 2, iter 172, loss 0.06057, batch_time 1.227
epoch 2, iter 173, loss 0.06567, batch_time 1.177
epoch 2, iter 174, loss 0.06977, batch_time 1.210
epoch 2, iter 175, loss 0.06869, batch_time 1.186
epoch 2, iter 176, loss 0.06692, batch_time 1.187
epoch 2, iter 177, loss 0.06452, batch_time 1.187
epoch 2, iter 178, loss 0.06907, batch_time 1.199
epoch 2, iter 179, loss 0.06883, batch_time 1.178
epoch 2, iter 180, loss 0.06262, batch_time 1.155
epoch 2, iter 181, loss 0.05421, batch_time 1.164
epoch 2, iter 182, loss 0.07205, batch_time 1.161
epoch 2, iter 183, loss 0.07084, batch_time 1.159


epoch 3, iter 327, loss 0.04998, batch_time 1.186
epoch 3, iter 328, loss 0.05668, batch_time 1.187
epoch 3, iter 329, loss 0.04813, batch_time 1.161
epoch 3, iter 330, loss 0.04383, batch_time 1.163
epoch 3, iter 331, loss 0.04780, batch_time 1.145
epoch 3, iter 332, loss 0.05016, batch_time 1.167
epoch 3, iter 333, loss 0.05848, batch_time 1.181
epoch 3, iter 334, loss 0.05781, batch_time 1.184
epoch 3, iter 335, loss 0.04380, batch_time 1.177
epoch 3, iter 336, loss 0.04786, batch_time 1.166
epoch 3, iter 337, loss 0.04542, batch_time 1.173
epoch 3, iter 338, loss 0.04189, batch_time 1.162
epoch 3, iter 339, loss 0.04472, batch_time 1.168
epoch 3, iter 340, loss 0.04413, batch_time 1.178
epoch 3, iter 341, loss 0.04824, batch_time 1.199
epoch 3, iter 342, loss 0.04885, batch_time 1.172
epoch 3, iter 343, loss 0.04843, batch_time 1.147
epoch 3, iter 344, loss 0.03998, batch_time 1.151
epoch 3, iter 345, loss 0.04776, batch_time 1.209
epoch 3, iter 346, loss 0.04604, batch_time 1.185


<tf_rnn_classifier.TfRNNClassifier at 0x7f9d4f29f2e8>

In [31]:
# Predict on the dev split with the Experiment 8 model.
self_attn_nochar_preds = self_attn_nochar.predict(X_rnn['dev'][:])

In [32]:
# Score the Experiment 8 dev predictions against the dev labels.
evaluate(Y_rnn['dev'][:], self_attn_nochar_preds)

CLASS: toxic
p, r, f1: 0.7013, 0.7113, 0.7063

CLASS: severe_toxic
p, r, f1: 0.4923, 0.2370, 0.3200

CLASS: obscene
p, r, f1: 0.7426, 0.7202, 0.7312

CLASS: threat
p, r, f1: 0.0000, 0.0000, 0.0000

CLASS: insult
p, r, f1: 0.6793, 0.6059, 0.6405

CLASS: identity_hate
p, r, f1: 0.0000, 0.0000, 0.0000

average F1 score: 0.399667
weighted avg. F1 scored: 0.643811
macro-averaged ROC-AUC score: 0.960022


In [15]:
# Predict on the training split to compare train and dev performance.
# NOTE(review): this reuses the name `self_attn_nochar_preds`, so the dev-set
# predictions stored under that name by the earlier cell are overwritten.
self_attn_nochar_preds = self_attn_nochar.predict(X_rnn['train'][:])

In [16]:
# Score against the training labels; `self_attn_nochar_preds` holds the
# training-split predictions at this point.
evaluate(Y_rnn['train'][:], self_attn_nochar_preds)

CLASS: toxic
p, r, f1: 0.8776, 0.7252, 0.7942

CLASS: severe_toxic
p, r, f1: 0.5645, 0.1438, 0.2293

CLASS: obscene
p, r, f1: 0.7975, 0.7597, 0.7781

CLASS: threat
p, r, f1: 0.0000, 0.0000, 0.0000

CLASS: insult
p, r, f1: 0.7407, 0.6701, 0.7036

CLASS: identity_hate
p, r, f1: 0.0000, 0.0000, 0.0000

average F1 score: 0.417536
weighted avg. F1 scored: 0.701436
macro-averaged ROC-AUC score: 0.974005


## Experiment #9: The Binary Task

In [22]:
# Binary-task model: same bidirectional LSTM + char-embedding + self-attention
# settings as the Experiment 8 model, with max_iter=2 and the additional
# tol=1e-12 and eval_every=5 arguments. It is fitted on the collapsed
# any-label targets (Y_rnn_binary) in the next cell.
self_ce_binary = TfRNNClassifier(
    full_glove_vocab,
    embedding=full_glove_embedding,
    embed_dim=100,
    hidden_dim=50,
    max_length=100,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.LSTMCell, # LSTM
    train_embedding=True,
    max_iter=2,
    word_length=12,
    bidir_rnn=True, # Bidirectional RNN!
    char_embed=True,
    self_attend=True, # Self-Attention
    dropout=0.2,
    tol=1e-12,
    eta=0.01,
    eval_every=5,
    experiment_name="self-ce-glove-short")

In [23]:
# Fit the first binary network on the collapsed labels, passing the dev
# inputs and binary dev labels to fit() as X_dev / y_dev.
self_ce_binary.fit(
    X_rnn['train'][:],
    Y_rnn_binary[:],
    X_dev=X_rnn['dev'],
    y_dev=Y_rnn_binary_dev[:],
)

Tensor("concat_2:0", shape=(?, 200, 50), dtype=float32)
./logs/self-ce-glove-short
epoch 1, iter 1, loss 0.67013, batch_time 1.407
epoch 1, iter 2, loss 0.35103, batch_time 1.230
epoch 1, iter 3, loss 0.33026, batch_time 1.207
epoch 1, iter 4, loss 0.34386, batch_time 1.231
epoch 1, iter 5, loss 0.34761, batch_time 1.198
epoch 1, iter 6, loss 0.35414, batch_time 1.207
epoch 1, iter 7, loss 0.31585, batch_time 1.205
epoch 1, iter 8, loss 0.27872, batch_time 1.197
epoch 1, iter 9, loss 0.35890, batch_time 1.192
epoch 1, iter 10, loss 0.33638, batch_time 1.195
epoch 1, iter 11, loss 0.32713, batch_time 1.254
epoch 1, iter 12, loss 0.34946, batch_time 1.230
epoch 1, iter 13, loss 0.33819, batch_time 1.200
epoch 1, iter 14, loss 0.31012, batch_time 1.179
epoch 1, iter 15, loss 0.31772, batch_time 1.206
epoch 1, iter 16, loss 0.30849, batch_time 1.214
epoch 1, iter 17, loss 0.36337, batch_time 1.191
epoch 1, iter 18, loss 0.32529, batch_time 1.200
epoch 1, iter 19, loss 0.31991, batch_time 1

epoch 2, iter 165, loss 0.13542, batch_time 1.193
epoch 2, iter 166, loss 0.11626, batch_time 1.199
epoch 2, iter 167, loss 0.10454, batch_time 1.198
epoch 2, iter 168, loss 0.13531, batch_time 1.204
epoch 2, iter 169, loss 0.13681, batch_time 1.217
epoch 2, iter 170, loss 0.12034, batch_time 1.195
epoch 2, iter 171, loss 0.12356, batch_time 1.186
epoch 2, iter 172, loss 0.12175, batch_time 1.205
epoch 2, iter 173, loss 0.13468, batch_time 1.209
epoch 2, iter 174, loss 0.11555, batch_time 1.195
epoch 2, iter 175, loss 0.11436, batch_time 1.265
epoch 2, iter 176, loss 0.13883, batch_time 1.223
epoch 2, iter 177, loss 0.11451, batch_time 1.203
epoch 2, iter 178, loss 0.13066, batch_time 1.194
epoch 2, iter 179, loss 0.12993, batch_time 1.185
epoch 2, iter 180, loss 0.11209, batch_time 1.233
epoch 2, iter 181, loss 0.11206, batch_time 1.208
epoch 2, iter 182, loss 0.10221, batch_time 1.204
epoch 2, iter 183, loss 0.13616, batch_time 1.187
epoch 2, iter 184, loss 0.10583, batch_time 1.160


<tf_rnn_classifier.TfRNNClassifier at 0x7efb3e452ef0>

In [24]:
# Dev-split predictions of the first binary network; reused by the ensemble
# cells further down.
self_ce_binary_preds = self_ce_binary.predict(X_rnn['dev'])

In [25]:
# Score the first binary network on the binary dev labels.
evaluate(Y_rnn_binary_dev, self_ce_binary_preds)

p, r, f1: 0.8537, 0.6636, 0.7468

average F1 score: 0.746767


## 2-Network Ensemble Experiment

In [30]:
# Second ensemble member: the same hyper-parameters as `self_ce_binary`; only
# experiment_name differs. Any difference between the two fitted networks
# therefore presumably comes from randomness during training (e.g.
# initialisation) -- not visible from this notebook.
self_ce_binary2 = TfRNNClassifier(
    full_glove_vocab,
    embedding=full_glove_embedding,
    embed_dim=100,
    hidden_dim=50,
    max_length=100,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.LSTMCell, # LSTM
    train_embedding=True,
    max_iter=2,
    word_length=12,
    bidir_rnn=True, # Bidirectional RNN!
    char_embed=True,
    self_attend=True, # Self-Attention
    dropout=0.2,
    tol=1e-12,
    eta=0.01,
    eval_every=5,
    experiment_name="self-ce-glove-short2")

In [31]:
# Fit the second binary network with the same data and dev arguments as the
# first one.
self_ce_binary2.fit(
    X_rnn['train'][:],
    Y_rnn_binary[:],
    X_dev=X_rnn['dev'],
    y_dev=Y_rnn_binary_dev[:],
)

Tensor("concat_2:0", shape=(?, 200, 50), dtype=float32)
./logs/self-ce-glove-short2
epoch 1, iter 1, loss 0.74346, batch_time 1.328
epoch 1, iter 2, loss 0.39696, batch_time 1.186
epoch 1, iter 3, loss 0.34094, batch_time 1.209
epoch 1, iter 4, loss 0.32383, batch_time 1.272
epoch 1, iter 5, loss 0.36832, batch_time 1.192
epoch 1, iter 6, loss 0.36991, batch_time 1.183
epoch 1, iter 7, loss 0.35453, batch_time 1.199
epoch 1, iter 8, loss 0.32273, batch_time 1.198
epoch 1, iter 9, loss 0.28137, batch_time 1.194
epoch 1, iter 10, loss 0.38317, batch_time 1.175
epoch 1, iter 11, loss 0.34817, batch_time 1.203
epoch 1, iter 12, loss 0.34096, batch_time 1.206
epoch 1, iter 13, loss 0.34210, batch_time 1.203
epoch 1, iter 14, loss 0.35123, batch_time 1.191
epoch 1, iter 15, loss 0.32478, batch_time 1.185
epoch 1, iter 16, loss 0.28586, batch_time 1.181
epoch 1, iter 17, loss 0.32770, batch_time 1.198
epoch 1, iter 18, loss 0.32448, batch_time 1.211
epoch 1, iter 19, loss 0.31625, batch_time 

epoch 2, iter 165, loss 0.10353, batch_time 1.191
epoch 2, iter 166, loss 0.12919, batch_time 1.193
epoch 2, iter 167, loss 0.13411, batch_time 1.175
epoch 2, iter 168, loss 0.10446, batch_time 1.184
epoch 2, iter 169, loss 0.12686, batch_time 1.194
epoch 2, iter 170, loss 0.11491, batch_time 1.210
epoch 2, iter 171, loss 0.11871, batch_time 1.197
epoch 2, iter 172, loss 0.09856, batch_time 1.218
epoch 2, iter 173, loss 0.11561, batch_time 1.208
epoch 2, iter 174, loss 0.11223, batch_time 1.220
epoch 2, iter 175, loss 0.09169, batch_time 1.210
epoch 2, iter 176, loss 0.13819, batch_time 1.216
epoch 2, iter 177, loss 0.12205, batch_time 1.177
epoch 2, iter 178, loss 0.11086, batch_time 1.199
epoch 2, iter 179, loss 0.12394, batch_time 1.180
epoch 2, iter 180, loss 0.14298, batch_time 1.193
epoch 2, iter 181, loss 0.10058, batch_time 1.204
epoch 2, iter 182, loss 0.13303, batch_time 1.208
epoch 2, iter 183, loss 0.10883, batch_time 1.191
epoch 2, iter 184, loss 0.11197, batch_time 1.196


<tf_rnn_classifier.TfRNNClassifier at 0x7efb60379278>

In [27]:
# Dev-split predictions of the second binary network.
# NOTE(review): the NameError saved as this cell's output is stale -- the
# cell's execution counter (27) is lower than that of the cell defining
# `self_ce_binary2` (30), so it was last run out of order. Re-run top to
# bottom to refresh the output.
self_ce_binary_preds2 = self_ce_binary2.predict(X_rnn['dev'])

NameError: name 'self_ce_binary2' is not defined

In [28]:
# Score the second binary network on the binary dev labels.
# NOTE(review): the NameError saved as this cell's output comes from an
# out-of-order run (execution counter 28); re-run after the prediction cell
# to obtain the real scores.
evaluate(Y_rnn_binary_dev, self_ce_binary_preds2)

NameError: name 'self_ce_binary_preds2' is not defined

In [37]:
# Shape check for the two-network average: place both prediction columns side
# by side, average across them, and keep the column axis.
print(np.hstack((self_ce_binary_preds, self_ce_binary_preds2)).mean(axis=1, keepdims=True).shape)

(15958, 1)


In [38]:
# Two-network ensemble: place the two prediction columns side by side, take
# their row-wise mean (column axis kept), and score that against the binary
# dev labels.
evaluate(
    Y_rnn_binary_dev,
    np.hstack((self_ce_binary_preds, self_ce_binary_preds2)).mean(axis=1, keepdims=True),
)

p, r, f1: 0.8764, 0.6776, 0.7642

average F1 score: 0.764244


## 3-Network Ensemble Experiment


In [40]:
# Third ensemble member: the same hyper-parameters as the first two binary
# networks.
# experiment_name now has its own value. It was previously
# "self-ce-glove-short2", copied from the second network, so this run logged
# into the second network's ./logs/self-ce-glove-short2 directory.
self_ce_binary3 = TfRNNClassifier(
    full_glove_vocab,
    embedding=full_glove_embedding,
    embed_dim=100,
    hidden_dim=50,
    max_length=100,
    hidden_activation=tf.nn.tanh,
    cell_class=tf.nn.rnn_cell.LSTMCell, # LSTM
    train_embedding=True,
    max_iter=2,
    word_length=12,
    bidir_rnn=True, # Bidirectional RNN!
    char_embed=True,
    self_attend=True, # Self-Attention
    dropout=0.2,
    tol=1e-12,
    eta=0.01,
    eval_every=5,
    experiment_name="self-ce-glove-short3")

In [41]:
# Fit the third binary network with the same data and dev arguments as the
# other ensemble members.
self_ce_binary3.fit(
    X_rnn['train'][:],
    Y_rnn_binary[:],
    X_dev=X_rnn['dev'],
    y_dev=Y_rnn_binary_dev[:],
)

Tensor("concat_2:0", shape=(?, 200, 50), dtype=float32)
./logs/self-ce-glove-short2
epoch 1, iter 1, loss 0.92377, batch_time 1.355
epoch 1, iter 2, loss 0.35723, batch_time 1.203
epoch 1, iter 3, loss 0.37418, batch_time 1.189
epoch 1, iter 4, loss 0.29872, batch_time 1.200
epoch 1, iter 5, loss 0.32991, batch_time 1.195
epoch 1, iter 6, loss 0.32063, batch_time 1.180
epoch 1, iter 7, loss 0.30011, batch_time 1.209
epoch 1, iter 8, loss 0.33972, batch_time 1.180
epoch 1, iter 9, loss 0.31641, batch_time 1.204
epoch 1, iter 10, loss 0.33033, batch_time 1.194
epoch 1, iter 11, loss 0.34008, batch_time 1.210
epoch 1, iter 12, loss 0.34613, batch_time 1.226
epoch 1, iter 13, loss 0.32985, batch_time 1.227
epoch 1, iter 14, loss 0.31330, batch_time 1.227
epoch 1, iter 15, loss 0.31259, batch_time 1.230
epoch 1, iter 16, loss 0.34654, batch_time 1.196
epoch 1, iter 17, loss 0.31464, batch_time 1.199
epoch 1, iter 18, loss 0.29923, batch_time 1.209
epoch 1, iter 19, loss 0.30521, batch_time 

epoch 2, iter 165, loss 0.11877, batch_time 1.181
epoch 2, iter 166, loss 0.11204, batch_time 1.245
epoch 2, iter 167, loss 0.13276, batch_time 1.196
epoch 2, iter 168, loss 0.15574, batch_time 1.188
epoch 2, iter 169, loss 0.12671, batch_time 1.195
epoch 2, iter 170, loss 0.14983, batch_time 1.194
epoch 2, iter 171, loss 0.10792, batch_time 1.184
epoch 2, iter 172, loss 0.11358, batch_time 1.191
epoch 2, iter 173, loss 0.09446, batch_time 1.265
epoch 2, iter 174, loss 0.11000, batch_time 1.194
epoch 2, iter 175, loss 0.12351, batch_time 1.203
epoch 2, iter 176, loss 0.11209, batch_time 1.188
epoch 2, iter 177, loss 0.12270, batch_time 1.226
epoch 2, iter 178, loss 0.12953, batch_time 1.206
epoch 2, iter 179, loss 0.11263, batch_time 1.194
epoch 2, iter 180, loss 0.11115, batch_time 1.190
epoch 2, iter 181, loss 0.11389, batch_time 1.195
epoch 2, iter 182, loss 0.13006, batch_time 1.200
epoch 2, iter 183, loss 0.11337, batch_time 1.196
epoch 2, iter 184, loss 0.13489, batch_time 1.205


<tf_rnn_classifier.TfRNNClassifier at 0x7efb14722550>

In [42]:
# Dev-split predictions of the third binary network.
self_ce_binary_preds3 = self_ce_binary3.predict(X_rnn['dev'])

In [43]:
# Score the third binary network on its own, before ensembling.
evaluate(Y_rnn_binary_dev, self_ce_binary_preds3)

p, r, f1: 0.8102, 0.7205, 0.7627

average F1 score: 0.762728


In [44]:
# Three-network ensemble: place the three prediction columns side by side,
# take their row-wise mean (column axis kept), and score that against the
# binary dev labels.
evaluate(
    Y_rnn_binary_dev,
    np.hstack((
        self_ce_binary_preds,
        self_ce_binary_preds2,
        self_ce_binary_preds3,
    )).mean(axis=1, keepdims=True),
)

p, r, f1: 0.8784, 0.6903, 0.7730

average F1 score: 0.773035
