In [1]:
%matplotlib inline  
import matplotlib.pyplot as plt

from keras.layers import Bidirectional, Input, LSTM, Dense, Activation, Conv1D, Flatten, Embedding, MaxPooling1D, Dropout
from keras.layers import Add, Concatenate, Lambda, Reshape, Permute, Average, Layer, TimeDistributed, Multiply, GRU, BatchNormalization
#from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from gensim.models import Word2Vec
from keras.models import Sequential, Model
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.utils import shuffle
import pickle
from sklearn.model_selection import train_test_split
from autocorrect import spell
import spacy
from spacy.gold import GoldParse
nlp = spacy.load('en')
import re
from sklearn.utils import shuffle
import keras
import joblib
from keras.utils.vis_utils import plot_model
import keras.backend as K
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict
import tokenizer_util as tu

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('train.csv')

In [3]:
pred_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [4]:
df['total_classes'] = df['toxic']+df['severe_toxic']+df['obscene']+df['threat']+df['insult']+df['identity_hate']

In [5]:
comment_col = 'comment_text'

In [6]:
#df[comment_col] = df[comment_col].astype(str).apply(lambda x : x.replace("'", "").replace('"',''))

In [7]:
df[comment_col] = df[comment_col].apply(lambda x: re.sub('[0-9]','',x))

In [8]:
comment_list = df[comment_col].tolist()
n_classes = 1

In [9]:
tokenizer = joblib.load('tokenizer_100')
final_emb_matrix = joblib.load('embedding_100')
print('Total vocabulary is {0}'.format(final_emb_matrix.shape[0]))

Total vocabulary is 233222


### Replacing all the unknown words with UNK. This will have no impact on training as all the words are known

In [10]:
print ("The vocabulary size is: {0}".format(len(tokenizer.word_index)))
print (tokenizer.texts_to_sequences([tu.replace_unknown_words_with_UNK("DFLSDKJFLS ADFSDF was Infosys CEO", tokenizer)]))

The vocabulary size is: 233221
[[65247, 65247, 36, 65247, 65247]]


In [11]:
class_count = []
for col in pred_cols:
    class_count.append((col,len(df[df[col]==1])))
print (class_count)

[('toxic', 15294), ('severe_toxic', 1595), ('obscene', 8449), ('threat', 478), ('insult', 7877), ('identity_hate', 1405)]


In [12]:
train, test = train_test_split(df, test_size=0.10, random_state=42)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_classes
34117,5b02208daa29a40f,Outrageous!!!!! \n\nThis block is outrageous a...,0,0,0,0,0,0,0
6579,1190ddc487465bd2,Except that you would never dare say something...,0,0,0,0,0,0,0
59858,a0473abe447e04e3,Thanks for your reply and your explanation; yo...,0,0,0,0,0,0,0
86152,e6763dac9d770096,or attempted generalization,0,0,0,0,0,0,0
7620,1446437fe8605add,You seem to be vandalising the article. Why a...,1,0,0,0,0,0,1


In [13]:
XVal = tokenizer.texts_to_sequences(test.astype(str)[comment_col].tolist())
#YTrain = np.array(train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])
#YVal = np.array(test[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])

In [14]:
def ys(dftox, predcols):
    ys = []
    for col in predcols:
        ys.append(np.array(dftox[col].tolist()))
    return ys

In [15]:
def ys_unified(dftox, predcols):
    ys = dftox[predcols].values
    return ys

In [16]:
YTrain = ys(train, pred_cols)
YVal = ys(test, pred_cols)

In [17]:
"""
Attention Layer with works follows the math from https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf
This layer only computes the weights, does not multiply the RNN output with the weights. This layer has to be
followed by a Multiply layer, followed by Reshape, followed by a Lambda for summing.
"""
class ATTNWORD(Layer):
    def __init__(self,output_dim, **kwargs):
        self.output_dim = output_dim
        #self.supports_masking = True
        super(ATTNWORD, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        print('The input shape is: {}'.format(input_shape))
        self.kernel = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        self.input_shape_bk = input_shape
        super(ATTNWORD, self).build(input_shape)  

    def call(self, x,mask=None):
        print ('kernel shape', self.kernel.shape)
        print ('Input shape', x.shape)
        product = K.dot(x, self.kernel)
        product = K.reshape(product, (-1, self.output_dim, self.input_shape_bk[1]))

        x_norm  = K.softmax(product)
        print ('Norm shape', x_norm.shape)
        x_norm = K.reshape(x_norm, (-1, self.input_shape_bk[1],self.output_dim))

        return x_norm
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], self.output_dim)


In [18]:
"""
A attenion layer, built on the basis of https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf.
Takes care of all the atention compute, Takes array of input - Bidirectional RNN output and the TanH layer output.
Usage ATTNWORD_COMPLETE(1)([tanh_output, rnn_output])
"""
class ATTNWORD_COMPLETE(Layer):
    def __init__(self,output_dim, **kwargs):
        self.output_dim = output_dim
        #self.supports_masking = True
        super(ATTNWORD, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        print('The input shape is: {}'.format(input_shape))
        self.kernel = self.add_weight(name='kernel', 
                                      shape=(input_shape[0][-1], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)
        self.input_shape_bk = input_shape
        super(ATTNWORD, self).build(input_shape)  

    def call(self, x,mask=None):
        print ('kernel shape', self.kernel.shape)
        print ('Input shape', x[0].shape)
        product = K.dot(x[0], self.kernel)
        product = K.reshape(product, (-1, self.output_dim, self.input_shape_bk[0][1]))
        x_norm  = K.softmax(product)
        print ('Norm shape', x_norm.shape)
        x_norm = K.reshape(x_norm, (-1, self.input_shape_bk[0][1],self.output_dim))
        print ('reshaped Norm shape: {0} and hit shape is {1}'.format( x_norm.shape, x[1].shape))
        attn_final = x[1]*x_norm
        print ('Attn final shape', attn_final.shape)
        attn_final = K.reshape(attn_final, (-1, self.input_shape_bk[1][-1], self.input_shape_bk[0][1]))

        attn_final = K.sum(attn_final, axis=2)
        print ('Attn final shape sum', attn_final.shape)
        return attn_final
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[1][-1])        

In [19]:
"""
This method creates a model with an input of word length, followed by embedding layer and finally GRU, 
with output dim as passed in the argument.
"""
def get_word_attention(emb_matrix, word_length, optimizer, nclasses, gru_output_dim=50):
    input = Input(shape=(word_length, ), dtype='int32')
    embedding = Embedding( input_dim=emb_matrix.shape[0], output_dim=emb_matrix.shape[1], weights=[emb_matrix],input_length=word_length,trainable=True, mask_zero=True)
    sequence_input = embedding(input)
    print('embedding: ',sequence_input.shape)
    x = Bidirectional(GRU(gru_output_dim,return_sequences=True, dropout=0.1))(sequence_input)
    #x = BatchNormalization()(x)
    print('Shape after BD LSTM',x.shape)
    model = Model(input, x) 
    return model


In [20]:
"""
This method applies attention only at the word level. The last layer is a sigmoid layer with output of 1. 
The output is going to be an array, the number of output is determined by n_classes.
Here the labels are assumed to be independent of each other and probability for each label is independently calculated
using dedicated Attention layer for each.
"""
def attention_words_only(emb_matrix, word_length, n_classes, trainable=True):
    nclasses = n_classes
    preds = []
    attentions_pred = []
    input = Input(shape=(word_length, ), dtype='int32')
    embedding = Embedding( input_dim=emb_matrix.shape[0], output_dim=emb_matrix.shape[1], weights=[emb_matrix],input_length=word_length,trainable=True)
    sequence_input = embedding(input)
    print('embedding: ',sequence_input.shape)
    x = Bidirectional(GRU(50,return_sequences=True))(sequence_input)
    word_vectors = TimeDistributed(Dense(100, activation='tanh'))(x) #TanH layer as required by the paper, is external to the Attn layer.
    print('Shape after word vector',word_vectors.shape)
    h_it = x
    print('Shape after reshape word vector',h_it.shape)

    attn_final_word = [ATTNWORD_COMPLETE(1)([word_vectors, h_it]) for i in range(nclasses)]
    print('ATTN Shape', attn_final_word[0].shape)
    
    for i in range(nclasses):
        #x = Dense(128, activation='relu',trainable=trainable)(attn_final_word[i])
        #x = Dropout(0.2)(x)
        #x = Dense(128, activation='relu',trainable=trainable)(x)
        #x = Dropout(0.2)(x)
        #x = Dense(64, activation='relu',trainable=trainable)(x)
        #x = Dropout(0.2)(x)
        #x = Dense(64, activation='relu',trainable=trainable)(x)
        p = Dense(1, activation='sigmoid')(attn_final_word[i])
        preds.append(p)
    model = Model(input, preds)

    return model

In [21]:
"""
Method to return model with hierarchical attention. The output is an array of output of the size n_classes, each with 
its own sigmoid. 
"""
def get_sentence_attention(word_model , word_length, sent_length, n_classes):
    #x = Permute((2,1))(si_vects)
    nclasses = n_classes
    input = Input(shape=(sent_length, word_length ), dtype='int32')
    print(' input to sentence attn network',word_model)
    preds = []
    attentions_pred = []

    si_vects = TimeDistributed(word_model)(input)
    print('Shape after si_vects', si_vects.shape)
    u_it = TimeDistributed(TimeDistributed(Dense(100, activation='tanh')))(si_vects)
    print('Shape after word vector',u_it.shape)
    u_it = TimeDistributed(TimeDistributed(BatchNormalization()))(u_it)
    
    attn_final_word = [TimeDistributed(ATTNWORD(1))(u_it) for i in range(nclasses)]

    print('ATTN Shape', attn_final_word[0].shape)
    attn_final_word = [Multiply()([si_vects, attn_final_word[i]]) for i in range(nclasses)]#Multiply()([h_it,a_it])
    print('Multi word Shape', attn_final_word[0].shape)
    attn_final_word = [Reshape((sent_length, 100,word_length))(attn_final_word[i]) for i in range(nclasses)]
    print ('Shape of the att1 is {}'.format(attn_final_word[0].shape))
    attn_final_word = [Lambda(lambda x: K.sum(x, axis=3))(attn_final_word[i]) for i in range(nclasses)]
    print ('Shape of the lambda word is {}'.format(attn_final_word[0].shape))
    for i in range(nclasses):
        x = Bidirectional(GRU(50,return_sequences=True, dropout=0.1))(attn_final_word[i])
        #x = BatchNormalization()(x)

        print('Shape after BD LSTM',x.shape)

        u_it = TimeDistributed(Dense(100, activation='tanh'))(x)
        #u_it = BatchNormalization()(u_it)
        print('Shape after word vector',u_it.shape)

        attn_final_sent = ATTNWORD(1)(u_it)
        print ('Shape of the sent att is {}'.format(attn_final_sent.shape))

        attn_final_sent = Multiply()([x, attn_final_sent])
        print ('Shape of the multi sent att is {}'.format(attn_final_sent.shape))
        attn_final_sent = Reshape((100,sent_length))(attn_final_sent)
        attn_final_sent = Lambda(lambda x: K.sum(x, axis=2))(attn_final_sent)
        print ('Shape of the lambda sent att is {}'.format(attn_final_sent.shape))
        #p = Dense(100, activation='relu')(attn_final_sent)
        #p = BatchNormalization()(p)
        p = Dense(1, activation='sigmoid')(attn_final_sent)
        preds.append(p)
    model = Model(input, preds)

    return model


In [22]:
def get_sentence_attention_combined_output(word_model , word_length, sent_length, n_classes):
    #x = Permute((2,1))(si_vects)
    nclasses = n_classes
    input = Input(shape=(sent_length, word_length ), dtype='int32')
    print(' input to sentence attn network',word_model)
    attentions_pred = []
    #print(output.summary())
    si_vects = TimeDistributed(word_model)(input)
    print('Shape after si_vects', si_vects.shape)
    u_it = TimeDistributed(TimeDistributed(Dense(100, activation='tanh')))(si_vects)
    print('Shape after word vector',u_it.shape)
    #h_it = TimeDistributed(Reshape((100,word_length)))(si_vects)
    #print('Shape after reshape word vector',h_it.shape)

    attn_final_word = [TimeDistributed(ATTNWORD(1))(u_it) for i in range(nclasses)]
    #a_it = Reshape(( word_length, 1))(a_it)
    #h_it = Reshape((word_length, 512))(h_it)
    print('ATTN Shape', attn_final_word[0].shape)
    attn_final_word = [Multiply()([si_vects, attn_final_word[i]]) for i in range(nclasses)]#Multiply()([h_it,a_it])
    print('Multi word Shape', attn_final_word[0].shape)
    attn_final_word = [Reshape((sent_length, 100,word_length))(attn_final_word[i]) for i in range(nclasses)]
    print ('Shape of the att1 is {}'.format(attn_final_word[0].shape))
    attn_final_word = [Lambda(lambda x: K.sum(x, axis=3))(attn_final_word[i]) for i in range(nclasses)]
    print ('Shape of the lambda word is {}'.format(attn_final_word[0].shape))
    attn_sents_for_all_classes = []
    for i in range(nclasses):
        x = Bidirectional(GRU(50,return_sequences=True))(attn_final_word[i])
        #x = Bidirectional(LSTM(256,return_sequences=True))(x)
        print('Shape after BD LSTM',x.shape)
        #x1 = Permute((2,1))(x)
        #print('Shape after permute',x1.shape)
        u_it = TimeDistributed(Dense(100, activation='tanh'))(x)
        print('Shape after word vector',u_it.shape)
        #h_it = Reshape((100,sent_length))(x)
        attn_final_sent = ATTNWORD(1)(u_it)
        print ('Shape of the sent att is {}'.format(attn_final_sent.shape))
        #attentions_pred.append(attn_final)
        attn_final_sent = Multiply()([x, attn_final_sent])
        print ('Shape of the multi sent att is {}'.format(attn_final_sent.shape))
        attn_final_sent = Reshape((100,sent_length))(attn_final_sent)
        attn_final_sent = Lambda(lambda x: K.sum(x, axis=2))(attn_final_sent)
        print ('Shape of the lambda sent att is {}'.format(attn_final_sent.shape))
        attn_sents_for_all_classes.append(attn_final_sent)
    x = Concatenate()(attn_sents_for_all_classes)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    #x = Dense(128, activation='relu')(x)
    #x = Dropout(0.2)(x)
    #x = Dense(64, activation='relu')(x)
    #x = Dropout(0.2)(x)
    x = Dense(64, activation='relu')(x)
    preds = Dense(nclasses, activation='sigmoid')(x)
    
    model = Model(input, preds)

    return model


In [23]:
"""
This returns LSTM based model. There are 6 output classes, all soft sharing the parameters of a common network.
"""
def get_model_soft_sharing_lstm_singleoutput(emb_matrix, sentence_length, word_length, learning_rate=0.001, n_classes=1, decay=0.1, combined_model=False):
    
    rmsprop = optimizers.Adam(lr=learning_rate, clipnorm=0.1, clipvalue=0.05,decay=0.1)# 
    word_model = get_word_attention(emb_matrix, word_length, rmsprop, n_classes)
    if not combined_model:
        model = get_sentence_attention(word_model, word_length, sentence_length, n_classes)
    else:
        model = get_sentence_attention_combined_output(word_model, word_length, sentence_length, n_classes)
    #model = attention_words_only(emb_matrix, word_length, 1)#sent_model
    #model.add(Activation('softmax'))
    #adam = optimizers.Adam(clipnorm=0.1,lr=learning_rate, clipvalue=0.05, decay=0.1)
    model.compile(loss='binary_crossentropy', optimizer=rmsprop,metrics=['accuracy'])
    #model.compile(loss='mse', optimizer=adam,metrics=['accuracy'])

    print (model.summary())

    return model

In [24]:
# Callbacks are passed to the model fit the `callbacks` argument in `fit`,
# which takes a list of callbacks. You can pass any number of callbacks.
callbacks_list = [
    # This callback will interrupt training when we have stopped improving
    keras.callbacks.EarlyStopping(
        # This callback will monitor the validation accuracy of the model
        monitor='val_loss',
        # Training will be interrupted when the accuracy
        # has stopped improving for *more* than 1 epochs (i.e. 2 epochs)
        patience=10,
    ),
    # This callback will save the current weights after every epoch
    keras.callbacks.ModelCheckpoint(
        filepath='/Users/mayoor/dev/kaggle/tc/models/resnet_best_model_han_split.h5',  # Path to the destination model file
        # The two arguments below mean that we will not overwrite the
        # model file unless `val_loss` has improved, which
        # allows us to keep the best model every seen during training.
        monitor='val_loss',
        save_best_only=True,
    ),
    
       keras.callbacks.ReduceLROnPlateau(
           # This callback will monitor the validation loss of the model
           monitor='val_loss',
           # It will divide the learning by 10 when it gets triggered
           factor=0.1,
           # It will get triggered after the validation loss has stopped improving
           # for at least 10 epochs
           patience=3,
) ,

    keras.callbacks.TensorBoard(
        # Log files will be written at this location
        log_dir='/Users/mayoor/dev/kaggle/tc/logs',
        # We will record activation histograms every 1 epoch
        histogram_freq=1
        
) 


]


In [25]:
train.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_classes
119879,811ed72c51830f42,REDIRECT Talk:John Loveday (experimental physi...,0,0,0,0,0,0,0
103694,2acc7c7d0386401f,Back it up. Post the line here with the refere...,0,0,0,0,0,0,0
131932,c1f95b89050a9ee4,I won't stop that. Sometimes Germanic equals G...,1,0,0,0,0,0,1
146867,32e8bdecfe1d66f0,"""\n\n British Bands? \n\nI think you've mista...",0,0,0,0,0,0,0
121958,8c6c5e4228fb6ba8,You are WRONG. \n\nJustin Thompson is mentione...,0,0,0,0,0,0,0


#### Use the X_train_* to create XTrains and YTrains.

In [26]:
final_emb_matrix.shape

(233222, 100)

In [27]:
model = get_model_soft_sharing_lstm_singleoutput(final_emb_matrix, 30, 10, learning_rate=0.01, n_classes=6, decay=0.1, combined_model=False)


embedding:  (?, 10, 100)
Shape after BD LSTM (?, ?, 100)
 input to sentence attn network <keras.engine.training.Model object at 0x122f90320>
Shape after si_vects (?, 30, 10, 100)
Shape after word vector (?, 30, 10, 100)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
The input shape is: (None, 10, 100)
kernel shape (100, 1)
Input shape (?, 10, 100)
Norm shape (?, 1, 10)
ATTN Shape (?, 30, 10, 1)
Multi word Shape (?, 30, 10, 100)
Shape of the att1 is (?, 30, 100, 10)
Shape of the lambda word is (?, 30, 100)
Shape after

In [28]:
plot_model(model,to_file='attn_model_multi_rework_sent_allclasses_bn.png')

In [29]:
tu.sentence_tokenizer("This is a great day!", 25)

['This is a great day !']

In [30]:
lookup_words = tu.get_word_reverse_lookup(tokenizer)

In [31]:
comment_list = train.astype(str)[comment_col].tolist()
xtrain = tu.pad_sentences_sent(comment_list,30,10, tokenizer)

In [32]:
#ct = tu.sent_counter(comment_list)

In [33]:
#pd.Series(ct).describe(percentiles=[.10,.20,.30,.40,.50,.60,.70,.8,.90,1])

In [34]:
#plt.hist(pd.Series(ct))
tokenizer.word_index['unk']
tokenizer.texts_to_sequences([tu.replace_unknown_words_with_UNK("I am hosbnahf? where everyone is considered asdkfjsla", tokenizer)
])

[[65247, 86, 65247, 28, 139, 518, 11, 411, 65247]]

In [35]:
print(xtrain[0][:])

[[    0     0     0     0  5284    16    16    16    16    16]
 [    0     0     0    18   190    11  5284     8 47785    16]
 [15195 32761    11    48    10 13330   399    71   375     3]
 [ 7029     8  2370   106    45   312   236     7    55     2]
 [    0    35  1504    22    18  1250     7    10   307     1]
 [   90     3   387    90     3    27    41    29   830    49]
 [ 7076 75701     8    26   121    42 13465  2129     8 20245]
 [    0     0     0     0     0     0 11530    28    28    28]
 [   31    55     7     9    26     4  4915     2  1112     8]
 [10688     7     2   818     7   230   725    64    82     9]
 [  134     4   227  2863     8   375    49  7407    28    28]
 [   27    55     7     9   471 75702 10127  1251    15 11374]
 [41466    32  6413 10883     3  1659     7     2   343     7]
 [    0     0     0     0     0     0  8132     8  1112    28]
 [    0     0     0     0     0     0     0     0 12745     1]
 [   18 15195 32761 33436  1885    39   219    24  2934

In [36]:
test_comment_list = test.astype(str)[comment_col].tolist()
xval = tu.pad_sentences_sent(test_comment_list,30,10, tokenizer)
print (xval.shape)

(15958, 30, 10)


In [37]:
def get_label_stat(y):
    #y = y.tolist()
    total_count = pd.Series(y).count()
    y1 = (pd.Series(y).sum()/total_count)*100
    y0 = 100-y1
    return total_count, y1, y0

In [38]:
print (type(xtrain), type(xval), type(YTrain), type(YVal))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'list'> <class 'list'>


In [39]:
total_count_train, y1, y0 = get_label_stat(YTrain[0])
print ('Training State - Total Records: {0}, Toxic percent: {1}, Normal percent: {2}'.format(total_count_train, y1, y0))
total_count_val, y1, y0 = get_label_stat(YVal[0])
print ('Validation State - Total Records: {0}, Toxic percent: {1}, Normal percent: {2}'.format(total_count_val, y1, y0))

Training State - Total Records: 143613, Toxic percent: 9.618906366415295, Normal percent: 90.3810936335847
Validation State - Total Records: 15958, Toxic percent: 9.274345156034592, Normal percent: 90.7256548439654


In [40]:
model.fit(xtrain,YTrain ,batch_size=64, epochs=5, verbose=1, validation_data=(xval, YVal), shuffle=True, callbacks=callbacks_list)#, callbacks=callbacks_list

Train on 143613 samples, validate on 15958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [89]:
#model.load_weights('/Users/mayoor/dev/kaggle/tc/models/resnet_best_model_han_split.h5')
xt =  tu.pad_sentences_sent(df[comment_col].astype(str).tolist(),30,10, tokenizer)


In [90]:
yt = ys(df, pred_cols)


In [91]:
v_score, v_acc = model.evaluate(xt, yt, batch_size=128)
print("\nTest score: {0}, accuracy: {1}".format(v_score, v_acc))


Test score: 0.10400655906169301, accuracy: 0.9608826165196411


## Visualization

In [41]:
print(test[test['total_classes']==2].head())
print(test[test["id"]=="936476a4c9b51863"][comment_col].tolist())
#print(test[test["id"]=="b0c9e9304f37c9d3"]['comment_text'].tolist())
#print( pad_sentences(test[test["id"]=="b0c9e9304f37c9d3"]['comment_text'], 25, 25).shape)
#print(model.predict( pad_sentences(test[test["id"]=="936476a4c9b51863"]['comment_text'], 12, 25)))
print(test[test["id"]=="936476a4c9b51863"])

                      id                                       comment_text  \
25255   42d923c91158e044  no worries \n\nWe will use another account, an...   
12791   21ec8eeb0890003d  hi kannadiga shame on you \n\nYou look like a ...   
138234  e3b8bc8940f2a818  Oaks and Homophobia\nI provided a full citatio...   
201     007f1839ada915e6  Your blatant POV pushing \n\nNeither of you gu...   

        toxic  severe_toxic  obscene  threat  insult  identity_hate  \
25255       1             0        0       0       0              1   
12791       1             0        0       0       1              0   
138234      1             0        1       0       0              0   
201         1             0        1       0       0              0   
55184       1             0        1       0       0              0   

        total_classes  
25255               2  
12791               2  
138234              2  
201                 2  
55184               2  
                     id          

In [42]:
sample_sentence = comment_list[0]
#print (sample_sentence)
imagined_sample = "fuck you.  I will come there. what a mess. you son a of bitch. This is a test "
out_of_sample = df[df["id"]=="936476a4c9b51863"][comment_col].tolist()[0]
#model.predict( tu.pad_sentences_sent([sample_sentence, imagined_sample, out_of_sample], 30,10, tokenizer))
tu.pad_sentences_sent([sample_sentence], 30,10, tokenizer).shape

(1, 43, 10)

In [None]:
nclasses = 6
word_output = [model.layers[i+1].layer.layers[-1] for i in range(nclasses)]
sent_output = [model.layers[-(nclasses*2)+i] for i in range(nclasses)]

In [63]:
def calculate_attention(sentence, sent, words):
    input_sent = tu.get_padded_words([sentence], words)
    print (input_sent.shape)
    model_word_hit_model = Model(model.inputs[0],model.layers[-3].output)
    model_word_hit = model_word_hit_model.predict(input_sent)
    attn_kernel_weight = model.layers[-2].get_weights()[0]
    print (attn_kernel_weight.shape)
    product = np.dot(model_word_hit, attn_kernel_weight)
    product = np.reshape(product, (-1, 1, 300))
    x_norm = np.exp(product)/np.sum(np.exp(product))
    return x_norm


In [42]:
attn_ws = calculate_attention("fuck you.  I will come there. what a mess. you son a of bitch. This is a test ", 1,300)

NameError: name 'calculate_attention' is not defined

In [65]:
print (attn_ws.shape)
print (np.sum(attn_ws))

(1, 1, 300)
1.0000002


In [45]:
model.layers

[<keras.engine.topology.InputLayer at 0x1231183c8>,
 <keras.layers.wrappers.TimeDistributed at 0x123046358>,
 <keras.layers.wrappers.TimeDistributed at 0x122f8a7f0>,
 <keras.layers.wrappers.TimeDistributed at 0x1239b9b38>,
 <keras.layers.wrappers.TimeDistributed at 0x1239b9668>,
 <keras.layers.wrappers.TimeDistributed at 0x11f42b2b0>,
 <keras.layers.wrappers.TimeDistributed at 0x1239a6828>,
 <keras.layers.wrappers.TimeDistributed at 0x123934b38>,
 <keras.layers.wrappers.TimeDistributed at 0x123917b00>,
 <keras.layers.wrappers.TimeDistributed at 0x1238dc4e0>,
 <keras.layers.merge.Multiply at 0x1238b8ac8>,
 <keras.layers.merge.Multiply at 0x123919ac8>,
 <keras.layers.merge.Multiply at 0x1238d7400>,
 <keras.layers.merge.Multiply at 0x1238cf630>,
 <keras.layers.merge.Multiply at 0x1238af160>,
 <keras.layers.merge.Multiply at 0x1238af0f0>,
 <keras.layers.core.Reshape at 0x1238ae0b8>,
 <keras.layers.core.Reshape at 0x1238ae588>,
 <keras.layers.core.Reshape at 0x1238ae320>,
 <keras.layers.cor

In [43]:
#word_attns, sent_attns
def get_sentence_rank(sentence, sent, words, tokenizer, class_label=0):
    #test[test["id"]=="936476a4c9b51863"][comment_col]
    input_sent = tu.pad_sentences_sent([sentence], sent, words, tokenizer) #get_padded_words([sentence], words)
    model_word_attn = Model(model.inputs[0],model.layers[-5].output)#[wa.output for wa in word_output])
    weights = model_word_attn.predict(input_sent)
    print (np.sum(weights))
    sentences_rank = np.argsort(weights,axis=1).flatten()[::-1]
    print (sentences_rank)
    return sentences_rank

In [44]:
get_sentence_rank("fuck you.  I will come there. what a mess. you son a of bitch. This is a test ", 6,50, tokenizer)

0.4231643
[0]


array([0])

In [97]:
def get_rank(sentence, sent, words):
    attn_ws = calculate_attention(sentence, sent, words)
    return np.argsort(attn_ws,axis=2).flatten()[::-1]

In [82]:
def word_rank(sentence, sent_len, word_len):
    input_sent = pad_sentences_sent([sentence], sent_len, word_len)
    model_word_attn = Model(model.inputs[0],model.layers[-11].output)#[wa.output for wa in word_output])
    attention_output = model_word_attn.predict(input_sent)
    output = attention_output.reshape(sent_len,word_len)
    return np.argsort(output,axis=1)


In [84]:
def top_ranked_words_for_sentence(sentence, sent_len, word_len, lookup_words, nwords=-1, nsents=-1):
    sent_rank = get_sentence_rank(sentence, sent_len, word_len)
    processed_sent = tu.readable_pad_sent(sentence, sent_len, word_len, lookup_words)
    word_level_rank = word_rank(sentence, sent_len, word_len)
    print ("word_level_rank>>",word_level_rank.shape)
    if nwords == -1:
        nwords = word_len
    if nsents == -1:
        nsents = sent_len
    
    ranked_data = []
    for i in range(nsents):
        s = processed_sent[sent_rank[i]]
        wrank = word_level_rank[sent_rank[i]]
        wrank = wrank.flatten()[::-1]
        ordered_words = [s[w] for w in wrank]
        ordered_words = ordered_words[:nwords]
        ranked_data.append(ordered_words)
    return ranked_data

In [77]:
def top_ranked_words(sentence, sent_len, word_len, lookup_words, nwords=-1, nsents=-1):
    w_rank = get_rank(sentence, sent_len, word_len)
    print ("$$",w_rank)
    processed_sent = tu.readable_pad_sent(sentence, sent_len, word_len, lookup_words)
    #word_level_rank = word_rank(sentence, sent_len, word_len)
    if nwords == -1:
        nwords = word_len
    
    ranked_data = []
    s = processed_sent[0]
    print (">>", s)

    ordered_words = [s[w] for w in w_rank]
    ordered_words = ordered_words[:nwords]
    ranked_data.append(ordered_words)
    return ranked_data

In [99]:
sample_sentence = comment_list[0]
imagined_sample = "fuck you.  I will come there. what a mess. you son a of bitch. This is a test "
#top_ranked_words(imagined_sample, 1, 300, nwords=50, nsents=1)
top_ranked_words_for_sentence(imagined_sample, 6, 50, nwords=10, nsents=6)

0.99999994
[0 1 2 3 4 5]
[[124, 6, 7, 44, 271, 41, 39, 5, 1651, 6, 1387, 5, 3, 636, 13, 8, 5, 706], [], [], [], [], []]
word_level_rank>> (6, 50)


[['bitch', 'fuck', 'you', 'a', 'of', 'son', 'mess', 'a', 'you', 'what'],
 [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
 [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
 [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
 [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '],
 [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']]

In [103]:
sample_train = df[df["id"]=="936476a4c9b51863"][comment_col].values[0]
print (sample_train)
top_ranked_words_for_sentence(sample_sentence, 6, 50, nwords=50, nsents=3)


1.0
[3 2 1 0 4 5]
[[5367, 13, 173, 8, 5367, 4, 1485, 157, 14888, 32097, 8, 36, 5, 13136, 393, 59, 369, 7098, 4, 2398, 93, 34, 309, 3, 42, 1, 23, 879, 15, 13, 1253, 3, 5, 292, 77, 375, 77, 19, 453, 796], [37, 7140, 5, 13495, 4, 18, 107, 30, 13277, 2061, 4, 18547, 10449, 33, 42, 3, 6, 18, 2, 5048, 1, 1127, 4, 10498, 3, 1, 838, 3, 196, 718, 51, 70, 6, 121, 2, 212, 2790, 4, 369, 37, 7515, 19, 42, 3, 6, 474], [70466, 10281, 1284, 10, 10329, 40251, 21, 6278, 10838, 1668, 3, 1, 338, 3, 7699, 4, 1127, 12603, 13, 14888, 32097, 32748, 1769, 27, 202, 16, 2871, 1626, 4, 798, 12, 141, 1072, 207, 969, 56, 767, 12, 1, 28, 606, 617, 1054, 4, 18304, 62], [4280, 28, 8460, 2, 96, 5, 299, 54, 6, 76, 18, 3850, 25, 33, 6, 51, 813, 21, 4477, 64, 211, 168, 1532, 21, 4, 46053, 1, 838, 3, 196, 1162], [], []]
word_level_rank>> (6, 50)


[['a',
  'get',
  'you',
  'life',
  'freak',
  'people',
  'yourselves',
  'you',
  'dont',
  'jobs',
  'youre',
  'do',
  'to',
  'unions',
  'wikipedia',
  'have',
  'or',
  'just',
  'and',
  'with',
  'us',
  'tormenting',
  'contributors',
  'while',
  'play',
  'here',
  'of',
  'with',
  'playing',
  'rest',
  'the',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' ',
  ' '],
 ['moron',
  'afraid',
  'wikipedia',
  'administrator',
  'of',
  'the',
  'glasses',
  'other',
  'sheesh',
  'and',
  'fun',
  'thought',
  'thick',
  'assorted',
  'of',
  'romance',
  'with',
  'the',
  'longer',
  'for',
  'cant',
  'in',
  'and',
  'any',
  'wait',
  'this',
  'america',
  'club',
  'be',
  'librarian',
  'work',
  'types',
  'wool',
  'sweaters',
  'better',
  'approved',
  'and',
  'for',
  'lauren',
  'quickly',
  'marm',
  'available',
  'page',
  'caitlin',
  ' ',
  ' ',
  ' ',
  ' ',
  'upton',
  'bio'],

In [None]:
sentence_readable = tu.readable_pad_sent(test[test["id"]=="936476a4c9b51863"][comment_col].tolist()[0], 50, 30)
print(sentences_rank)
#[sentence_readable[i] for i in sentences_rank[:10]]
print (sentence_readable)

# Test

In [4]:
test_df = pd.read_csv('test.csv')
test_df = tu.clean_up(test_df)
test_df['comment_text'] = test_df['comment_text'].apply(lambda x: tu.replace_unknown_words_with_UNK(x, tokenizer))
test_comments = test_df.astype(str)['comment_text'].tolist()
xtrain = tu.pad_sentences_sent(test_comments,6,50, tokenizer)
test_df.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then youll ...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources unk ashton on lapland —
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i dont anonymously edit articles at all


In [5]:
tokenizer.word_index['unk']

178415

In [42]:
predictions = model.predict(xtrain)

In [44]:
predicted_df = pd.DataFrame(columns=['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'])
predicted_df['id'] = test_df['id']
for i, k in enumerate(pred_cols):
    predicted_df[k] = predictions[:]
predicted_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.939705,0.939705,0.939705,0.939705,0.939705,0.939705
1,0000247867823ef7,0.001127,0.001127,0.001127,0.001127,0.001127,0.001127
2,00013b17ad220c46,0.002598,0.002598,0.002598,0.002598,0.002598,0.002598
3,00017563c3f7919a,0.004476,0.004476,0.004476,0.004476,0.004476,0.004476
4,00017695ad8997eb,0.00283,0.00283,0.00283,0.00283,0.00283,0.00283


In [45]:
predicted_df.to_csv('first_submission_h_n_consolidated.csv',index=False, header=True)