In [1]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Embedding,Input,Dense,LSTM,Bidirectional,GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Pre-trained word vectors: file location and the vector width used for the
# embedding matrix and the Embedding layer.
path = 'data/glove.6B.100d.txt'
embedding_size = 100

# Text preprocessing: tokenizer vocabulary cap and padded sequence length.
max_vocab_size = 20000
max_sequence_length = 100

# Training settings passed to model.fit.
batch_size = 128
epochs = 5
validation_split = 0.2

# Destination passed to model.save once training has finished.
filepath_model = "data/model_weights.hdf5"

In [3]:
def process_embedding_file(path):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.418 0.24968 ...``. Blank lines are skipped.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``float32`` numpy array built from the
        remaining fields on its line.
    """
    word2vec = dict()
    with open(path, encoding='utf-8') as f:
        for line in f:
            out = line.split()
            # A blank line (e.g. a stray newline at the end of the file) has no
            # token; indexing out[0] on it would raise IndexError.
            if not out:
                continue
            word = out[0]
            word2vec[word] = np.asarray(out[1:], dtype='float32')
    return word2vec
            
# Read the embeddings file at `path` into a dict mapping word -> vector.
word2vec = process_embedding_file(path)

In [4]:
# Report how many distinct words were loaded into word2vec.
print('length of word embeddings : {}'.format(len(word2vec)))

length of word embeddings : 400000


In [5]:
# Target columns of the training CSV, one per label.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Read the training data and discard every row that contains a null value.
df_train = pd.read_csv('data/train.csv').dropna()
# Comment texts and the matching rows of label values, as numpy arrays.
sentences = df_train['comment_text'].values
target = df_train[labels].values

In [6]:
# Fit a tokenizer (num_words = max_vocab_size) on the training comments.
t = Tokenizer(num_words=max_vocab_size)
t.fit_on_texts(sentences)

# Turn each comment into a list of word indices, then run pad_sequences with
# maxlen = max_sequence_length so every row has the same length.
sequences = t.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Full word -> index mapping learned by the tokenizer.
word2idx = t.word_index

# Persist the fitted tokenizer for later use.
with open('model/tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(t, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

print('Number of unique words : {}'.format(len(word2idx)))

Number of unique words : 210337


In [7]:
# create weight matrix for words in training sentences
# Row `index` receives the vector that word2vec holds for the word with that
# tokenizer index; rows with no matching word or no known vector stay all-zero.
# NOTE: this rebinds the notebook-level max_vocab_size; because of min(),
# re-running the cell yields the same value.
max_vocab_size = min(max_vocab_size, len(word2idx) + 1)
embedding_matrix = np.zeros((max_vocab_size, embedding_size))
for word, index in word2idx.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if index < max_vocab_size:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
#saving embedding matrix for later use
with open('model/emb_weights.pickle', 'wb') as handle:
    # Dump the weight matrix itself (the previous code pickled the tokenizer
    # `t` here, so the file never contained the embedding weights).
    pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Embedding layer initialised from embedding_matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = Embedding(
    input_dim=max_vocab_size,
    output_dim=embedding_size,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)

In [9]:
#building model
def build_model():
    """Assemble the (uncompiled) classifier network.

    Layers, in order: the notebook-level ``embedding_layer``, a
    ``Bidirectional`` wrapper around ``LSTM(15, return_sequences=True)``,
    ``GlobalMaxPooling1D``, and a sigmoid ``Dense`` layer with one unit per
    entry in ``labels``.

    Returns
    -------
    Model
        Keras model whose input has shape ``(max_sequence_length,)``.
    """
    tokens_in = Input(shape=(max_sequence_length,))
    embedded = embedding_layer(tokens_in)
    encoded = Bidirectional(LSTM(15, return_sequences=True))(embedded)
    pooled = GlobalMaxPooling1D()(encoded)
    scores = Dense(len(labels), activation='sigmoid')(pooled)
    return Model(tokens_in, scores)

# Instantiate the network and configure its loss, optimizer and metric.
model = build_model()
model.compile(
    optimizer=Adam(lr=.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
#training model
# Fit on the padded sequences against the label matrix; validation_split is
# forwarded to model.fit together with the batch size and epoch count.
history = model.fit(
    x=padded_sequences,
    y=target,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
)
# Write the trained model to filepath_model.
model.save(filepath_model)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
