# LSTM model

Long Short Term Memory is implemented using keras.

In [15]:
# importing libraries

import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from numpy import array
from numpy import asarray
from numpy import zeros

from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.layers import Input
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [3]:
# reading preprocessed train data

train = pd.read_csv('train_preprocessed.csv')

### Tokenizing the text data

Machine learning algorithms cannot understand raw text, so we convert the text into tokens of numbers or vectors for the machine to work upon. Keras implemented tokenizer is used here for the same with max_words = 5000 and maxlen for padding the sequence = 200.

In [11]:
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(train['comment_text'].values.astype('U'))

vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(train['comment_text'].values.astype('U'))

# max length for each comment = 200
x = pad_sequences(sequences, maxlen=200)
x

array([[   0,    0,    0, ...,    0,  791, 1461],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [ 301, 2687,   15, ...,   15,  301, 2687],
       [4845,   39,  130, ...,   39,  130, 1950],
       [   8,  562, 1462, ...,   99,    6,  584]])

In [6]:
# Converting label values to array

Y=[]
Y=pd.DataFrame(Y)

Y['toxic']=train['toxic']
Y['severe_toxic']=train['severe_toxic']
Y['obscene']=train['obscene']
Y['threat']=train['threat']
Y['insult']=train['insult']
Y['identity_hate']=train['identity_hate']

y=Y.values
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 1, 1],
       [1, 0, 0, 0, 1, 0],
       [1, 1, 0, 1, 0, 0]], dtype=int64)

In [8]:
# splitting data with test_size=0.2

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

### Embedding layer using GloVe

In [12]:
# Creating an embedding matrix 

embeddings_dictionary = dict()

glove_file = open('glove.6B.200d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 200))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

### Defining the model

In [17]:
#input layer
deep_inputs = Input(shape=(200,))  

# embedding layer
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], trainable=False)(deep_inputs)

# LSTM layer 
LSTM_Layer = LSTM(128)(embedding_layer)

# number of classes (labels)
n_classes = 6

# Dense output layer with activation function 
dense_layer = Dense(n_classes, activation='sigmoid')(LSTM_Layer)

# adding inputs and outputs to Model
model = Model(inputs=deep_inputs, outputs=dense_layer)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# defining callbacks
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4)
]

model.summary

<bound method Network.summary of <keras.engine.training.Model object at 0x0000018DC0F9EDD8>>

In [26]:
# fitting data on model

history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 114890 samples, validate on 12766 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# evaluating on test data

metrics = model.evaluate(x_test, y_test)

print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.06696725681000427
acc: 0.97788405418396


Accuracy of the model is found to be 97.788 %.

### Predicting on final test data

In [22]:
test = pd.read_csv('test_preprocessed.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text
0,0,00001cee341fdb12,yo btch ja rule ucceful hall ever what hatng a...
1,1,0000247867823ef7,from rfc ttle fne mo
2,2,00013b17ad220c46,ource zawe ahton lapland
3,3,00017563c3f7919a,f look back ource nformaton updated correct f...
4,4,00017695ad8997eb,anonymouly edt artcle


In [23]:
# converting into tokens using the already fitted tokenizer

sequences = tokenizer.texts_to_sequences(test['comment_text'].values.astype('U'))
test = pad_sequences(sequences, maxlen=200)
test

array([[   0,    0,    0, ...,   67,  406,   23],
       [   0,    0,    0, ...,  227,  585, 1138],
       [   0,    0,    0, ...,    0,    0,   18],
       ...,
       [   0,    0,    0, ...,   71,  376, 1481],
       [   0,    0,    0, ...,   95,   32, 3030],
       [   0,    0,    0, ...,   42, 4407,    6]])

In [25]:
# making predictions

predictions = model.predict(test, batch_size = 32)
predictions

array([[7.8824431e-01, 2.4684701e-02, 2.7665856e-01, 8.8207629e-03,
        3.5306579e-01, 1.2820670e-02],
       [1.0223834e-02, 7.2372604e-05, 2.1600914e-03, 6.0475661e-05,
        1.6437232e-03, 4.4416077e-04],
       [1.0051875e-01, 8.6456407e-03, 4.7148466e-02, 3.4774127e-03,
        3.6964547e-02, 1.1818649e-02],
       ...,
       [3.2930311e-03, 9.3068711e-06, 8.9360651e-04, 1.2402681e-05,
        3.5051693e-04, 1.0660733e-04],
       [1.0797491e-02, 6.0468625e-05, 2.6358413e-03, 2.9793109e-05,
        1.4728478e-03, 1.1371445e-03],
       [7.5652689e-01, 8.5971421e-03, 1.8018711e-01, 5.7314313e-03,
        3.2273018e-01, 8.7631727e-03]], dtype=float32)

In [33]:
# submitting on the file

submissions = pd.read_csv('sample_submission.csv')
submissions.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [35]:
submissions['toxic'] = predictions[:,0]
submissions['severe_toxic'] = predictions[:,1]
submissions['obscene'] = predictions[:,2]
submissions['threat'] = predictions[:,3]
submissions['insult'] = predictions[:,4]
submissions['identity_hate'] = predictions[:,5]

submissions.to_csv('submission_lstm.csv')
submissions.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.788244,0.024685,0.276659,0.008821,0.353066,0.012821
1,0000247867823ef7,0.010224,7.2e-05,0.00216,6e-05,0.001644,0.000444
2,00013b17ad220c46,0.100519,0.008646,0.047148,0.003477,0.036965,0.011819
3,00017563c3f7919a,0.000312,3e-06,0.000165,1.3e-05,6.3e-05,2.2e-05
4,00017695ad8997eb,0.073786,0.002617,0.034611,0.003516,0.03605,0.001155
