In [1]:
# import necessary libraries

import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt


import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalAveragePooling1D,GlobalMaxPool1D
# TODO: compare GlobalAveragePooling1D vs. GlobalMaxPool1D (the model below uses GlobalMaxPool1D)
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers

import sys
# sys.setrecursionlimit(100000)

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Read the training and test splits from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Label columns of `train` that are used as the prediction targets.
prediction_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y = train.loc[:, prediction_classes]

# Raw comment text for each split.
train_sentences = train["comment_text"]
test_sentences = test["comment_text"]

In [4]:
# Build the tokenizer that splits, preprocesses and integer-encodes the text.
# The vocabulary is fitted on the training comments; only the test comments
# are converted to integer sequences in this cell.

max_features = 20000  # passed to Tokenizer as num_words
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_sentences.tolist())
list_tokenizer_test = tokenizer.texts_to_sequences(test_sentences)

In [5]:
# Bring every encoded test comment to the same fixed length (`maxlen`)
# so the sequences can be fed to the model as one array.
maxlen = 200
X_test = pad_sequences(sequences=list_tokenizer_test, maxlen=maxlen)

In [6]:
# Model building: embedding -> LSTM -> global max pooling -> dense head.

# Input layer: one padded sequence of `maxlen` token ids per comment.
# The shape is tied to `maxlen` (rather than a literal 200) so it cannot drift
# from the length used by pad_sequences.
inp = Input(shape=(maxlen, ))

# Embedding layer: maps each of the `max_features` token ids to a dense vector.
embed_size = 128
x = Embedding(input_dim=max_features, output_dim=embed_size)(inp)

# return_sequences=True keeps the per-timestep outputs so they can be pooled.
x = LSTM(60, return_sequences=True, name='lstm_layer1')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)

# One sigmoid unit per entry of `prediction_classes` (six in this notebook),
# so the output width always matches the label columns in `y`.
x = Dense(len(prediction_classes), activation='sigmoid')(x)


In [7]:
# Wrap the layer graph in a Model and configure it for training.
model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 10 epochs, holding out 10% of the rows for validation.
batch_size = 128
epochs = 10

# No earlier cell defines X_train, so encode and pad the training comments here
# with the same tokenizer and maxlen that are used for the test split.
# Building it in this cell also keeps the cost out of a prediction-only run.
list_tokenizer_train = tokenizer.texts_to_sequences(train_sentences)
X_train = pad_sequences(list_tokenizer_train, maxlen=maxlen)

model.fit(X_train, y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

In [8]:
# Restore the model's weights from the HDF5 file in the working directory.
model.load_weights(filepath='keras_model_weights.h5')

In [14]:
def load_model(model_filename):
    """Load a saved Keras model from ``model_filename`` and return it.

    Args:
        model_filename: Path of the saved model file, passed straight to
            ``keras.models.load_model``.

    Returns:
        Whatever ``keras.models.load_model`` returns for that file.
    """
    # This def rebinds the notebook-level name `load_model`, so a bare
    # `load_model(...)` call in here would call this function again and
    # recurse forever. Go through the qualified Keras name instead.
    return keras.models.load_model(model_filename)
      

# Score the padded test comments with the current `model`; the result is
# kept in `output` for the cells that follow.
print('Trained model loaded')
output = model.predict(x=X_test)
print('Predictions calculated')




Trained model loaded
Predictions calculated


In [16]:
def make_submission_csv(output, prediction_classes, test_df=None, filename='submission.csv'):
    """Write the predictions next to their ids as a submission CSV.

    Args:
        output: 2-D predictions, one row per row of ``test_df`` and one column
            per entry of ``prediction_classes``.
        prediction_classes: Column names for ``output``, in column order.
        test_df: DataFrame with an ``id`` column. Defaults to the
            notebook-level ``test`` frame.
        filename: Path of the CSV to write. Defaults to ``'submission.csv'``.

    Raises:
        ValueError: If ``output`` and ``test_df`` have different row counts.
    """
    if test_df is None:
        test_df = test  # fall back to the notebook-level test frame

    if len(output) != len(test_df):
        raise ValueError(
            'output has %d rows but the test frame has %d'
            % (len(output), len(test_df))
        )

    # Reuse the test frame's index so rows are paired by position even when
    # that index is not the default 0..n-1 range; an index-based join would
    # otherwise misalign rows or fill them with NaN.
    output_df = pd.DataFrame(output, columns=prediction_classes, index=test_df.index)
    output_df_upload = pd.concat([test_df[['id']], output_df], axis=1)

    print('Writing predictions to csv file')
    output_df_upload.to_csv(filename, index=False)
    print('File written and ready to be uploaded')

# Write the test-set predictions to the submission file.
make_submission_csv(output=output, prediction_classes=prediction_classes)

Writing predictions to csv file
File written and ready to be uploaded


In [17]:
# Preview the first five prediction rows (one column per prediction class).
output[:5]

array([[9.96652067e-01, 3.80871773e-01, 9.70974624e-01, 5.59741519e-02,
        8.96210015e-01, 2.01338068e-01],
       [1.09159970e-03, 6.76032232e-06, 1.29854627e-04, 5.41037698e-05,
        1.92135878e-04, 7.55947985e-05],
       [2.13384815e-03, 1.40314369e-05, 2.15546432e-04, 1.02011996e-04,
        3.83184932e-04, 1.43021694e-04],
       [7.11207977e-04, 1.40598831e-06, 8.71279844e-05, 1.38668702e-05,
        9.54512288e-05, 4.05575847e-05],
       [4.29602107e-03, 4.61237869e-05, 4.51119646e-04, 2.94028636e-04,
        8.17678403e-04, 3.34060343e-04]], dtype=float32)