In [12]:
import numpy as np 
import pandas as pd

In [13]:
# Directory (mounted Google Drive) that holds the embeddings and the CSV data.
# Change this single value to run the notebook against another location.
DATA_DIR = '/content/drive/MyDrive'

# Pretrained GloVe vectors and the train/test CSV files, all under DATA_DIR.
EMBEDDING_FILE = f'{DATA_DIR}/glove.6B.50d.txt'
TRAIN_DATA_FILE = f'{DATA_DIR}/train.csv'
TEST_DATA_FILE = f'{DATA_DIR}/test.csv'

In [14]:
import sys 
import os
import re
import csv
import codecs
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [15]:
# Width of each word vector; must equal the dimensionality of the vectors
# stored in EMBEDDING_FILE (the "50d" GloVe file).
embed_size = 50

# Vocabulary cap handed to the tokenizer and used to size the embedding layer.
max_features = 20_000

# Number of tokens every comment is padded or truncated to.
maxlen = 100

In [16]:
# Read the training and test tables from disk.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Target columns; their order here fixes the column order of the label matrix y
# (and therefore of the model outputs).
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y = train[list_classes].values

# Raw comment strings as NumPy arrays; missing comments become the "_na_" token.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [17]:
# Build the vocabulary from the training comments only; num_words caps how many
# distinct words survive when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Map every comment (train, then test) to its list of integer word indices.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Bring every sequence to exactly `maxlen` entries so they form 2-D arrays.
X_t, X_te = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [18]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into its word and its vector.

    Args:
        word: The first field of the line (the token itself).
        *arr: The remaining fields, i.e. the vector components (typically
            strings as read from the file).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into a {word: float32 vector} dict, one line per word.
# The context manager closes the file handle as soon as the dict is built, and the
# explicit encoding keeps decoding of the UTF-8 GloVe file independent of the
# platform's default locale.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in embedding_file)

In [19]:
# Stack every pretrained vector into one 2-D array to measure the overall
# mean and standard deviation of the embedding values.
# np.stack requires a real sequence: passing the dict view directly is deprecated
# in NumPy (FutureWarning) and rejected by newer releases, so materialise a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Display the two statistics as the cell output.
emb_mean, emb_std

  if self.run_code(code, result):


(0.020940498, 0.6441043)

In [20]:

# Build the initial weights for the embedding layer: one row per word index.
word_index = tokenizer.word_index

# The tokenizer numbers words starting at 1, so a vocabulary of N words needs
# N + 1 rows. Without the "+ 1" the last word would index past the end of the
# matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with the same mean/std as the pretrained ones,
# so words missing from the embedding file still get values on a comparable scale.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    # Indices at or beyond nb_words have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Overwrite the random row only when a pretrained vector exists for the word.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [21]:
# Multi-label text classifier: pretrained embeddings -> bidirectional LSTM ->
# max-pool over time -> small dense head with one sigmoid output per label.
inp = Input(shape=(maxlen,))

# Take the vocabulary size from the weight matrix itself so the layer's shape can
# never disagree with the weights it is initialised from.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)

# return_sequences=True keeps the per-timestep outputs that the pooling layer reduces.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# One independent sigmoid per target column, matching the width of the label matrix y.
x = Dense(len(list_classes), activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
# Train for three epochs, holding out 10% of the training rows for validation.
# The trailing semicolon suppresses the returned History object in the cell output.
model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
);

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
# Predict the per-label probabilities for the padded test comments.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# Fill the submission template's label columns with the predictions and save it.
# NOTE(review): this assumes the template rows are in the same order as test.csv.
submission_template_path = '/content/drive/MyDrive/sample_submission.csv'
sample_submission = pd.read_csv(submission_template_path)
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)



In [24]:
# Preview the first rows of the filled-in submission as a quick sanity check.
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.995521,0.2374895,0.95473,0.0226725,0.845133,0.150563
1,0000247867823ef7,6e-05,6.589282e-08,2.2e-05,1.896541e-08,1e-05,6e-06
2,00013b17ad220c46,0.000428,3.173323e-06,0.000386,1.601002e-06,0.000138,8e-05
3,00017563c3f7919a,0.00095,1.812046e-06,0.000296,1.547683e-06,0.000197,7.1e-05
4,00017695ad8997eb,0.000992,2.312277e-06,0.000355,3.535937e-06,0.000169,4.9e-05
