In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.metrics import AUC
from keras import initializers, regularizers, constraints, optimizers, layers



In [2]:
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comments-dataset-merged/train.csv')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comments-dataset-merged/test.csv')

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [3]:
train, valid = train[train.index <= 100_000], train[train.index > 100_000]

In [4]:
y_tr = train[list_classes].values
# y_te = test[list_classes].values

list_sentences_train = train["comment_text"]
list_sentences_valid = valid["comment_text"]
list_sentences_test = test["comment_text"]

In [5]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)

print('Fitting Tokenizer to Train Data')
tokenizer.fit_on_texts(list(list_sentences_train))

print('Tokenizing Train Data')
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

print('Tokenizing Valid Data')
list_tokenized_valid = tokenizer.texts_to_sequences(list_sentences_valid)

print('Tokenizing Test Data')
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

Fitting Tokenizer to Train Data
Tokenizing Train Data
Tokenizing Valid Data
Tokenizing Test Data


In [6]:
maxlen = 200
X_tr = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_va = pad_sequences(list_tokenized_valid, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

inp = Input(shape=(maxlen, )) #maxlen=200 as defined earlier

In [7]:
embed_size = 128
x = Embedding(max_features, embed_size)(inp)

x = LSTM(60, return_sequences=True, name='lstm_layer')(x)

x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)

print('Compiling Model')
model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', AUC()]
)

Compiling Model


In [8]:
batch_size = 32
epochs = 5

print('Training Model')
model.fit(
    X_tr, y_tr,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    workers=4,
    use_multiprocessing=True,
    verbose=2
)

print('Training Summary')
print(model.summary())

Training Model
Epoch 1/5
2813/2813 - 137s - loss: 0.0822 - accuracy: 0.9266 - auc: 0.9425 - val_loss: 0.0459 - val_accuracy: 0.9934 - val_auc: 0.9795 - 137s/epoch - 49ms/step
Epoch 2/5
2813/2813 - 44s - loss: 0.0464 - accuracy: 0.9812 - auc: 0.9837 - val_loss: 0.0456 - val_accuracy: 0.9934 - val_auc: 0.9785 - 44s/epoch - 16ms/step
Epoch 3/5
2813/2813 - 38s - loss: 0.0406 - accuracy: 0.9713 - auc: 0.9877 - val_loss: 0.0458 - val_accuracy: 0.9932 - val_auc: 0.9769 - 38s/epoch - 14ms/step
Epoch 4/5
2813/2813 - 37s - loss: 0.0352 - accuracy: 0.9329 - auc: 0.9912 - val_loss: 0.0487 - val_accuracy: 0.9880 - val_auc: 0.9705 - 37s/epoch - 13ms/step
Epoch 5/5
2813/2813 - 35s - loss: 0.0303 - accuracy: 0.8470 - auc: 0.9934 - val_loss: 0.0500 - val_accuracy: 0.8400 - val_auc: 0.9673 - 35s/epoch - 12ms/step
Training Summary
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        

In [9]:
# print('Evaluating with Test Data')
# print(model.evaluate(X_te, y_te, return_dict=True))

In [10]:
te_pred = model.predict(X_te)



In [11]:
te_pred

array([[9.95929539e-01, 5.37416697e-01, 9.72115815e-01, 2.27549627e-01,
        8.77856314e-01, 4.25001502e-01],
       [1.88097101e-06, 2.19820315e-10, 8.24192796e-07, 3.09278336e-09,
        1.38408865e-07, 3.62911798e-08],
       [1.10582914e-04, 1.57092401e-07, 3.99636010e-05, 1.18009689e-06,
        1.65751517e-05, 3.49447987e-06],
       ...,
       [8.24565795e-05, 1.10211076e-07, 6.21571598e-05, 4.44894539e-07,
        1.04588162e-05, 5.40685733e-06],
       [2.90773387e-05, 4.23953637e-08, 2.33366391e-05, 2.36770759e-07,
        4.74982699e-06, 3.92561105e-06],
       [9.89236534e-01, 3.49880708e-03, 8.01418245e-01, 1.64160738e-04,
        4.17449862e-01, 1.62571226e-03]], dtype=float32)

In [12]:
pd.merge(left=test.id.to_frame(), right=pd.DataFrame(te_pred, columns=list_classes), left_index=True, right_index=True)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.995930,5.374167e-01,9.721158e-01,2.275496e-01,8.778563e-01,4.250015e-01
1,0000247867823ef7,0.000002,2.198203e-10,8.241928e-07,3.092783e-09,1.384089e-07,3.629118e-08
2,00013b17ad220c46,0.000111,1.570924e-07,3.996360e-05,1.180097e-06,1.657515e-05,3.494480e-06
3,00017563c3f7919a,0.000215,2.164898e-07,8.917408e-05,9.215779e-07,3.186794e-05,7.655279e-06
4,00017695ad8997eb,0.000259,4.266948e-07,1.115443e-04,2.330152e-06,3.532073e-05,8.852947e-06
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.318983,7.025733e-04,2.277705e-01,1.797692e-04,2.721924e-02,9.377764e-04
153160,fffd7a9a6eb32c16,0.000283,3.892733e-07,1.424028e-04,1.930817e-06,3.196141e-05,2.020969e-05
153161,fffda9e8d6fafa9e,0.000082,1.102111e-07,6.215716e-05,4.448945e-07,1.045882e-05,5.406857e-06
153162,fffe8f1340a79fc2,0.000029,4.239536e-08,2.333664e-05,2.367708e-07,4.749827e-06,3.925611e-06


In [13]:
pd.merge(left=test.id.to_frame(), right=pd.DataFrame(te_pred, columns=list_classes), left_index=True, right_index=True).to_csv('submissions.csv', index=False)

In [14]:
va_pred = model.predict(X_va)



In [15]:
pd.merge(left=valid.reset_index().id.to_frame(), right=pd.DataFrame(va_pred, columns=list_classes), left_index=True, right_index=True).to_csv('submissions.csv', index=False)