## word level model

In [152]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, concatenate, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import Callback
import matplotlib.pyplot as plt
%matplotlib inline
from gensim.models import Word2Vec
import gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## LSTM model

In [157]:
# Read the training and test CSVs from the local data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
# embed_size=300

In [22]:
# Label columns read from train.csv; the model below emits one output per entry.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Raw comment text for both splits, plus the label matrix for the training rows.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]
y = train[list_classes].values

In [23]:
max_features = 20000  # passed to the tokenizer as num_words and to Embedding as its input size

# Fit the vocabulary on the training comments only, then encode both splits with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [24]:
maxlen = 200  # sequence length handed to pad_sequences and to the model's Input shape

# Bring every tokenized comment to the same length for both splits.
X_t, X_te = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [28]:
# Bidirectional-LSTM classifier: embedding -> BiLSTM -> max-pool over time -> dense head.
embed_size = 300
inp = Input(shape=(maxlen,))  # maxlen=200 as defined earlier

x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(
    LSTM(60, return_sequences=True, name='lstm_layer', dropout=0.1, recurrent_dropout=0.1)
)(x)
x = GlobalMaxPool1D()(x)

# Dense head with light dropout on either side of the hidden layer.
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# Six independent sigmoid outputs, trained with binary cross-entropy.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [34]:
batch_size, epochs = 32, 2

# Keras holds out 10% of the supplied samples for validation (validation_split=0.1).
hist = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [35]:
# Predict on the training inputs themselves; the bare expression displays the result.
y_pred = model.predict(X_t, verbose=1, batch_size=batch_size)
y_pred



array([[  2.72767415e-04,   1.27316889e-06,   4.43745776e-05,
          5.07856328e-07,   8.81044707e-06,   4.73976343e-06],
       [  5.58910193e-04,   2.18725359e-06,   7.73255306e-05,
          1.34787729e-06,   2.09507834e-05,   7.84819258e-06],
       [  4.61251801e-03,   6.44041347e-06,   4.83808020e-04,
          8.49240678e-06,   1.75300054e-04,   3.63578511e-05],
       ..., 
       [  3.14858393e-03,   9.14110387e-06,   5.46244206e-04,
          5.61349680e-06,   1.65505000e-04,   3.78799632e-05],
       [  2.82997335e-03,   2.79880032e-06,   2.57812906e-04,
          4.02483238e-06,   9.39967140e-05,   1.55896505e-05],
       [  1.47885131e-02,   1.71547072e-05,   1.23190391e-03,
          5.45882831e-05,   7.36148912e-04,   1.36105242e-04]], dtype=float32)

In [38]:
# ROC-AUC of the predictions made on X_t, i.e. a training-set score.
roc_auc_score(y_true=y, y_score=y_pred)

0.98933042470553367

In [39]:
# Predictions for the padded test comments; written to the submission file in the next cell.
y_submit = model.predict(X_te, verbose=1, batch_size=batch_size)



In [40]:
# Replace any NaN predictions with 0 in place.
np.putmask(y_submit, np.isnan(y_submit), 0)

# Start from the sample submission file and overwrite its six label columns.
word_LSTM_submission = pd.read_csv('data/sample_submission.csv')
word_LSTM_submission[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
] = y_submit
word_LSTM_submission.to_csv('word_LSTM_submission.csv', index=False)

## GRU model

In [175]:
# Load the pickled DataFrame used by the GRU section. Later cells read its
# 'cleaned_text' column and the six label columns, and treat the first
# train.shape[0] rows as the training portion.
# NOTE(review): unpickling can execute arbitrary code — only load a file you produced yourself.
full_set = pd.read_pickle('full_cleaned_sw_trans.pkl')

In [176]:
# word2vec_model = Word2Vec.load('models/Myword2vec.model')

In [177]:
# Number of rows in train.csv; used below as the train/test boundary inside full_set.
n_train = len(train)

In [184]:
# Labels for the first n_train rows of full_set.
y_train = full_set[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
][:n_train].values

# Cleaned text split at n_train: rows before it are train, the rest are test.
# Missing text is replaced with the literal placeholder "fillna".
X_train, X_test = (
    full_set["cleaned_text"][rows].fillna("fillna").values
    for rows in (slice(None, n_train), slice(n_train, None))
)

In [185]:
# Alternative inputs (unprocessed text straight from the CSVs):
# X_train = train["comment_text"].fillna("fillna").values
# y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
# X_test = test["comment_text"].fillna("fillna").values

max_features, maxlen, embed_size = 30000, 100, 300

# Unlike the LSTM section, the vocabulary here is fit on train and test text together.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# NOTE: X_train / X_test are overwritten with token-id sequences, so this cell
# must not be re-run without first re-running the cell that defines them.
X_train, X_test = (tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test))
x_train, x_test = (pad_sequences(token_ids, maxlen=maxlen) for token_ids in (X_train, X_test))

In [186]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after training epochs.

    Parameters
    ----------
    validation_data : pair ``(X_val, y_val)``
        Inputs and true labels to score on. It must unpack into exactly two
        items; the default ``()`` therefore raises ``ValueError``.
    interval : int
        The score is computed when ``epoch % interval == 0``, where ``epoch``
        is the value passed to ``on_epoch_end``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class. The previous
        # ``super(Callback, self)`` started the MRO lookup *after* Callback,
        # so Callback.__init__ was never run.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the ROC-AUC for this epoch.

        ``logs`` is accepted for signature compatibility and is not used; it
        defaults to ``None`` instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # The epoch is printed 1-based (epoch+1).
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [187]:
def get_model():
    """Build and compile the bidirectional-GRU classifier.

    Reads the module-level ``maxlen``, ``max_features`` and ``embed_size``.
    The GRU output sequence is pooled by both average and max over time, the
    two pooled vectors are concatenated, and a 6-unit sigmoid layer produces
    the outputs. Returns the compiled Keras ``Model``.
    """
    tokens = Input(shape=(maxlen, ))

    # Alternative (disabled): frozen pretrained word2vec embedding
    #     word2vec_model.wv.get_keras_embedding(train_embeddings=False)(tokens)
    embedded = Embedding(max_features, embed_size)(tokens)
    # Alternative (disabled): SpatialDropout1D(0.2) on the embedded sequence

    recurrent = Bidirectional(
        GRU(80, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
    )(embedded)

    # Average pooling is constructed before max pooling, as in the original layer order.
    pooled = concatenate([
        GlobalAveragePooling1D()(recurrent),
        GlobalMaxPooling1D()(recurrent),
    ])
    pooled = Dropout(0.2)(pooled)
    # Alternative (disabled): Dense(50, activation="relu") followed by Dropout(0.1)

    probabilities = Dense(6, activation="sigmoid")(pooled)

    network = Model(inputs=tokens, outputs=probabilities)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_27 (Embedding)        (None, 100, 300)     9000000     input_26[0][0]                   
__________________________________________________________________________________________________
bidirectional_25 (Bidirectional (None, 100, 160)     182880      embedding_27[0][0]               
__________________________________________________________________________________________________
global_average_pooling1d_23 (Gl (None, 160)          0           bidirectional_25[0][0]           
__________________________________________________________________________________________________
global_max

In [188]:
batch_size, epochs = 64, 2

# Hold out 10% of the padded training data with a fixed random_state for the split.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=233
)

# Print the ROC-AUC on the held-out split after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.980063 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.983599 



In [189]:
# Training-set ROC-AUC for the GRU model (x_train includes the rows it was fit on).
y_pred = model.predict(x_train, batch_size=batch_size, verbose=1)
# Score against y_train, the labels taken from full_set for exactly these rows,
# rather than `y`, which is left over from the LSTM section's train.csv frame.
roc_auc_score(y_train, y_pred)



0.99211544903882742

In [92]:
# Predict on the padded test sequences and replace any NaN predictions with 0 in place.
y_submit = model.predict(x_test, verbose=1, batch_size=batch_size)
np.putmask(y_submit, np.isnan(y_submit), 0)

# Start from the sample submission file and overwrite its six label columns.
word_GRU_submission = pd.read_csv('data/sample_submission.csv')
word_GRU_submission[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
] = y_submit
word_GRU_submission.to_csv('word_GRU_submission.csv', index=False)



max_features = 30000
maxlen = 100
embed_size = 300
SpatialDropout1D(0.2)
Bidirectional(GRU(80, return_sequences=True))

loss: 0.0397 - acc: 0.9849 - val_loss: 0.0464 - val_acc: 0.9826
val: ROC-AUC - epoch: 2 - score: 0.984000 
train: 0.99301532440477003
unprocessed text
kaggle: 0.9795

loss: 0.0391 - acc: 0.9850 - val_loss: 0.0469 - val_acc: 0.9832
val: ROC-AUC - epoch: 2 - score: 0.984314
train: 0.99322135847382143
processed text
kaggle: 0.9790

loss: 0.0404 - acc: 0.9846 - val_loss: 0.0461 - val_acc: 0.9827
val: ROC-AUC - epoch: 2 - score: 0.984590 
train: 0.9928273935811841
batch_size = 64 (from 32)
kaggle: 0.9792

## to-do

- char level model
- processed vs unprocessed text: processed text gave a slightly better validation score but a slightly lower score on Kaggle; the difference is not significant
- tune batch size: (32 vs 64, 64 slightly better)
- use pretrained word vector vs without (word2vec(skip-gram, cbow), glove, etc)
- architecture engineering: introduce dropout to reduce overfitting
- hyperparameters: max_features, maxlen, embed_size, dropout ratio, number of GRU

<br>
- language detection
