# Initial kernel based on different RNN layers

* **Text preprocessing**

Following things to be tried on the baseline:
    * Add Early Stopping callback
    * Increase max epochs - let EarlyStop do the work
    * Add Tensorboard callback, monitor training
    * Replace LSTM by GRU units and check if it changes anything
    * Add another layer of LSTM/GRU, see if things improve
    * Play around with Dense layers (add/# units/etc.)
    * Find preprocessing rules you could add to improve the quality of the data
    * Use different embeddings

In [21]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Permute, GRU, Conv1D, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU, concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, BatchNormalization, SpatialDropout1D, Dot
from keras.optimizers import Adam
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras_tqdm import TQDMNotebookCallback
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from functools import reduce
from keras.layers import Layer, PReLU, SpatialDropout1D
from keras import initializers
from sklearn.model_selection import cross_val_predict

In [2]:
path = '../input/'
utility_path = '../utility/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{utility_path}glove.42B.300d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [3]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [4]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

train["comment_text"] = train["comment_text"].replace("\[.?!]{1,}\s", " <eos> ").replace("\n\n", " <eop> ").replace("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","")
test["comment_text"] = test["comment_text"].replace("\[.?!]{1,}\s", " <eos> ").replace("\n\n", " <eop> ").replace("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","")

list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [5]:
max_features = 200000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 200 # max number of words in a comment to use

tokenizer = Tokenizer(num_words=max_features, char_level=False)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [6]:
X_t[13]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,   147,     6,   438,  3984,  1308,     4,  1205,    34,
          35,  2200,   385,     1,    71,   550,   272,  1499,  4162,
         520,   291,   168,     2, 12977,    20,   621,    11,    44,
        1051,    97,    11,   151,    49,     6,    19,  2363,    20,
         581,    17,    30,   336,    89,     1,    71,   550,     8,
         502,    13,     8,   292,     1,   534,   135,  3312,    36,
        1128,  8224,    52,   481,    17,     3,  1341,    66,  3940,
           8,   316,   143,    52,     8,     1,    77,   832,  3212,
        1272,    62,

In [7]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        # self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        # print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # return input_shape[0], input_shape[-1]
        return input_shape[0], self.features_dim

In [8]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in label_cols])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
cvlist = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))

In [18]:
import tensorflow as tf
np.random.seed(1)
#tf.random_seed(1)
def train_gru(params):
    embed_size, gru_dim, dense_dim, lr1 , lr2, decay, batch_size = params
    
    def lr_decay(epoch):
        if epoch == 0:
            return lr1
        if epoch == 1:
            return lr2
        if epoch == 2:
            return 0.001
        if epoch == 3:
            return 0.00001
        
    def get_model():
        inp = Input(shape=(maxlen,))
        emb = Embedding(max_features, embed_size,
                     )(inp)
        print(emb.shape)
        emb = SpatialDropout1D(0.5)(emb)
        tmp = Bidirectional(CuDNNGRU(int(gru_dim), return_sequences=True, return_state=True))(emb)
        x2 = tmp[0]
        state = tmp[1]
        x3 = Attention(maxlen)(x2)
        x4 = GlobalAveragePooling1D()(x2)

        x = concatenate([x3, x4, state])
        #x = avg_pool
        #x = BatchNormalization()(x)
        #x = Dropout(0.2)(x)
        x = Dense(dense_dim)(x)
        x = PReLU()(x)

        #x = BatchNormalization()(x)
        #x = Dropout(0.2)(x)
        #x = Dense(64)(x)
        #x = PReLU()(x)
        #x = Dropout(0.1)(x)
        out = Dense(6, activation="sigmoid")(x)
        opt = Adam(lr=0.001, decay=decay)
        model = Model(inputs=inp, outputs=out)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model
    
    y_trues = []
    y_preds = []
    LRDecay = LearningRateScheduler(lr_decay)
    for tr_index, val_index in cvlist:
        X_train, y_train = X_t[tr_index, :], y[tr_index, :]
        X_val, y_val = X_t[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
        model = get_model()
        model.fit(X_train, y_train, batch_size=batch_size, epochs=2, validation_split=0.0, verbose=1, 
                  callbacks=[RocAuc, LRDecay])
        y_pred = model.predict(X_val, batch_size=2048)
        print("ROC AUC for this fold is ", roc_auc_score(y_val, y_pred))
        y_trues.append(y_val)
        y_preds.append(y_pred)
        K.clear_session()
        
    score = -roc_auc_score(np.concatenate(y_trues), np.concatenate(y_preds))
    print("Overall score with params {} is {}".format(params, score))
    return score

In [11]:

#train_gru()

In [15]:
import skopt
from skopt import gp_minimize, gbrt_minimize
from skopt.space import Real, Integer, Categorical

In [20]:
space = [Integer(16, 256), #name='embed_size'),
         Integer(16, 256),# name='gru_dim'),
         Integer(64, 1024),# name='dense_dim'),
         Real(0.0001, 0.005, "log-uniform"), #name='lr1'),
         Real(1e-5, 0.002, "log-uniform"), #name='lr2'),
         Real(1e-8, 0.001, "log-uniform"), #name='decay'),
         Integer(32, 256),# name=batch'),
        ]

res_gp = gbrt_minimize(train_gru, space, n_calls=100, random_state=0)

(?, 200, 188)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.982213 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.983151 

ROC AUC for this fold is  0.9831507887571517
(?, 200, 188)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.982551 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.984904 

ROC AUC for this fold is  0.9849042706445514
(?, 200, 188)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.978703 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.981460 

ROC AUC for this fold is  0.981459628816375
(?, 200, 188)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.983140 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.985088 

ROC AUC for this fold is  0.985088204547639
(?, 200, 188)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.972825 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.978548 

ROC AUC for this fold is  0.9785480076398568
Overall score with params [188, 63, 693, 0.002718830808690577, 0.0009422336297274342, 0.00017228945369609434, 243] is -0.981908510143595
(?, 200, 37)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.979619 

Epoch 2

 ROC-AUC - epoch: 1 - score: 0.982381 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.983000 

ROC AUC for this fold is  0.9830000592787996
(?, 200, 131)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.479192 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.478979 

ROC AUC for this fold is  0.47897871742656667
(?, 200, 131)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.980220 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.981055 

ROC AUC for this fold is  0.981055035868211
(?, 200, 131)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.983936 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.984244 

ROC AUC for this fold is  0.9842439726685116
(?, 200, 131)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.972591 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.973719 

ROC AUC for this fold is  0.9737194283554632
Overall score with params [131, 224, 819, 0.004228417918653105, 2.1035428134349117e-05, 0.00022409712855921124, 131] is -0.8089338526899862
(?, 200, 232)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.977211 

Epoch 2/2
 ROC-AUC - epoc

 ROC-AUC - epoch: 1 - score: 0.977509 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.978575 

ROC AUC for this fold is  0.9785749695939608
(?, 200, 130)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.979589 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.980123 

ROC AUC for this fold is  0.9801229709965509
(?, 200, 130)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.977616 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.978647 

ROC AUC for this fold is  0.9786470731369955
(?, 200, 130)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.979844 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.980418 

ROC AUC for this fold is  0.9804184726612338
(?, 200, 130)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.971314 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.971221 

ROC AUC for this fold is  0.971220699938803
Overall score with params [130, 199, 92, 0.0009241707704121684, 1.1046784818354062e-05, 1.225116771800311e-05, 70] is -0.977704066116015
(?, 200, 248)
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.982994 

Epoch 2/2

KeyboardInterrupt: 

In [129]:
y_trues

NameError: name 'y_trues' is not defined

In [43]:
roc_auc_score(y, y_preds)

0.9861737784746518

In [41]:
model = get_model()
model.fit(X_t, y, batch_size=32, epochs=3, validation_split=0.0, verbose=1, 
              callbacks=[RocAuc, LRDecay])
y_test_preds = model.predict([X_te], batch_size=1024, verbose=1)


Epoch 1/3
 ROC-AUC - epoch: 1 - score: 0.992135 

Epoch 2/3
 ROC-AUC - epoch: 2 - score: 0.995249 

Epoch 3/3
 ROC-AUC - epoch: 3 - score: 0.995429 



In [25]:
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
y_trues = train[label_cols].values
y_preds2 = np.zeros((X_t.shape[0], len(label_cols)))
y_test_preds2 = np.zeros((X_te.shape[0], len(label_cols)))
for i, col in enumerate(label_cols):
    y = y_trues[:, i]
    #model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=50, class_weight='balanced', n_jobs=-1)
    model = lgb.LGBMClassifier(n_estimators=100, num_leaves=5, learning_rate=0.03, 
                               subsample=0.9, colsample_bytree=0.9)
    y_preds2[:, i] = cross_val_predict(model, y_preds, y, cv=cvlist, n_jobs=1, method='predict_proba')[:,1]
    y_test_preds2[:, i] = model.fit(y_preds, y).predict_proba(y_test_preds)[:,1]
    print("Score for class {} is {}".format(col, roc_auc_score(y, y_preds2[:, i])))
print("Over auc score", roc_auc_score(y_trues, y_preds2))

Score for class toxic is 0.9791088111561013
Score for class severe_toxic is 0.9896361881404786
Score for class obscene is 0.989792727484462
Score for class threat is 0.9834375512104746
Score for class insult is 0.9851961777705942
Score for class identity_hate is 0.9837426177272286
Over auc score 0.9851523455815565


In [42]:
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[label_cols] = y_test_preds
sample_submission.to_csv('nn_submission.csv', index=False)

In [None]:
sample_submission[label_cols] = y_test_preds2
sample_submission.to_csv('nn_lgbmeta_submission.csv', index=False)