In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '2'
import numpy as np
import pandas as pd
from contextlib import redirect_stdout
import seaborn as sns
%matplotlib inline
sns.set(style="darkgrid", context='notebook', palette='deep')

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.callbacks import Callback, EarlyStopping
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.models import Model, load_model

from keras.initializers import Constant
from keras.layers import Dense, Input, SpatialDropout1D, concatenate, Lambda, Dropout
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.embeddings import Embedding

from keras.optimizers import Adam

Using TensorFlow backend.


## 1. Data Preparation

In [3]:
# Vocabulary-size setting.
# NOTE(review): not referenced by any visible cell, and the data/embedding
# files loaded below are named "40000"/"40kFeatures" — confirm or remove.
MAX_FEATURES = 100000
# Sequence length passed to get_model() as `max_len`.
# NOTE(review): the arrays loaded below are named "..._L100.npy" and the stored
# model summary shows an input of (None, 100) — confirm 150 is intended.
MAX_LENGTH = 150

In [5]:
# Load the saved feature/label arrays from ../data.
# presumably x_* hold padded token-id sequences and y_* the 6 label columns — TODO confirm
x_train_val = np.load('../data/x_train_val_40000_L100.npy')
x_test = np.load('../data/x_test_40000_L100.npy')
y_train_val = np.load('../data/y_train_val.npy')

## Load Pretrained Embedding

In [6]:
# Pretrained embedding matrix; get_model() reads its shape as
# (vocabulary size, embedding dimension).
embed_matrix = np.load('../pretrained/numpy_matrix/fastTest.300D.train.test.40kFeatures.npy')

## Implement Layer Norm Model
Reference: arXiv:1607.06450
- Layer Normalization
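$$ LN(x) = \gamma \odot \frac{x - E(x)}{\sqrt{\sigma^2 + Var(x)}} + \beta$$

Note: $\sigma^2$ in the denominator is a stabilising constant (presumably the `sigma` hyperparameter passed to the GRU layer below); it is unrelated to the sigmoid $\sigma(\cdot)$ used in the gate equations.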

- Layer Normed GRU

$$ z^{\langle t \rangle} = \sigma (LN(W_z x^{\langle t \rangle} + U_z h^{\langle t-1 \rangle} + b_z)) $$
$$ r^{\langle t \rangle} = \sigma (LN(W_r x^{\langle t \rangle} + U_r h^{\langle t-1 \rangle} + b_r)) $$
$$ h^{\langle t \rangle} = (1 - z^{\langle t \rangle}) \odot h^{\langle t-1 \rangle} + z^{\langle t \rangle} \odot tanh(LN(W_h x^{\langle t \rangle} + U_h(r^{\langle t \rangle} \odot h^{\langle t-1 \rangle}) + b_h))$$

## Callback

In [8]:
class RocAucCallback(Callback):
    '''Report the validation ROC-AUC during training and optionally save a log.

    Every `interval` epochs the model predicts on the held-out data, the
    ROC-AUC is computed with sklearn's `roc_auc_score`, and a short message
    (including the current learning rate) is printed. When `output_dir` is
    given, the messages are accumulated and written, together with the model
    summary, to `<output_dir>/log.txt` when training ends.

    Arguments:
    validation_data -- pair (x_val, y_val) used for scoring
    output_dir -- directory for log.txt, or None to disable file logging
    interval -- score every `interval` epochs

    Raises:
    ValueError -- if validation_data cannot be unpacked into exactly two items
    '''

    def __init__(self, validation_data=(), output_dir=None, interval=1):
        super().__init__()
        self.interval = interval
        try:
            self.x_val, self.y_val = validation_data
        except ValueError:
            # Same exception type as the bare unpacking, with a usable message.
            raise ValueError('validation_data must be a (x_val, y_val) pair') from None
        self.output_log = ''
        self.output_dir = output_dir

    def get_lr(self, epoch=None):
        '''Return the optimizer's current learning rate as a Python float.

        `epoch` is accepted only so existing `get_lr(epoch)` calls keep working;
        the value is read straight from the optimizer attached to the model.
        '''
        return float(K.get_value(self.model.optimizer.lr))

    def on_train_begin(self, logs=None):
        '''Create the output directory (if one was requested) before training.'''
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)

    def on_epoch_end(self, epoch, logs=None):
        '''Score the validation set and print/record the result.'''
        if epoch % self.interval == 0:
            y_pred_val = self.model.predict(self.x_val, verbose=0, batch_size=1024)
            score_val = roc_auc_score(self.y_val, y_pred_val)

            lr = self.get_lr(epoch)
            # `epoch` is zero-based in Keras callbacks; report it one-based.
            new_log = 'Epoch %d, train by learning rate %.6f \n' % (epoch + 1, lr) + \
                      '    - Validation ROC-AUC score: %.4f \n' % score_val

            print(new_log)

            if self.output_dir:
                self.output_log += new_log + '\n'

    def on_train_end(self, logs=None):
        '''Write the model summary followed by the accumulated log to log.txt.'''
        if self.output_dir:
            # os.path.join avoids a doubled separator when output_dir ends in '/'.
            file_name = os.path.join(self.output_dir, 'log.txt')
            with open(file_name, 'w') as f:
                # model.summary() prints to stdout, so redirect it into the file.
                with redirect_stdout(f):
                    self.model.summary()

                f.write('\n' + self.output_log)

In [28]:
def get_model(embed_matrix, max_len, units=64):
    '''
    Build a bidirectional GRU classifier on top of a frozen pretrained embedding.

    Arguments:
    embed_matrix -- 2-D array whose shape is read as (vocabulary size, embedding dim);
                    used as the fixed (non-trainable) embedding weights
    max_len -- int, number of token ids in each input sequence
    units -- int, hidden size of each GRU direction and of the learned initial states

    Returns:
    model -- an uncompiled Keras model with 6 sigmoid outputs

    '''

    inp = Input(shape=(max_len, ), dtype='int32', name='input')

    ### An awkward trick to get trainable initial state...
    # A (batch, 1) tensor of ones is fed through bias-free Dense layers, so each
    # Dense kernel acts as a learned initial hidden state. The Lambda uses its own
    # argument `x` (not the enclosing `inp`) so the layer has no hidden closure.
    one = Lambda(lambda x: K.ones((K.shape(x)[0], 1), dtype='float32'),
                 name='constant_one')(inp)
    initial_state_front = Dense(units, use_bias=False, name='h_front')(one)
    initial_state_back = Dense(units, use_bias=False, name='h_back')(one)
    ###

    max_features, embed_dim = embed_matrix.shape
    x = Embedding(max_features, embed_dim, trainable=False,
                  weights=[embed_matrix], name='embedding')(inp)
    x = SpatialDropout1D(0.2, name='spatial_dropout')(x)
    x = Dense(200, activation='relu', name='encoder')(x)

    # NOTE(review): `GRU` is not imported in the visible cells, and the stock
    # keras.layers.GRU does not accept a `sigma` argument — presumably this is the
    # custom layer-normalised GRU described in the markdown above; confirm that
    # its defining cell runs before this one.
    gru = GRU(units, return_sequences=True, dropout=0.2,
              recurrent_dropout=0.1, sigma=0.5)
    # Wrap the layer built just above (the original referenced an undefined `lstm`).
    x = Bidirectional(gru, name='biGRU')(x,
                                         initial_state=[initial_state_front,
                                                        initial_state_back])

    # Three summaries over the time axis: mean, max and root-mean-square.
    avg_pool = GlobalAveragePooling1D(name='mean_pool')(x)
    max_pool = GlobalMaxPooling1D(name='max_pool')(x)
    l2_pool = Lambda(lambda t: K.sqrt(K.mean(K.square(t), axis=1)), name='l2_pool')(x)

    x = concatenate([avg_pool, max_pool, l2_pool], name='concatenate')
    x = Dense(64, activation='relu', name='decoder')(x)
    x = Dense(6, activation='sigmoid', name='output')(x)

    model = Model(inputs=inp, outputs=x)

    return model

In [29]:
# Reload the embedding matrix so this cell can be re-run after the `del` below.
embed_matrix = np.load('../pretrained/numpy_matrix/fastTest.300D.train.test.40kFeatures.npy')
model = get_model(embed_matrix, MAX_LENGTH, units=64)
# Drop the notebook-level reference to the matrix once the model has been built.
del embed_matrix
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 100)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     12000300    input[0][0]                      
__________________________________________________________________________________________________
spatial_dropout (SpatialDropout (None, 100, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
constant_one (Lambda)           (None, 1)            0           input[0][0]                      
__________________________________________________________________________________________________
encoder (D

In [30]:
# Hold out 5% of the labelled data for validation; fixed seed for a repeatable split.
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val,
                                                  train_size=0.95, random_state=528)

output_dir = '../output/lstm_40000_large_sigma0.5/'
# The checkpoint callback writes into this directory, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): `step_decay` is not defined in any visible cell — confirm the cell
# defining the learning-rate schedule runs before this one.
lr_scheduler = LearningRateScheduler(step_decay)
optimizer = Adam(clipvalue=5.0)
rocAucCallback = RocAucCallback(validation_data=(x_val, y_val),
                                output_dir=output_dir,
                                interval=1)

# ModelCheckpoint needs a file path, not a directory. With save_best_only=False a
# file is written every epoch, so the name is templated to avoid overwriting.
checkpoint_path = os.path.join(output_dir, 'weights.{epoch:02d}-{val_loss:.4f}.hdf5')
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=False)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)

In [31]:
# Compile with binary cross-entropy and train for up to 10 epochs.
# NOTE(review): only rocAucCallback and lr_scheduler are passed to fit(); the
# `checkpoint` and `early` callbacks created earlier are not used here — confirm
# whether that is intentional.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(x_train, y_train,
          validation_data=(x_val, y_val), 
          batch_size=64, epochs=10, callbacks=[rocAucCallback, lr_scheduler]);

Train on 151592 samples, validate on 7979 samples
Epoch 1/10
Epoch 1, train by learning rate 0.001000 
    - Training ROC-AUC score: 0.9734 
    - Validation ROC-AUC score: 0.9611 

Epoch 2/10
Epoch 2, train by learning rate 0.001000 
    - Training ROC-AUC score: 0.9795 
    - Validation ROC-AUC score: 0.9697 



KeyboardInterrupt: 

In [2]:
# Label column names written to the submission file; six entries, matching the
# 6-unit output layer of the model.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
# Predict on the test set and write the submission file.
# The original cell referenced `self.*` and an undefined `file_name` (leftovers
# from a class method); it now uses the notebook-level objects directly.
y_test = model.predict(x_test, batch_size=1024, verbose=0)
submission = pd.read_csv("../output/sample_submission.csv")
submission[classes] = y_test

# Make sure the target directory exists even if training was skipped or interrupted.
os.makedirs(output_dir, exist_ok=True)
submission.to_csv(os.path.join(output_dir, 'submission.csv'), index=False)