In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '4'
import numpy as np
from contextlib import redirect_stdout

from sklearn.metrics import roc_auc_score

from keras import backend as K
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.models import Model, load_model

from keras.initializers import Constant
from keras.layers import Dense, Input, SpatialDropout1D, concatenate, Lambda, Dropout, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Conv1D
from keras.layers.embeddings import Embedding

from keras.optimizers import Adam

Using TensorFlow backend.


## 1. Data Preparation

In [2]:
# Preprocessing settings. The data files loaded below carry "100k" and "L150"
# in their names, so these presumably describe the vocabulary size and the
# padded sequence length used to build them — TODO confirm.
MAX_FEATURES = 100000
MAX_LENGTH = 150

In [3]:
# Load the pre-built NumPy arrays for the train / validation / test splits.
# Only train and validation have label arrays; no labels are loaded for x_test.
x_train = np.load('../data/x_train_100k_L150.npy')
x_val = np.load('../data/x_val_100k_L150.npy')
x_test = np.load('../data/x_test_100k_L150.npy')
y_train = np.load('../data/y_train.npy')
y_val = np.load('../data/y_val.npy')

## Callback

In [4]:
class RocAucCallback(Callback):
    """Report validation ROC-AUC during training and optionally checkpoint weights.

    Every `interval` epochs the callback predicts on the held-out validation
    data, prints the ROC-AUC score and, when `output_dir` is set, saves the
    model weights to `<output_dir>/weights.<epoch>.h5`. When training ends, the
    model summary followed by the accumulated score log is written to
    `<output_dir>/log.txt`.

    Arguments:
    validation_data -- pair `(x_val, y_val)` scored after each selected epoch
    output_dir -- directory for weights and log; when None/empty nothing is
                  written to disk and scores are only printed
    interval -- score every `interval`-th epoch (counted from the first epoch)

    Raises:
    ValueError -- if `validation_data` is not an `(x_val, y_val)` pair
    """

    def __init__(self, validation_data=(), output_dir=None, interval=1):
        super().__init__()
        self.interval = interval
        try:
            self.x_val, self.y_val = validation_data
        except ValueError:
            # Same exception type as the bare unpacking error, clearer message.
            raise ValueError('validation_data must be an (x_val, y_val) pair')
        self.output_log = ''
        self.output_dir = output_dir

    def on_train_begin(self, logs=None):
        """Create the output directory (if one was requested) before training."""
        if self.output_dir:
            # exist_ok avoids the check-then-create race of isdir() + makedirs().
            os.makedirs(self.output_dir, exist_ok=True)

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and checkpoint weights on selected epochs."""
        # `epoch` is zero-based, so the first epoch is always scored.
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.x_val, verbose=0, batch_size=1024)
        score_val = roc_auc_score(self.y_val, y_pred)

        new_log = 'Epoch %d - Validation ROC-AUC score: %.4f \n' % (epoch + 1, score_val)
        print(new_log)

        # Disk output only makes sense when a directory was configured;
        # previously the weights path was built even for output_dir=None,
        # which raised a TypeError.
        if self.output_dir:
            self.output_log += new_log + '\n'
            weights_path = os.path.join(self.output_dir,
                                        'weights.%02d.h5' % (epoch + 1))
            self.model.save_weights(weights_path)

    def on_train_end(self, logs=None):
        """Write the model summary and the collected score log to log.txt."""
        if not self.output_dir:
            return

        file_name = os.path.join(self.output_dir, 'log.txt')
        with open(file_name, 'w') as f:
            # model.summary() prints to stdout; redirect it into the log file.
            with redirect_stdout(f):
                self.model.summary()

            f.write('\n' + self.output_log)

In [5]:
from custom_layers import GRU_Custom, Capsule, LSTM_Custom

In [10]:
def get_model(embed_matrix, max_len, units=64):
    '''
    Build a bidirectional GRU + Capsule classifier on top of a frozen
    embedding layer. The returned model is not compiled.

    Arguments:
    embed_matrix -- 2-D array of shape (max_features, embed_dim) used as the
                    weights of the non-trainable embedding layer
    max_len -- int, length of each input sequence of token ids
    units -- int, size of each GRU direction and of its learned initial state

    Returns:
    model -- a model instance in Keras mapping int32 input of shape
             (batch, max_len) to 6 sigmoid outputs

    '''

    inp = Input(shape=(max_len, ), dtype='int32', name='input')

    ### An awkward trick to get trainable initial state:
    ### feed a constant column of ones through bias-free Dense layers, so the
    ### Dense kernels themselves act as the (trainable) initial hidden states.
    # The lambda uses its own argument rather than closing over `inp`, which
    # keeps the layer self-contained.
    one = Lambda(lambda x: K.ones((K.shape(x)[0], 1), dtype='float32'),
                 name='constant_one')(inp)
    initial_state_front = Dense(units, use_bias=False, kernel_initializer='zero',
                                name='h_front')(one)
    initial_state_back = Dense(units, use_bias=False, kernel_initializer='zero',
                               name='h_back')(one)
    ###

    max_features, embed_dim = embed_matrix.shape
    x = Embedding(max_features, embed_dim, trainable=False,
                  weights=[embed_matrix], name='embedding')(inp)
    x = SpatialDropout1D(0.5, name='spatial_dropout1')(x)

    gru = GRU(units, return_sequences=True, recurrent_dropout=0.25, activation='relu')
    # One learned initial state per direction: forward first, then backward.
    x = Bidirectional(gru, name='biGRU')(x, initial_state=[initial_state_front,
                                                           initial_state_back])
    x = Capsule(num_capsule=16, dim_capsule=10, routings=5, share_weights=True)(x)
    x = Flatten()(x)
    x = Dropout(0.25)(x)
    x = Dense(6, activation='sigmoid', name='output')(x)

    model = Model(inputs=inp, outputs=x)

    return model

In [11]:
# Build the network from the pretrained embedding matrix stored on disk.
embed_matrix = np.load('../pretrained/numpy_matrix/fastText.300D.100kFeatures.npy')
model = get_model(embed_matrix, max_len=MAX_LENGTH, units=128)
# The matrix has already been passed to the embedding layer; drop this reference.
del embed_matrix

# Binary cross-entropy with Adam (clipvalue=5.0).
optimizer = Adam(clipvalue=5.0)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 150)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 300)     30000300    input[0][0]                      
__________________________________________________________________________________________________
constant_one (Lambda)           (None, 1)            0           input[0][0]                      
__________________________________________________________________________________________________
spatial_dropout1 (SpatialDropou (None, 150, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
h_front (D

In [12]:
# Directory handed to the callback for its per-epoch weights and log file.
output_dir = '../output/gru_cap_100k_preprocessed_b128/'

# Score the validation split after every epoch.
rocAucCallback = RocAucCallback(
    validation_data=(x_val, y_val),
    output_dir=output_dir,
    interval=1,
)

# Early stopping is currently disabled:
#early = EarlyStopping(monitor="val_loss", mode="min", patience=3)

In [44]:
# Train for 12 epochs; the callback reports validation ROC-AUC each epoch.
# The trailing semicolon suppresses the cell's output repr.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=12,
    validation_data=(x_val, y_val),
    callbacks=[rocAucCallback],
);

Train on 143613 samples, validate on 15958 samples
Epoch 1/12
Epoch 1 - Validation ROC-AUC score: 0.9626 

Epoch 2/12
Epoch 2 - Validation ROC-AUC score: 0.9773 

Epoch 3/12
Epoch 3 - Validation ROC-AUC score: 0.9822 

Epoch 4/12
Epoch 4 - Validation ROC-AUC score: 0.9839 

Epoch 5/12
Epoch 5 - Validation ROC-AUC score: 0.9873 

Epoch 6/12
Epoch 6 - Validation ROC-AUC score: 0.9852 

Epoch 7/12

KeyboardInterrupt: 

In [13]:
# Restore the weights saved for epoch 11 (file name follows the
# 'weights.%02d.h5' pattern used by RocAucCallback).
# NOTE(review): the training output shown in this notebook was interrupted
# during epoch 7 and the cell execution counts are out of order, so
# weights.11.h5 presumably comes from an earlier run — confirm it exists and
# is the intended checkpoint before relying on a fresh Restart & Run All.
model.load_weights(output_dir + 'weights.11.h5')

In [19]:
# ROC-AUC of the restored model on the training data itself (not a held-out
# estimate); the score is shown as the cell output.
y_pred = model.predict(x_train, batch_size=1024, verbose=1)
roc_auc_score(y_train, y_pred)



0.99439974792040375

In [14]:
import pandas as pd

In [15]:
# NOTE: despite its name, y_test holds the model's predictions for x_test,
# not ground-truth labels.
y_test = model.predict(x_test, batch_size=1024, verbose=1)



In [16]:
# Submission column names, one per model output.
# Presumably in the same order as the label columns of y_train — TODO confirm.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
# Fill the sample submission's class columns with the test predictions and
# write the result next to the model's other outputs.
submission = pd.read_csv("../output/sample_submission.csv")
submission[classes] = y_test
submission.to_csv(output_dir + 'epoch11.csv', index=False)