In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Conv1D, MaxPooling1D
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
import re
import warnings
warnings.filterwarnings('ignore')
import gensim

import os
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [2]:
train = pd.read_csv('pre_data/df_train_tw_corr.csv', sep=';', index_col='index')
test = pd.read_csv('pre_data/df_test_tw_corr.csv', sep=';', index_col='index')
submission = pd.read_csv('data/sample_submission.csv')

y = pd.read_csv('data/train.csv')

X_train = train["comment_text"].fillna("fillna").values
y_train = y[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values


max_features = 200000
maxlen = 460
#embed_size = 200

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [3]:
from collections import defaultdict
import tqdm

In [4]:
emb_list = [i for i in os.listdir('vocs/') if i[-3:] == 'npy']

In [5]:
emb_list

['emb_lemm_corr_glove.tw_25_.npy',
 'emb_lemm_corr_glove.84_300_.npy',
 'emb_lemm_corr_wiki.de._300_.npy',
 'emb_lemm_corr_glove.tw_100_.npy',
 'emb_lemm_corr_wiki.en._300_.npy',
 'emb_lemm_corr_GoogleNe_300_.npy',
 'emb_lemm_corr_glove.tw_200_.npy',
 'emb_lemm_corr_wiki.fr._300_.npy',
 'emb_lemm_corr_glove.tw_50_.npy']

In [None]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    inp = Input(shape=(maxlen, ))
    Conv1D_list_embs = []
    max_pool_a = []
    max_pool_b = []
    max_pool_c = []
    for emb_file in emb_list:
        emb_matrix = np.load('vocs/'+emb_file)
        x = Embedding(max_features, emb_matrix.shape[1], weights=[emb_matrix])(inp)
        print(x.shape)
        a = Conv1D(64, kernel_size = 3, padding = "same", kernel_initializer = "he_uniform")(x)
        print('a', a.shape)
        b = Conv1D(64, kernel_size = 5, padding = "same", kernel_initializer = "he_uniform")(x)
        print('b', b.shape)
        c = Conv1D(64, kernel_size = 7, padding = "same", kernel_initializer = "he_uniform")(x)
        print('c', c.shape)
        max_pool_a.append(MaxPooling1D(6, strides=3)(a))
        max_pool_b.append(MaxPooling1D(6, strides=3)(b))
        max_pool_c.append(MaxPooling1D(6, strides=3)(c))
    con_a  = concatenate(max_pool_a)
    con_b  = concatenate(max_pool_b)
    con_c  = concatenate(max_pool_c)
    print(con_a.shape)
    print(con_b.shape)
    print(con_c.shape)
    #x = SpatialDropout1D(0.5)(x)
    x = concatenate([con_a, con_b, con_c])
    x = Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.5))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    x = Dropout(0.5)(conc)
    outp = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
%%time
model = get_model()


batch_size = 16
epochs = 10
file_path = "models/best_model_CNN_allembs.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")

early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

[X_tra, X_val, y_tra, y_val] = train_test_split(x_train, y_train, train_size=0.85, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc, check_point, early_stop], verbose=2)


y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

(?, 460, 25)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 300)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 300)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 100)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 300)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 300)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 200)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 300)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 460, 50)
a (?, 460, 64)
b (?, 460, 64)
c (?, 460, 64)
(?, 152, 576)
(?, 152, 576)
(?, 152, 576)
Train on 135635 samples, validate on 23936 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.973729 

Epoch 00001: val_loss improved from inf to 0.05090, saving model to models/best_model_CNN_allembs.hdf5
 - 11189s - loss: 0.0651 - acc: 0.9776 - val_loss: 0.0509 - val_acc: 0.9808
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.975039 

Epoch 00002: val_loss did not improve
 - 11047s - loss: 0.0529 - acc: 0.9806 - val_loss: 0.0514

KeyboardInterrupt: 

In [26]:
hist = model.fit(X_tra, y_tra, batch_size=batch_size, initial_epoch=4, epochs=8, validation_data=(X_val, y_val),
                 callbacks=[RocAuc, check_point, early_stop], verbose=2)

NameError: name 'model' is not defined

In [27]:
hist = model.fit(X_tra, y_tra, batch_size=batch_size, initial_epoch=8, epochs=9, validation_data=(X_val, y_val),
                 callbacks=[RocAuc, check_point, early_stop], verbose=2)

Train on 135635 samples, validate on 23936 samples
Epoch 9/9

 ROC-AUC - epoch: 9 - score: 0.986956 

Epoch 00009: val_loss did not improve
 - 885s - loss: 0.0307 - acc: 0.9876 - val_loss: 0.0436 - val_acc: 0.9845


In [28]:
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_0803_2.csv', index=False)

In [29]:
! kg config -c jigsaw-toxic-comment-classification-challenge -u cerdgio86@gmail.com -p 68918082

! kg submit 'submission_0803_2.csv' -m "___s____"

0.9826


In [None]:
from keras.models import load_model
model.save('models/my_model_9829.h5')

In [None]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

file_path = "best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval = 1)
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    inp = Input(shape = (max_len,))
    x = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, Y_train, batch_size = 128, epochs = 4, validation_data = (X_valid, Y_valid), 
                        verbose = 1, callbacks = [ra_val, check_point, early_stop])
    model = load_model(file_path)
    return model