In [17]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Conv1D, BatchNormalization, PReLU
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
import re
import warnings
warnings.filterwarnings('ignore')
import gensim

import os
os.environ['OMP_NUM_THREADS'] = '4'

In [2]:
# --- Input frames -----------------------------------------------------------
# Pre-processed comments (';'-separated, indexed by the 'index' column) and the
# sample submission that is filled in with predictions later on.
train = pd.read_csv('pre_data/df_train_tw_corr.csv', sep=';', index_col='index')
test = pd.read_csv('pre_data/df_test_tw_corr.csv', sep=';', index_col='index')
submission = pd.read_csv('data/sample_submission.csv')

# --- Targets ----------------------------------------------------------------
# Labels are taken from the raw training file, not from the pre-processed one.
# NOTE(review): this assumes both files list the comments in the same order -- confirm.
y = pd.read_csv('data/train.csv')
y_train = y[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# --- Raw text (missing comments become the literal token "fillna") -----------
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# --- Hyper-parameters of the text pipeline ----------------------------------
max_features = 200000   # `num_words` cap handed to the tokenizer
maxlen = 460            # length every sequence is padded / truncated to
embed_size = 200        # width of one embedding vector

# --- Tokenise and pad -------------------------------------------------------
# Only '~', tab and newline are filtered out; the vocabulary is fitted on the
# train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features, filters='~\t\n')
tokenizer.fit_on_texts(list(X_train) + list(X_test))

X_train = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)

X_test = tokenizer.texts_to_sequences(X_test)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [3]:
from collections import defaultdict
import tqdm

In [4]:
# Load the pre-trained vectors (word2vec text format) and expose them as a
# plain {word: vector} dict.
model = gensim.models.KeyedVectors.load_word2vec_format('vocs/glove.twitter.27B.200d_w2v.txt')
embeddings_index = {key: model.wv[key] for key in model.wv.vocab.keys()}

# np.stack needs a real sequence; a dict view is rejected by newer NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is never assigned to a word), so a vocabulary
# of V words needs V + 1 rows. Without the +1 the highest index falls outside
# the matrix whenever the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)

# Words without a pre-trained vector keep a random draw that matches the
# overall mean / std of the pre-trained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the actual matrix height, not on `max_features`.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; ``X_val`` is passed to ``model.predict`` and
        ``y_val`` to ``roc_auc_score``.
    interval : int
        The score is computed when the 0-based epoch number is a multiple of
        ``interval``.

    Raises
    ------
    ValueError
        If ``validation_data`` is not a 2-item ``(X_val, y_val)`` pair.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; `super(Callback, self)`
        # would skip it and call the initialiser of Callback's own parent.
        super(RocAucEvaluation, self).__init__()

        if len(validation_data) != 2:
            raise ValueError(
                "validation_data must be a (X_val, y_val) pair, got %d item(s)"
                % len(validation_data))

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        `epoch` is 0-based, so it is printed as ``epoch + 1``. `logs` is
        accepted for interface compatibility and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build and compile the frozen-embedding BiGRU + Conv1D classifier.

    Reads the module-level `maxlen` (input length) and `embedding_matrix`
    (pre-trained weights for the non-trainable embedding layer).

    Architecture: Embedding -> SpatialDropout1D -> bidirectional GRU (returning
    sequences) -> Conv1D -> BatchNormalization -> PReLU -> concatenated global
    average / max pooling -> Dropout -> Dense(6, sigmoid).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, the 'adam' optimizer and the
        accuracy metric.
    """
    # The Embedding layer requires weights of shape (input_dim, output_dim).
    # Taking both from the matrix keeps the layer valid even when the matrix
    # has fewer rows than `max_features`.
    vocab_size, emb_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, emb_dim, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(250, return_sequences=True, dropout=.1, recurrent_dropout=0.2))(x)
    # Linear convolution; the non-linearity (PReLU) is applied after batch norm.
    x = Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform", activation='linear')(x)
    x = BatchNormalization()(x)
    x = PReLU()(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    x = Dropout(0.35)(conc)
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [19]:
%%time
model = get_model()


batch_size = 128
epochs = 4

file_path = "models/best_freez_embs.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")

early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)


[X_tra, X_val, y_tra, y_val] = train_test_split(x_train, y_train, train_size=0.85, random_state=1)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

CPU times: user 3.12 s, sys: 929 ms, total: 4.05 s
Wall time: 2.88 s


In [None]:
# First training pass over `epochs` epochs. Validation loss drives the
# checkpoint / early-stopping callbacks; RocAuc prints the validation ROC-AUC.
training_callbacks = [RocAuc, check_point, early_stop]
hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
    verbose=2
)

Train on 135635 samples, validate on 23936 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.976733 

Epoch 00001: val_loss improved from inf to 0.04717, saving model to models/best_freez_embs.hdf5
 - 1561s - loss: 0.0684 - acc: 0.9769 - val_loss: 0.0472 - val_acc: 0.9822
Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.983052 

Epoch 00002: val_loss improved from 0.04717 to 0.04368, saving model to models/best_freez_embs.hdf5
 - 1554s - loss: 0.0491 - acc: 0.9818 - val_loss: 0.0437 - val_acc: 0.9829
Epoch 3/4


In [12]:
# Continue training the in-memory model: epoch counter resumes at 4 (0-based)
# and runs up to epoch 8.
hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    initial_epoch=4,
    epochs=8,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc, check_point, early_stop],
    verbose=2
)

Train on 135635 samples, validate on 23936 samples
Epoch 5/8

 ROC-AUC - epoch: 5 - score: 0.988232 

Epoch 00005: val_loss did not improve
 - 1478s - loss: 0.0409 - acc: 0.9842 - val_loss: 0.0418 - val_acc: 0.9837
Epoch 6/8

 ROC-AUC - epoch: 6 - score: 0.988511 

Epoch 00006: val_loss did not improve
 - 1480s - loss: 0.0407 - acc: 0.9842 - val_loss: 0.0413 - val_acc: 0.9840
Epoch 7/8

 ROC-AUC - epoch: 7 - score: 0.988298 

Epoch 00007: val_loss did not improve
 - 1476s - loss: 0.0389 - acc: 0.9848 - val_loss: 0.0412 - val_acc: 0.9842
Epoch 8/8

 ROC-AUC - epoch: 8 - score: 0.988585 

Epoch 00008: val_loss did not improve
 - 1478s - loss: 0.0383 - acc: 0.9851 - val_loss: 0.0417 - val_acc: 0.9833


In [None]:
# Score the padded test sequences and copy the six per-label outputs into the
# matching columns of the submission frame.
y_pred = model.predict(x_test, batch_size=1024)
submission[[
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"
]] = y_pred

Train on 135635 samples, validate on 23936 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.970096 

Epoch 00001: val_loss improved from inf to 0.05963, saving model to models/best_freez_embs.hdf5
 - 1590s - loss: 0.0874 - acc: 0.9720 - val_loss: 0.0596 - val_acc: 0.9785
Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.975201 

Epoch 00002: val_loss improved from 0.05963 to 0.05805, saving model to models/best_freez_embs.hdf5
 - 1579s - loss: 0.0623 - acc: 0.9783 - val_loss: 0.0580 - val_acc: 0.9783
Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.980271 

Epoch 00003: val_loss improved from 0.05805 to 0.05177, saving model to models/best_freez_embs.hdf5
 - 1577s - loss: 0.0584 - acc: 0.9793 - val_loss: 0.0518 - val_acc: 0.9801
Epoch 4/4

 ROC-AUC - epoch: 4 - score: 0.984195 

Epoch 00004: val_loss improved from 0.05177 to 0.04837, saving model to models/best_freez_embs.hdf5
 - 1514s - loss: 0.0562 - acc: 0.9799 - val_loss: 0.0484 - val_acc: 0.9815
CPU times: user 2h 2min 15s, sys: 14min 13s, tota

In [7]:
# Restore the weights stored in the checkpoint file, then keep training with
# the epoch counter resuming at 5 (0-based) up to epoch 12.
model.load_weights('models/best_freez_embs.hdf5')
hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    initial_epoch=5,
    epochs=12,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc, check_point, early_stop],
    verbose=2
)

Train on 135635 samples, validate on 23936 samples
Epoch 6/12

 ROC-AUC - epoch: 6 - score: 0.986389 

Epoch 00006: val_loss improved from inf to 0.04796, saving model to models/best_freez_embs.hdf5
 - 1574s - loss: 0.0370 - acc: 0.9864 - val_loss: 0.0480 - val_acc: 0.9823
Epoch 7/12

 ROC-AUC - epoch: 7 - score: 0.985710 

Epoch 00007: val_loss improved from 0.04796 to 0.04676, saving model to models/best_freez_embs.hdf5
 - 1561s - loss: 0.0551 - acc: 0.9801 - val_loss: 0.0468 - val_acc: 0.9825
Epoch 8/12

 ROC-AUC - epoch: 8 - score: 0.986495 

Epoch 00008: val_loss did not improve
 - 1562s - loss: 0.0535 - acc: 0.9805 - val_loss: 0.0510 - val_acc: 0.9797
Epoch 9/12

 ROC-AUC - epoch: 9 - score: 0.986921 

Epoch 00009: val_loss improved from 0.04676 to 0.04492, saving model to models/best_freez_embs.hdf5
 - 1495s - loss: 0.0528 - acc: 0.9807 - val_loss: 0.0449 - val_acc: 0.9826
Epoch 10/12


KeyboardInterrupt: 

In [None]:
# Restore the weights stored in the checkpoint file, then keep training with
# the epoch counter resuming at 4 (0-based) up to epoch 8.
model.load_weights('models/best_freez_embs.hdf5')
hist = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    initial_epoch=4,
    epochs=8,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc, check_point, early_stop],
    verbose=2
)

In [None]:
# Display the object returned by the most recent `model.fit` call.
hist

In [None]:
# Scratch expression; it does not touch any notebook state.
1+1 

In [14]:
# Load stored weights into the current model, score the test set and write the
# six per-label outputs into the submission frame.
# NOTE(review): this path differs from the checkpoint file the training cells
# of this notebook write ('models/best_freez_embs.hdf5') -- confirm it is the
# intended weights file.
model.load_weights('models/best_right_tockzer.hdf5')
y_pred = model.predict(x_test, batch_size=1024)
submission[[
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"
]] = y_pred

In [15]:
# Persist the filled-in submission frame without the row index.
submission.to_csv(path_or_buf='submission_1103.csv', index=False)

In [16]:
! kg config -c jigsaw-toxic-comment-classification-challenge -u cerdgio86@gmail.com -p 68918082

! kg submit 'submission_1103.csv' -m "___s____"

0.9834


In [29]:
# `load_model` is imported here for the model-building cell further down.
from keras.models import load_model

# Save the current model to a single HDF5 file.
model.save(filepath='models/my_model_9829.h5')

In [None]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Callbacks for `build_model`. This rebinds `file_path`, `check_point` and
# `early_stop`, which the earlier training cells also use.
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
# The validation split produced by train_test_split in this notebook is named
# X_val / y_val; X_valid / Y_valid are not defined anywhere.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    """Build, train and reload a frozen-embedding BiGRU + Conv1D classifier.

    Reads the module-level `maxlen`, `embedding_matrix`, the training split
    (`X_tra`, `y_tra`, `X_val`, `y_val`), the callbacks `ra_val`,
    `check_point`, `early_stop`, and `file_path`.

    Parameters
    ----------
    lr : float
        Learning rate passed to Adam.
    lr_d : float
        Learning-rate decay passed to Adam.
    units : int
        Number of GRU units per direction.
    dr : float
        Rate of the spatial dropout applied to the embeddings.

    The defaults are all zero, so callers are presumably expected to pass
    every argument explicitly.

    Returns
    -------
    keras.models.Model
        The model loaded back from `file_path` after 4 epochs of training.
    """
    # The Embedding layer requires weights of shape (input_dim, output_dim), so
    # both dimensions are taken from the matrix itself.
    vocab_size, emb_dim = embedding_matrix.shape

    # The padded length is named `maxlen` in this notebook.
    inp = Input(shape = (maxlen,))
    x = Embedding(vocab_size, emb_dim, weights = [embedding_matrix], trainable = False)(inp)
    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    # Train on the padded split (X_tra / y_tra, X_val / y_val). The notebook's
    # `X_train` holds unpadded token lists, and `Y_train`, `X_valid` and
    # `Y_valid` are not defined.
    model.fit(X_tra, y_tra, batch_size = 128, epochs = 4, validation_data = (X_val, y_val),
              verbose = 1, callbacks = [ra_val, check_point, early_stop])
    # Return what the checkpoint callback stored, not the last-epoch weights.
    model = load_model(file_path)
    return model