# Simple GRU network with pretrained vectors for initialization

In [4]:
import sys, os, re, csv, codecs, gc, numpy as np, pandas as pd
import tensorflow as tf
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Permute, GRU, Conv1D, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU, concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, BatchNormalization, SpatialDropout1D, Dot
from keras.optimizers import Adam, RMSprop
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras_tqdm import TQDMNotebookCallback
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from functools import reduce
from keras.layers import Layer, PReLU, SpatialDropout1D
from keras import initializers
from sklearn.model_selection import cross_val_predict

from nltk.tokenize import word_tokenize, wordpunct_tokenize, TweetTokenizer, MWETokenizer, ToktokTokenizer
from nltk.corpus import stopwords

import unicodedata
from collections import Counter
import itertools

np.random.seed(786)

from Tokenizer import Tokenizer
from ZeroMaskedLayer import ZeroMaskedLayer
from AttentionLayer import AttentionLayer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [45]:
# Directory layout for the competition data and the shared resources.
path = '../input/'
utility_path = '../utility/'
comp = 'jigsaw-toxic-comment-classification-challenge/'

# Input files, addressed relative to the directories above.
EMBEDDING_FILE = utility_path + 'glove.42B.300d.txt'
TRAIN_DATA_FILE = path + 'train.csv'
TEST_DATA_FILE = path + 'test.csv'

In [46]:
# Scratch check: see how word_tokenize splits a contraction and repeated punctuation.
word_tokenize("I won't do this check!!!")

['I', 'wo', "n't", 'do', 'this', 'check', '!', '!', '!']

In [47]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; predictions on ``X_val`` are scored
        against ``y_val``.
    interval : int
        Score only on epochs whose zero-based index is a multiple of this.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) as the starting point: super(Callback, self)
        # begins the lookup *after* Callback and so never runs Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation ROC-AUC for this epoch when it is due.

        ``logs`` is accepted for the Keras callback signature and is not read.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Epochs are reported one-based to match the "Epoch i/n" progress lines.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [98]:
def unicodeToAscii(series):
    """Return ``series`` with every value cast to ``str`` and NFKC-normalized."""
    def _nfkc(value):
        return unicodedata.normalize('NFKC', str(value))

    return series.apply(_nfkc)


def multiple_replace(text, adict):
    """Replace every occurrence of each key of ``adict`` in ``text`` in one pass.

    Parameters
    ----------
    text : str
        The string to rewrite.
    adict : dict
        Maps literal substrings (not regex patterns) to their replacements.

    Returns
    -------
    str
        ``text`` with all keys substituted; returned unchanged when ``adict``
        is empty.
    """
    # An empty mapping would compile to the empty pattern, which matches at
    # every position and then fails the dict lookup on ''.
    if not adict:
        return text

    # Regex alternation takes the first alternative that matches, so try the
    # longest keys first; otherwise "a" would shadow "ab".
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))

    return rx.sub(lambda match: adict[match.group(0)], text)

STOP_WORDS = set(stopwords.words( 'english' ))
# Lowercase, collapse newlines, and reduce the text to a small character set
def normalizeString(series):
    """Normalize a pandas Series of comment strings.

    Steps, in order: NFKC-normalize via ``unicodeToAscii``, lowercase,
    turn runs of newlines into one space, delete hyphens, replace every run
    of characters outside ``0-9a-zA-Z.,!?"':`` with one space, replace
    dotted host-like tokens with ``url``, and replace every digit with ``0``.
    Leading and trailing spaces are not stripped.

    Every pattern is passed with ``regex=True`` so it is treated as a regular
    expression regardless of the pandas version's default for
    ``Series.str.replace``.
    """
    series = unicodeToAscii(series)
    series = series.str.lower()
    series = series.str.replace(r"(\n){1,}", " ", regex=True)
    series = series.str.replace(r"\-", "", regex=True)
    series = series.str.replace(r"[^0-9a-zA-Z.,!?\"':]+", " ", regex=True)
    # Replace URLs by the token "url": two or more dot-terminated labels
    # followed by an alphabetic suffix.
    series = series.str.replace(r"([a-z0-9]{2,}\.){2,}[a-z]{2,}", "url", regex=True)
    # Collapse all digits to "0" so that numbers share tokens.
    series = series.str.replace(r"\d", "0", regex=True)

    return series


In [49]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

print(train.shape, test.shape)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Get validation folds.
# Stratify on the full label combination: concatenate the six 0/1 labels into
# a string such as "100010" and use that string as the stratum.
train['target_str'] = reduce(lambda left, right: left + right,
                             [train[col].astype(str) for col in list_classes])
# Two label combinations are folded into the all-zero stratum
# (presumably too rare to spread over the folds -- TODO confirm).
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
fold_strata = train['target_str'].astype('category')
# No random_state here: without shuffle=True it has no effect on the folds,
# and current scikit-learn raises ValueError for that combination. The
# unshuffled folds are deterministic and identical to the original ones.
cvlist1 = list(StratifiedKFold(n_splits=10).split(train, fold_strata))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, fold_strata))

(159571, 8) (153164, 2)




In [74]:
# Normalize the comment text of both frames, overwriting the original column.
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

In [99]:
# Spot-check one randomly sampled test comment (unseeded, so it differs per run).
test.comment_text.sample(1).values[0]

' delete request someone hacked my account. i have fixed it now, but please delete this page: george walker faggot '

In [76]:
#pd.concat([train["comment_text"].astype(str), test["comment_text"].astype(str)]).reset_index(drop=True)[:len(train), :]

In [123]:
# Vocabulary cap and sequence length handed to the project-local Tokenizer;
# MAX_LEN is also the model's input length.
MAX_FEATURES = 80000
MAX_LEN = 150

# Word-level tokenization. The tokenizer is fit on train and test text together,
# so both share one vocabulary.
tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=word_tokenize)
X = tok.fit_transform(pd.concat([train["comment_text"].astype(str), test["comment_text"].astype(str)]))
# Rows come back in concatenation order: the first len(train) rows are the training set.
X_train = X[:len(train), :]
X_test = X[len(train):, :]

print(X_train.shape, X_test.shape)

(159571, 150) (153164, 150)


In [87]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [124]:
# Scratch check: is a token with an inner period split by word_tokenize?
word_tokenize("dick.sum")

['dick.sum']

In [None]:
# Leftover scratch expression: the instance is created and discarded.
TfidfTransformer()

In [122]:
# Inspect the tail (last 100) of the 80000 most common entries of tok.doc_freq.
tok.doc_freq.most_common(80000)[-100:]

[('realible', 3),
 ('tyrannosaurs', 3),
 ('wikitalk', 3),
 ('jaron', 3),
 ('hokku', 3),
 ('bogdangiusca', 3),
 ('lmaz', 3),
 ('kwekwe', 3),
 ('dupri', 3),
 ('mistakened', 3),
 ('stitching', 3),
 ('nonmalicious', 3),
 ('abruptness', 3),
 ('riefenstahl', 3),
 ('sapience', 3),
 ("hawai'i", 3),
 ('dispelled', 3),
 ('tekhelet', 3),
 ('kanai', 3),
 ('questionmarks', 3),
 ('ugadawgs', 3),
 ('angiotensin', 3),
 ('perlis', 3),
 ('groaning', 3),
 ('floozie', 3),
 ('squealed', 3),
 ('crackwhore', 3),
 ('spurted', 3),
 ('deflagration', 3),
 ('neusner', 3),
 ('fattie', 3),
 ('decal', 3),
 ('hfcs', 3),
 ('captialization', 3),
 ('bethe', 3),
 ('psychoenergetic', 3),
 ('trott', 3),
 ('antibias', 3),
 ('neutralist', 3),
 ('livius.org', 3),
 ('lendering', 3),
 ('reawakening', 3),
 ('kajol', 3),
 ('britishenglish', 3),
 ('valedictorian', 3),
 ('disqualifications', 3),
 ('nsuboy', 3),
 ('asist', 3),
 ('juvenal', 3),
 ('escapade', 3),
 ('intdablink', 3),
 ('leadership.aspx', 3),
 ('effete', 3),
 ('roars', 

In [130]:
import sentencepiece as spm
# Empty processor; the model file is loaded in a separate step.
sp = spm.SentencePieceProcessor()


In [131]:
# Load the BPE model that matches the pretrained w2v vectors read later in this file.
sp.Load("../utility/en.wiki.bpe.op200000.model")


True

In [132]:
# Label count and mean for comments containing "room" followed by a character.
# NOTE(review): str.contains treats the pattern as a regex by default, so "." matches
# any character, not only a literal period -- confirm which was intended.
train.loc[train.comment_text.str.contains("room."), list_classes].agg(['count', 'mean'])

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,657.0,657.0,657.0,657.0,657.0,657.0
mean,0.079148,0.0,0.041096,0.003044,0.039574,0.004566


In [141]:
# Scratch check: how does the BPE model split a token with an inner period?
sp.EncodeAsPieces("bitches.fuck")

['▁bitches', '.', 'fuck']

In [142]:
# Load the pretrained BPE vectors into {token: float32 vector}.
# Each line is "<token> <v1> <v2> ...". Parsing is done inline so this cell does
# not depend on get_coefs, which is only defined further down the file.
# The file is opened as UTF-8 (tokens contain non-ASCII characters such as the
# piece marker) and closed deterministically by the context manager.
embeddings_index = {}
with open("../utility/en.wiki.bpe.op200000.d300.w2v.txt", encoding="utf-8") as vec_file:
    for line in vec_file:
        token, *values = line.rstrip().rsplit(' ')
        embeddings_index[token] = np.asarray(values, dtype='float32')
len(embeddings_index)

187990

In [151]:
#embeddings_index
# Re-tokenize with SentencePiece BPE pieces instead of words. This rebinds
# tok, X, X_train and X_test, replacing the word-level versions built earlier.
tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=sp.EncodeAsPieces)
X = tok.fit_transform(pd.concat([train["comment_text"].astype(str), test["comment_text"].astype(str)]))
# Rows come back in concatenation order: the first len(train) rows are the training set.
X_train = X[:len(train), :]
X_test = X[len(train):, :]

print(X_train.shape, X_test.shape)

(159571, 150) (153164, 150)


In [155]:
# Width of each embedding vector.
EMBED_SIZE = 300


def get_coefs(word, *arr):
    """Pair ``word`` with its remaining fields converted to a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector


# Module-level list of out-of-vocabulary words.
oov_list = []
def initialize_embeddings(filename, embeddings_index, tokenizer):
    """Build the embedding weight matrix for the tokenizer's vocabulary.

    Parameters
    ----------
    filename : str
        Unused; kept so existing callers keep working. The vectors are taken
        from ``embeddings_index``.
    embeddings_index : dict
        Maps a token to its pretrained vector.
    tokenizer : object
        Must expose ``vocab_idx``, a mapping from token to integer row index.

    Returns
    -------
    tuple
        ``(embedding_matrix, oov_words)``: a
        ``(min(MAX_FEATURES, len(vocab)) + 1, EMBED_SIZE)`` array whose rows
        stay zero for tokens without a pretrained vector, and the list of
        those tokens.
    """
    word_index = tokenizer.vocab_idx
    nb_words = min(MAX_FEATURES+1, len(word_index)+1)
    embedding_matrix = np.zeros((nb_words, EMBED_SIZE))
    # Collect misses in a fresh list per call; appending to a module-level list
    # made every repeated call return the misses of all earlier calls too.
    oov_words = []
    for word, i in word_index.items():
        # Bound by the matrix height (equal to MAX_FEATURES+1 when the
        # vocabulary is at least that large) so small vocabularies cannot
        # index past the last row.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            oov_words.append(word)
    return (embedding_matrix, oov_words)

In [163]:
# Inspect the tail (last 100) of the 83000 most common entries of tok.doc_freq.
tok.doc_freq.most_common(83000)[-100:]

[('olest', 3),
 ('▁flin', 3),
 ('▁tarski', 3),
 ('▁maiko', 3),
 ('▁sandanski', 3),
 ('▁combust', 3),
 ('▁malaga', 3),
 ('▁tapeworm', 3),
 ('▁borax', 3),
 ('▁phoen', 3),
 ('▁ival', 3),
 ('▁disob', 3),
 ('▁insideout', 3),
 ('▁heiden', 3),
 ('▁zaytsev', 3),
 ('▁deodor', 3),
 ('▁mechanization', 3),
 ('rca', 3),
 ('jill', 3),
 ('▁dissolves', 3),
 ('▁urbino', 3),
 ('tener', 3),
 ('▁sanaag', 3),
 ('urta', 3),
 ('▁dara', 3),
 ('meena', 3),
 ('▁polla', 3),
 ('▁iupui', 3),
 ('tto', 3),
 ('▁cleaving', 3),
 ('▁maye', 3),
 ('▁mussels', 3),
 ('▁delicacies', 3),
 ('eniya', 3),
 ('▁schreiber', 3),
 ('▁fitzgibbon', 3),
 ('udong', 3),
 ('▁pue', 3),
 ('▁subvers', 3),
 ('leva', 3),
 ('▁metastable', 3),
 ('▁phonograph', 3),
 ('shul', 3),
 ('sbee', 3),
 ('▁wendland', 3),
 ('▁gher', 3),
 ('equipment', 3),
 ('venomous', 3),
 ('ribo', 3),
 ('▁pessoa', 3),
 ('nicas', 3),
 ('inid', 3),
 ('▁woodward', 3),
 ('▁polyglot', 3),
 ('▁surjective', 3),
 ('▁typifies', 3),
 ('ghz', 3),
 ('▁unleashes', 3),
 ('▁diamondback',

In [156]:
# Build the embedding matrix for the current tokenizer. EMBEDDING_FILE is passed
# only to fill the filename argument, which initialize_embeddings does not read;
# the vectors come from embeddings_index.
embedding_matrix, oov_list = initialize_embeddings(EMBEDDING_FILE, embeddings_index, tok)
print(embedding_matrix.shape)
# Summary statistics over the whole matrix, including all-zero rows.
print(np.mean(embedding_matrix), np.std(embedding_matrix))

(80001, 300)
0.0004608204601803373 0.28929758471650163


In [157]:
# How many vocabulary entries had no pretrained vector, and which ones.
print(len(oov_list))
oov_list

357


['▁jizz',
 '▁reall',
 'utzer',
 'cile',
 'copyr',
 '▁advertis',
 'incha',
 '▁cockro',
 'acuse',
 'amatory',
 'onsored',
 '▁inadvert',
 '▁clich',
 'ecially',
 '▁pagen',
 '▁attem',
 '▁answ',
 'ungeons',
 'rotum',
 '▁encycl',
 'trypt',
 '▁behav',
 '▁deleter',
 '▁antide',
 '▁excer',
 'reputable',
 '▁makedon',
 'irected',
 '▁embarr',
 '▁mahaw',
 '▁undoub',
 'olnick',
 '▁acknow',
 'schnitt',
 '▁eshte',
 '▁poved',
 '▁irond',
 '▁copyr',
 '▁metaw',
 'atever',
 '▁toky',
 '▁criter',
 'leves',
 '▁scientif',
 '▁appre',
 'ecock',
 '▁welcom',
 '▁proport',
 '▁confir',
 '▁inconsist',
 '▁pejor',
 'urday',
 'ypedia',
 'oburg',
 'eday',
 'illaz',
 'mosin',
 '▁einst',
 'itorul',
 'piff',
 '▁schnitz',
 'viously',
 '▁tomor',
 '▁zapat',
 '▁sevend',
 'ruction',
 '▁recip',
 '▁eaba',
 'ignatus',
 '▁refere',
 '▁achiev',
 '▁suce',
 '▁compre',
 'tirol',
 'namese',
 '▁resemb',
 'ailando',
 '▁yusufali',
 '▁qiw',
 'icance',
 'otre',
 '▁errone',
 '▁outrig',
 '▁disg',
 '▁nongo',
 'amilton',
 'arettes',
 'adays',
 'letel

In [170]:
from sklearn.base import BaseEstimator, ClassifierMixin
class GRUClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style estimator wrapping a Keras GRU text classifier.

    The network is Embedding -> SpatialDropout1D -> (Bidirectional) CuDNNGRU
    -> pooling -> Dropout -> Dense + PReLU -> sigmoid Dense. Sequence length,
    vocabulary size and embedding width come from the module-level
    ``MAX_LEN``, ``MAX_FEATURES`` and ``EMBED_SIZE``.

    Parameters
    ----------
    gru_dim : int
        Units of the GRU layer (cast with ``int`` when the model is built).
    dense_dim : int
        Units of the hidden dense layer.
    batch_size, epochs, verbose
        Forwarded to the Keras model's ``fit``.
    bidirectional : bool
        Wrap the GRU in ``Bidirectional``.
    pool_type : {'avg', 'max', 'attn', 'all'}
        How the GRU output sequence is reduced. 'avg', 'max' and 'attn'
        concatenate the pooled sequence with the GRU state; 'all'
        concatenates attention and max pooling and does not use the state.
    initial_weights : array or None
        Initial embedding matrix. When ``None`` no weights are passed, so the
        embedding keeps Keras' own initialisation.
    optimizer : {'adam', 'rmsprop'}
        Both are created with ``clipnorm=1.0``.
    out_dim : int
        Number of sigmoid outputs.
    callbacks : list or None
        Keras callbacks used during ``fit``.
    spatial_drop, dropout : float
        Rates for the SpatialDropout1D after the embedding and the Dropout
        before the dense layer.
    mask_zero : bool
        Passed to the Embedding layer; when true the embedding output is also
        run through ``ZeroMaskedLayer``.
    gru_kernel_regularization, gru_recurrent_regularization,
    gru_bias_regularization : float
        L2 factors for the GRU. They are applied on the unidirectional path
        only.
    embeddings_regularization : float
        Stored but currently unused.
    """

    _POOL_TYPES = ('avg', 'max', 'attn', 'all')
    _OPTIMIZERS = ('adam', 'rmsprop')

    def __init__(self, gru_dim=150, dense_dim=256, batch_size=128, epochs=2, bidirectional=False,
                 pool_type='all', initial_weights=None, optimizer='adam' ,verbose=1, out_dim=6, callbacks=None,
                spatial_drop=0.0, dropout=0.0, mask_zero=True,
                gru_kernel_regularization = 0.0,
                gru_recurrent_regularization = 0.0,
                gru_bias_regularization = 0.0,
                embeddings_regularization = 0.0,
                ):
        # Parameters are stored unmodified under their own names, as the
        # scikit-learn get_params / set_params protocol expects.
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.batch_size = batch_size
        self.epochs= epochs
        self.bidirectional = bidirectional
        self.pool_type = pool_type
        self.initial_weights = initial_weights
        self.verbose = verbose
        self.callbacks = callbacks
        self.optimizer = optimizer
        self.out_dim = out_dim
        self.spatial_drop = spatial_drop
        self.dropout = dropout
        self.mask_zero = mask_zero
        self.gru_kernel_regularization = gru_kernel_regularization
        self.gru_recurrent_regularization = gru_recurrent_regularization
        self.gru_bias_regularization = gru_bias_regularization
        self.embeddings_regularization = embeddings_regularization

    def _build_model(self):
        """Assemble and compile the Keras model.

        Raises
        ------
        ValueError
            If ``pool_type`` or ``optimizer`` is not a supported name.
        """
        # Fail early with a clear message: an unknown pool_type used to leave
        # the sequence unpooled, and an unknown optimizer left `opt` unbound.
        if self.pool_type not in self._POOL_TYPES:
            raise ValueError("Unknown pool_type {!r}; expected one of {}".format(
                self.pool_type, self._POOL_TYPES))
        if self.optimizer not in self._OPTIMIZERS:
            raise ValueError("Unknown optimizer {!r}; expected one of {}".format(
                self.optimizer, self._OPTIMIZERS))

        inp = Input(shape=(MAX_LEN,))
        embedding_kwargs = dict(mask_zero=self.mask_zero, trainable=True)
        # Only hand over weights when a matrix was supplied.
        if self.initial_weights is not None:
            embedding_kwargs['weights'] = [self.initial_weights]
        emb = Embedding(MAX_FEATURES+1, EMBED_SIZE, **embedding_kwargs)(inp)

        if self.mask_zero:
            emb = ZeroMaskedLayer()(emb)

        emb2 = SpatialDropout1D(self.spatial_drop)(emb)
        if self.bidirectional:
            # NOTE(review): the gru_*_regularization factors are not applied on
            # this path -- confirm whether that is intended.
            enc = Bidirectional(CuDNNGRU(int(self.gru_dim), return_sequences=True, return_state=True, stateful=False,
                                         ))(emb2)
            x = enc[0]
            # NOTE(review): only the first returned state is kept; confirm
            # whether the remaining state(s) in `enc` should be used as well.
            state = enc[1]
        else:
            x, state = CuDNNGRU(int(self.gru_dim), return_sequences=True, return_state=True,
                            kernel_regularizer=regularizers.l2(self.gru_kernel_regularization),
                            recurrent_regularizer=regularizers.l2(self.gru_recurrent_regularization),
                            bias_regularizer=regularizers.l2(self.gru_bias_regularization)
                               )(emb2)

        if self.pool_type == 'avg':
            x = GlobalAveragePooling1D()(x)
            x = concatenate([x, state])

        elif self.pool_type == 'max':
            x = GlobalMaxPool1D()(x)
            x = concatenate([x, state])

        elif self.pool_type == 'attn':
            x = AttentionLayer(MAX_LEN)(x)
            x = concatenate([x, state])

        else:  # 'all' (validated above)
            x4 = GlobalMaxPool1D()(x)
            x5 = AttentionLayer(MAX_LEN)(x)
            x = concatenate([x5, x4])

        x = Dropout(self.dropout)(x)
        x = Dense(self.dense_dim)(x)
        x = PReLU()(x)

        out = Dense(self.out_dim, activation="sigmoid")(x)
        if self.optimizer == 'adam':
            opt = Adam(lr=0.001, decay=0.0, clipnorm=1.0)
        else:  # 'rmsprop' (validated above)
            opt = RMSprop(clipnorm=1.0)
        model = Model(inputs=inp, outputs=out)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model

    def fit(self, X, y):
        """Build a fresh model and train it on ``X``/``y``; returns ``self``."""
        self.model = self._build_model()

        # An empty or missing callback list is passed as None.
        self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs,
                       verbose=self.verbose,
                       callbacks=self.callbacks if self.callbacks else None,
                       shuffle=True)
        return self

    def predict(self, X, y=None):
        """Return the model's sigmoid outputs for ``X``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        # self.model only exists after fit(); reading it directly raised
        # AttributeError instead of the intended ValueError.
        model = getattr(self, 'model', None)
        if model is None:
            raise ValueError("Model not fit yet")
        return model.predict(X, batch_size=1024)

In [166]:
def lr_decay(epoch):
    """Return the learning rate for the zero-based ``epoch``.

    Epochs 0-3 use a fixed table. Any other epoch gets the table's final rate
    instead of ``None``, which a learning-rate scheduler cannot use.
    """
    schedule = {0: 0.0016, 1: 0.00024, 2: 0.001, 3: 0.00001}
    return schedule.get(epoch, 0.00001)


def shuffle_crossvalidator(model, cvlist, X, y, lr_decay):
    """Fit ``model`` on every split of ``cvlist`` and score the held-out rows.

    Parameters
    ----------
    model : estimator
        Must support ``set_params(callbacks=...)``, ``fit`` and ``predict``.
    cvlist : iterable of (train_index, val_index)
        Row indices for each split.
    X, y : 2-D arrays
        Features and labels, indexed by row.
    lr_decay : callable
        ``epoch -> learning rate``, wrapped in a ``LearningRateScheduler``.

    Returns
    -------
    tuple
        ``(y_preds, y_trues, scores)``: validation predictions and labels
        concatenated in split order, and the per-split ROC-AUC values.
    """
    y_trues = []
    y_preds = []
    scores = []
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        print("ROC AUC for this fold is ", score)
        y_trues.append(y_val)
        y_preds.append(y_pred)
        # Release the Keras graph and Python garbage only after predicting.
        K.clear_session()
        gc.collect()
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    score = roc_auc_score(y_trues, y_preds)
    # Report the number of splits actually run instead of a hard-coded 10.
    print("Overall score on {} fold CV is {}".format(len(scores), score))

    return y_preds, y_trues, scores

def outoffold_crossvalidator(model_params, cvlist, X, y, lr_decay):
    """Produce out-of-fold predictions for ``y`` over the splits in ``cvlist``.

    Parameters
    ----------
    model_params : estimator or dict
        The estimator to train, or a dict of ``GRUClassifier`` keyword
        arguments from which one is built.
    cvlist : iterable of (train_index, val_index)
        Row indices for each fold.
    X, y : 2-D arrays
        Features and labels, indexed by row.
    lr_decay : callable
        ``epoch -> learning rate``, wrapped in a ``LearningRateScheduler``.

    Returns
    -------
    tuple
        ``(y_preds, y_trues, score)``: predictions aligned row-by-row with
        ``y`` (rows never held out stay zero), the labels ``y`` themselves,
        and the overall ROC-AUC.
    """
    # Use the argument instead of a global named `model`.
    if isinstance(model_params, dict):
        model = GRUClassifier(**model_params)
    else:
        model = model_params

    y_preds = np.zeros(y.shape)
    LRDecay = LearningRateScheduler(lr_decay)
    n_folds = 0

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        print("ROC AUC for this fold is ", roc_auc_score(y_val, y_pred))
        # Was `val_idx`, an undefined name.
        y_preds[val_index] = y_pred
        K.clear_session()
        gc.collect()
        n_folds += 1
        # No `break` here: stopping after the first fold left every other row
        # at zero and made the overall score meaningless.
    score = roc_auc_score(y, y_preds)
    print("Overall score on {} fold CV is {}".format(n_folds, score))

    # Out-of-fold predictions line up with `y`, so `y` is the truth array
    # (the old `y_trues` was never defined in this function).
    return y_preds, y, score


In [174]:
import tensorflow as tf
def lr_decay(epoch):
    """Learning rate for the zero-based ``epoch``: two special-cased epochs, then 0.001."""
    early_rates = {0: 0.0015, 1: 0.0003}
    return early_rates.get(epoch, 0.001)

# Start from a clean Keras session before building the model.
K.clear_session()
#config = tf.ConfigProto(
#        device_count = {'GPU': 1}
#    )
#sess = tf.Session(config=config)
# Bidirectional GRU with 'all' pooling, initialised from the pretrained embedding matrix.
model = GRUClassifier(gru_dim=300, dense_dim=600, initial_weights=embedding_matrix, bidirectional=True,
                    batch_size=64, epochs=2, optimizer='adam', pool_type='all', dropout=0.2, spatial_drop=0.3, mask_zero=False)

# Evaluate on the 5 stratified shuffle splits (cvlist2); the per-split scores are discarded.
y_preds, y_trues, _ = shuffle_crossvalidator(model, cvlist2, X_train, y, lr_decay)

Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.989571 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.990403 

ROC AUC for this fold is  0.9904027520417942
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.989074 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.990222 

ROC AUC for this fold is  0.9902219043610222
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.987388 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988239 

ROC AUC for this fold is  0.9882387141752084
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988585 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.990234 

ROC AUC for this fold is  0.9902338683768446
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986473 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988387 

ROC AUC for this fold is  0.988387324593393
Overall score on 10 fold CV is 0.989193948914337


In [175]:
# Free Python garbage and reset the Keras session before the next experiment.
import gc 
gc.collect()
K.clear_session()

In [178]:
# Hyper-parameter settings to bag. Every value is wrapped in a one-element list.
# 'bidirectional', 'mask_zero', 'pool_type' and 'optimizer' are indices into the
# option lists used by train_predict, not the option values themselves;
# 'lr1' and 'lr2' are the learning rates for epochs 0 and 1.
parameter_list = [{'batch_size': [47.0],
  'bidirectional': [0],
  'dense_dim': [973.0],
  'dropout': [0.19862535182199834],
  'gru_bias_reg': [1.011936859273273e-08],
  'gru_dim': [358.0],
  'gru_kernel_reg': [2.0678669679829352e-10],
  'gru_recc_reg': [8.946942716621634e-07],
  'lr1': [0.0015982451490776767],
  'lr2': [0.0002459290205687559],
  'mask_zero': [1],
  'optimizer': [0],
  'pool_type': [3],
  'spatial_drop': [0.2696100622336198]},
 {'batch_size': [83.0],
  'bidirectional': [0],
  'dense_dim': [351.0],
  'dropout': [0.07833431778315075],
  'gru_bias_reg': [1.989216237371643e-09],
  'gru_dim': [478.0],
  'gru_kernel_reg': [2.1606860352426398e-10],
  'gru_recc_reg': [1.6736919208281796e-07],
  'lr1': [0.00263784102869703],
  'lr2': [0.0005711207564167526],
  'mask_zero': [1],
  'optimizer': [0],
  'pool_type': [3],
  'spatial_drop': [0.21401382410917008]},
 {'batch_size': [49.0],
  'bidirectional': [0],
  'dense_dim': [997.0],
  'dropout': [0.19115533803668047],
  'gru_bias_reg': [5.222640591389245e-10],
  'gru_dim': [399.0],
  'gru_kernel_reg': [8.078459790975857e-10],
  'gru_recc_reg': [6.100081276448957e-08],
  'lr1': [0.0019427338445684181],
  'lr2': [0.00010186610979091696],
  'mask_zero': [1],
  'optimizer': [0],
  'pool_type': [3],
  'spatial_drop': [0.22614208466560007]},
 {'batch_size': [41.0],
  'bidirectional': [0],
  'dense_dim': [973.0],
  'dropout': [0.20050865242539928],
  'gru_bias_reg': [1.1451922219328368e-08],
  'gru_dim': [392.0],
  'gru_kernel_reg': [1.0516629869555607e-09],
  'gru_recc_reg': [1.2593577396164419e-06],
  'lr1': [0.0016205788115723873],
  'lr2': [0.00011538601448660545],
  'mask_zero': [1],
  'optimizer': [0],
  'pool_type': [3],
  'spatial_drop': [0.3803897135211322]},
 {'batch_size': [37.0],
  'bidirectional': [0],
  'dense_dim': [237.0],
  'dropout': [0.12273937792021693],
  'gru_bias_reg': [2.7055793227129377e-09],
  'gru_dim': [407.0],
  'gru_kernel_reg': [1.9122269544090935e-09],
  'gru_recc_reg': [1.5269966614646778e-06],
  'lr1': [0.0019545667587842147],
  'lr2': [0.00034205962093229346],
  'mask_zero': [1],
  'optimizer': [0],
  'pool_type': [3],
  'spatial_drop': [0.239366738134983]},]

In [179]:
#Pick top 10 parameter settings, Bag models for those settings
#Try linear blending on those settings
#NUM_BAGS = 10
#cvlist3 = list(StratifiedShuffleSplit(n_splits=NUM_BAGS, test_size=0.05, random_state=786).split(y, y[:,2]))


def shuffle_train_predict(model, cvlist, X, y, X_test, lr_decay):
    """Fit ``model`` on every split and collect validation and test predictions.

    Parameters
    ----------
    model : estimator
        Must support ``set_params(callbacks=...)``, ``fit`` and ``predict``.
    cvlist : iterable of (train_index, val_index)
        Row indices for each split.
    X, y : 2-D arrays
        Training features and labels, indexed by row.
    X_test : 2-D array
        Test features; predicted once per split.
    lr_decay : callable
        ``epoch -> learning rate``, wrapped in a ``LearningRateScheduler``.

    Returns
    -------
    tuple
        ``(y_preds, y_trues, y_test_preds)``: validation predictions and
        labels concatenated in split order, and the test predictions averaged
        over the splits.
    """
    y_trues = []
    y_preds = []
    y_test_preds = []
    scores = []
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        print("ROC AUC for this fold is ", score)
        y_trues.append(y_val)
        y_preds.append(y_pred)
        y_test_preds.append(y_test_pred)
        # Release the Keras graph and Python garbage only after predicting.
        K.clear_session()
        gc.collect()
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    # Average the per-split test predictions element-wise.
    y_test_preds = np.mean(y_test_preds, axis=0)
    print("Shape of test _preds is ", y_test_preds.shape)
    # Per-class means (axis=0), matching oof_train_predict; axis=1 printed one
    # mean per row, which is not a usable summary.
    print("Means of val and test preds are {} and {}".format(np.mean(y_preds, axis=0), np.mean(y_test_preds, axis=0)))
    score = roc_auc_score(y_trues, y_preds)
    # Report the number of splits actually run instead of a hard-coded 10.
    print("Overall score on {} fold CV is {}".format(len(scores), score))

    return y_preds, y_trues, y_test_preds

def oof_train_predict(model, cvlist, X, y, X_test, lr_decay):
    """Train ``model`` per fold; return out-of-fold and fold-averaged test predictions.

    For each ``(train_index, val_index)`` pair in ``cvlist`` the model is
    refit on the training rows, its predictions on the held-out rows are
    written into the matching rows of an array shaped like ``y``, and its
    predictions on ``X_test`` are kept. The test predictions are averaged
    over the folds.

    Returns ``(y_preds, y_test_preds)``.
    """
    oof = np.zeros(y.shape)
    test_runs = []
    fold_aucs = []
    scheduler = LearningRateScheduler(lr_decay)

    for fit_rows, held_rows in cvlist:
        X_fit, y_fit = X[fit_rows, :], y[fit_rows, :]
        X_held, y_held = X[held_rows, :], y[held_rows, :]
        auc_callback = RocAucEvaluation(validation_data=(X_held, y_held), interval=1)

        model.set_params(callbacks=[auc_callback, scheduler])
        model.fit(X_fit, y_fit)

        held_pred = model.predict(X_held)
        test_pred = model.predict(X_test)
        fold_auc = roc_auc_score(y_held, held_pred)
        fold_aucs.append(fold_auc)
        print("ROC AUC for this fold is ", fold_auc)

        oof[held_rows, :] = held_pred
        test_runs.append(test_pred)

        # Drop the Keras graph and collect garbage before the next fold.
        K.clear_session()
        gc.collect()

    test_mean = np.mean(test_runs, axis=0)
    print("Shape of test _preds is ", test_mean.shape)
    oof_class_means = np.mean(oof, axis=0)
    test_class_means = np.mean(test_mean, axis=0)
    print("Means of val and test preds are {} and {}".format(oof_class_means, test_class_means))
    overall_auc = roc_auc_score(y, oof)
    print("Overall score on 10 fold CV is {}".format(overall_auc))

    return oof, test_mean

def train_predict(parameter_space):
    """Train one bagged model for a hyper-parameter setting.

    Parameters
    ----------
    parameter_space : dict
        One entry of ``parameter_list``: every value is a one-element list.
        ``bidirectional``, ``mask_zero``, ``pool_type`` and ``optimizer``
        are indices into the option lists below.

    Returns
    -------
    tuple
        ``(y_preds, y_trues, y_test_preds)``: out-of-fold predictions over
        ``cvlist1``, the matching labels (the module-level ``y``), and the
        fold-averaged test predictions.

    Reads the module-level ``embedding_matrix``, ``cvlist1``, ``X_train``,
    ``y`` and ``X_test``.
    """
    lr_first = parameter_space['lr1'][0]
    lr_later = parameter_space['lr2'][0]

    def lr_decay(epoch):
        # lr1 for the first epoch, lr2 afterwards. Falling through used to
        # return None for any epoch past the second.
        return lr_first if epoch == 0 else lr_later

    def pick(options, key):
        # The stored value is an index into `options`; int() guards against
        # the index having been stored as a float.
        return options[int(parameter_space[key][0])]

    model = GRUClassifier(initial_weights=embedding_matrix,
                          bidirectional=pick([True, False], 'bidirectional'),
                          gru_dim = int(parameter_space['gru_dim'][0]),
                          dense_dim = int(parameter_space['dense_dim'][0]),
                          mask_zero = pick([True, False], 'mask_zero'),
                          pool_type = pick(['avg', 'max', 'attn', 'all'], 'pool_type'),
                          batch_size= int(parameter_space['batch_size'][0]),
                          epochs=2,
                          optimizer=pick(["adam", "rmsprop"], 'optimizer'),
                          dropout=parameter_space['dropout'][0],
                          spatial_drop=parameter_space['spatial_drop'][0],
                          gru_kernel_regularization = parameter_space["gru_kernel_reg"][0],
                          gru_recurrent_regularization = parameter_space["gru_recc_reg"][0],
                          gru_bias_regularization = parameter_space["gru_bias_reg"][0],
                          )

    y_preds, y_test_preds = oof_train_predict(model, cvlist1, X_train, y, X_test, lr_decay)
    # Out-of-fold predictions are aligned with `y`, so `y` is the truth array.
    # The old `y_trues` was not defined here and resolved to whatever global
    # an earlier cell had left behind.
    return y_preds, y, y_test_preds

#####
# Train one model per hyper-parameter setting and keep each run's
# validation predictions, labels and test predictions for blending.
y_preds_all = []
y_trues_all = []
y_test_preds_all = []
for params in parameter_list:
    y_preds, y_trues, y_test_preds = train_predict(params)
    y_preds_all.append(y_preds)
    y_trues_all.append(y_trues)
    y_test_preds_all.append(y_test_preds)


Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.985494 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.986598 

ROC AUC for this fold is  0.9865982493060157
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986314 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987023 

ROC AUC for this fold is  0.9870233444184867
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986948 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988438 

ROC AUC for this fold is  0.9884377331134698
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986578 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988573 

ROC AUC for this fold is  0.988572557935017
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.985507 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987536 

ROC AUC for this fold is  0.9875364667823487
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.987240 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988611 

ROC AUC for this fold is  0.9886119001844679
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.987497 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989103 

ROC AUC for this fold is  0.989

KeyboardInterrupt: 

In [None]:
#Check correlation between different predictions
#np.corrcoef(y_preds_all, axis=0)

In [181]:
#Try different stacking approaches
from scipy.stats import gmean, hmean

# Blend the bagged models with a geometric mean taken across models (axis 0 of the list).
# Assumes every entry of y_preds_all is aligned row-by-row with y -- TODO confirm.
preds_mean = gmean(y_preds_all, axis=0)
print(roc_auc_score(y, preds_mean))
test_preds_mean = gmean(y_test_preds_all, axis=0)


0.9875574585296875


In [None]:
# Fill the sample submission's class columns with the blended test predictions and save it.
# Assumes test_preds_mean rows are in the same order as sample_submission -- TODO confirm.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = test_preds_mean
sample_submission.to_csv('../input/gru_spemb_5bags_submission.csv', index=False)