In [None]:
import pandas as pd
import keras
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

import time
import gc

import os
print(os.listdir("../input"))

import matplotlib.pylab as plt
from gensim.models import KeyedVectors

# Any results you write to the current directory are saved as output.

import tensorflow as tf
# Fix the seeds of NumPy's global RNG and TensorFlow's graph-level RNG
# so that random draws made later in the notebook start from a known state.
np.random.seed(2019)
tf.set_random_seed(2019)

In [None]:
# --- Notebook-wide configuration -------------------------------------------
# Vocabulary cap handed to the Tokenizer; also the row count of each embedding matrix.
VOCAB_SIZE = 95000
# Every question is padded/truncated to this many tokens.
MAX_LEN = 70
# Dimensionality of the pretrained word vectors.
EMBEDDING_SIZE = 300

# Comma-separated paths of the pretrained embedding files to load.
# Another available file that is currently left out of the list:
#   ../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
EMBEDDING_LIST = (
    '../input/embeddings/glove.840B.300d/glove.840B.300d.txt,'
    '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
)

In [None]:
# Load the train/test tables. Later cells read the 'question_text' and
# 'target' columns from train_df and 'question_text' / 'qid' from test_df.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [None]:
# Pull the raw text / labels out of the frames as plain arrays.
train_Y = train_df['target'].values
train_X = train_df['question_text'].values
test_X = test_df['question_text'].values

# The training frame is not needed once its columns have been extracted.
del train_df

# The vocabulary is learned from the training questions only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(train_X))

# Text -> integer id sequences -> fixed-length arrays (padding appended at the end).
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X),
                        maxlen=MAX_LEN, padding='post')
test_X = pad_sequences(tokenizer.texts_to_sequences(test_X),
                       maxlen=MAX_LEN, padding='post')

**Process Embeddings**

In [None]:
def get_word_to_embedding_dict(file, embedding_size=None):
    """Parse a text-format embedding file into a ``{word: vector}`` dict.

    Each useful line is expected to be ``<word> <v1> ... <vN>`` separated by
    single spaces. The last ``embedding_size`` tokens are taken as the vector
    and everything before them as the word, so a word that itself contains
    spaces is kept intact instead of leaking text into the float conversion.

    Lines with fewer than ``embedding_size + 1`` tokens (e.g. a
    ``<count> <dim>`` header line, or a truncated record) are skipped.

    Args:
        file: Path of the embedding file. It is read as UTF-8 and bytes that
            cannot be decoded are dropped (``errors='ignore'``).
        embedding_size: Number of vector components per word. Defaults to the
            notebook-level ``EMBEDDING_SIZE``.

    Returns:
        dict mapping each word to a 1-D ``float32`` array of length
        ``embedding_size``.

    Raises:
        ValueError: if one of the trailing ``embedding_size`` tokens of a line
            cannot be converted to a float.
    """
    if embedding_size is None:
        embedding_size = EMBEDDING_SIZE

    word_to_embedding = {}
    with open(file, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            # Strip the trailing newline so the last component is a clean number.
            tokens = line.rstrip().split(' ')
            # A record needs one word token plus `embedding_size` numbers.
            if len(tokens) < embedding_size + 1:
                continue
            word = ' '.join(tokens[:-embedding_size])
            word_to_embedding[word] = np.asarray(tokens[-embedding_size:], dtype='float32')
    return word_to_embedding

In [None]:
# check if any words are missing from embedding
def check_missing(embed_dict, word_index=None, vocab_size=None):
    """Report which in-vocabulary words have no pretrained embedding.

    A word counts as found when either its exact form or its lower-cased form
    is a key of ``embed_dict``. Only words whose index is below ``vocab_size``
    are considered. Two summary lines (number missing, coverage percentage)
    are printed.

    Args:
        embed_dict: Mapping from word to embedding vector.
        word_index: Mapping from word to integer index. Defaults to the
            notebook-level ``tokenizer.word_index``.
        vocab_size: Indices ``>= vocab_size`` are ignored. Defaults to the
            notebook-level ``VOCAB_SIZE``.

    Returns:
        set of in-vocabulary words that were not found in ``embed_dict``.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if vocab_size is None:
        vocab_size = VOCAB_SIZE

    missing = set()
    found = set()
    for key, val in word_index.items():
        # Skip (rather than stop at) out-of-vocabulary indices so the result
        # does not depend on the iteration order of word_index.
        if val > vocab_size - 1:
            continue
        if key in embed_dict or key.lower() in embed_dict:
            found.add(key)
        else:
            missing.add(key)

    total = len(found) + len(missing)
    # Guard the ratio: with no in-vocabulary words there is nothing to divide by.
    coverage = len(found) / total if total else 0.0

    print('Missing {0} embeddings'.format(len(missing)))
    print('Found embeddings for {:.2%} of vocab'.format(coverage))

    return missing

In [None]:
def create_embedding_weights(embed_dict, embedding_matrix, word_index=None, vocab_size=None):
    """Copy pretrained vectors into the rows of ``embedding_matrix``.

    For every word whose index is below ``vocab_size``, row ``index`` of the
    matrix is overwritten with the word's vector, looked up first by exact
    form and then by lower-cased form. Rows of words found under neither form
    are left as they were.

    Args:
        embed_dict: Mapping from word to embedding vector.
        embedding_matrix: 2-D array indexed by word index; modified in place.
        word_index: Mapping from word to integer index. Defaults to the
            notebook-level ``tokenizer.word_index``.
        vocab_size: Indices ``>= vocab_size`` are ignored. Defaults to the
            notebook-level ``VOCAB_SIZE``.

    Returns:
        The same ``embedding_matrix`` object that was passed in.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if vocab_size is None:
        vocab_size = VOCAB_SIZE

    for key, val in word_index.items():
        # Skip (rather than stop at) out-of-vocabulary indices so the result
        # does not depend on the iteration order of word_index.
        if val > vocab_size - 1:
            continue
        if key in embed_dict:
            embedding_matrix[val] = embed_dict[key]
        else:
            lowered = key.lower()
            if lowered in embed_dict:
                embedding_matrix[val] = embed_dict[lowered]
    return embedding_matrix

In [None]:
# Build one (VOCAB_SIZE, EMBEDDING_SIZE) weight matrix per pretrained file.
all_embeddings = []

for embed_path in EMBEDDING_LIST.split(','):
    embed_dict = get_word_to_embedding_dict(embed_path)
    print('{0} has {1} embeddings'.format(embed_path, len(embed_dict)))
    check_missing(embed_dict)

    # np.stack requires a real sequence of arrays; a dict view is not one,
    # so materialise the values into a list first.
    all_embs = np.stack(list(embed_dict.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    # The stacked copy is only needed for the two statistics above.
    del all_embs

    # Rows start as noise matching the file's overall mean/std; rows of words
    # that have a pretrained vector are then overwritten.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (VOCAB_SIZE, EMBEDDING_SIZE))
    embedding_matrix = create_embedding_weights(embed_dict, embedding_matrix)

    all_embeddings.append(embedding_matrix)

    # Drop this file's dictionary before the next file is parsed, so two full
    # embedding tables are never held at the same time.
    del embed_dict, embedding_matrix
    gc.collect()

# NOTE(review): the purpose of this pause is not evident from the notebook; kept as-is.
time.sleep(10)

**Build Model**

In [None]:
from keras.models import Sequential, Model
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D, Multiply, Reshape, Dense, Activation, Embedding, LSTM, Input, Bidirectional, Lambda, CuDNNLSTM, CuDNNGRU, GlobalMaxPool1D, Dropout
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
import tensorflow as tf
from keras.callbacks import Callback

In [None]:
# https://www.kaggle.com/qqgeogor/keras-lstm-attention-glove840b-lb-0-043
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    For an input ``x`` with ``len(input_shape) == 3`` the layer computes, per
    step, a scalar score ``tanh(x . W + b)``, exponentiates it, zeroes masked
    steps (when a mask is given), normalises the scores over the step axis and
    returns the score-weighted sum of ``x`` over that axis. The output shape is
    ``(input_shape[0], input_shape[-1])``.

    # Arguments
        step_dim: number of steps the scores are reshaped to; it has to agree
            with ``input_shape[1]`` because the bias has that length.
        W_regularizer, b_regularizer: regularizers for the weight vector and
            the bias (resolved through ``regularizers.get``).
        W_constraint, b_constraint: constraints for the weight vector and the
            bias (resolved through ``constraints.get``).
        bias: whether to add a per-step bias to the scores.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and optional bias (one entry per step)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: the positional-shape call is the
        # legacy signature and is not what current add_weight expects first.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to downstream layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project every step onto W, then fold
        # the scores back to (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite if every step is masked out.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and the feature dimension; the step axis is summed away."""
        return input_shape[0],  self.features_dim

In [None]:
# Number of cross-validation folds; one model is built and trained per fold.
SPLITS = 4

In [None]:
def build_model():
    """Assemble and compile the classifier used for one CV fold.

    Pipeline, in order:
      1. The token ids go through one frozen Embedding layer per matrix in
         ``all_embeddings``; the results are stacked on a new last axis.
      2. A small BiLSTM + sigmoid Dense produces ``n_emb`` gate values per
         step; the stacked embeddings are multiplied by those gates and
         summed over the last axis, giving one blended embedding per step.
      3. A BiLSTM followed by a BiGRU encode the sequence.
      4. Attention over the LSTM output, attention over the GRU output, and
         average / max pooling of the GRU output are concatenated and fed to
         Dense(16, relu) -> Dense(1, sigmoid).

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam and the
        accuracy metric.
    """
    inputs = Input(shape=(MAX_LEN,))

    # One non-trainable lookup table per pretrained matrix, all reading the same ids.
    embeddings_prime = [
        Embedding(VOCAB_SIZE, EMBEDDING_SIZE, trainable=False, weights=[matrix])(inputs)
        for matrix in all_embeddings
    ]
    n_emb = len(embeddings_prime)

    stacked_embeddings = Lambda(lambda tensors: K.stack(tensors, axis=-1))(embeddings_prime)

    # Per-step gates, one per embedding source.
    gates = Reshape((MAX_LEN, -1))(stacked_embeddings)
    gates = Bidirectional(CuDNNLSTM(n_emb, return_sequences=True))(gates)
    gates = Dense(n_emb, activation='sigmoid')(gates)
    gates = Reshape((MAX_LEN, 1, n_emb))(gates)

    # Gate the sources and collapse the source axis.
    blended = Multiply()([stacked_embeddings, gates])
    blended = Lambda(lambda tensor: K.sum(tensor, axis=-1))(blended)
    # (disabled) x = SpatialDropout1D(0.1)(x)

    lstm_seq = Bidirectional(CuDNNLSTM(64, return_sequences=True))(blended)
    gru_seq = Bidirectional(CuDNNGRU(64, return_sequences=True))(lstm_seq)

    lstm_attention = Attention(MAX_LEN)(lstm_seq)
    avg_pool = GlobalAveragePooling1D()(gru_seq)
    max_pool = GlobalMaxPooling1D()(gru_seq)
    gru_attention = Attention(MAX_LEN)(gru_seq)

    features = concatenate([lstm_attention, gru_attention, avg_pool, max_pool])
    hidden = Dense(16, activation="relu")(features)
    # (disabled) x = Dropout(0.1)(x)
    output = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Build one independent model per cross-validation fold; the training loop
# indexes into this list by fold number.
all_models = []
for i in range(SPLITS):
    model = build_model()
    all_models.append(model)

In [None]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Without this, scale_fn/scale_mode would stay unset and the
                # first call to clr() would fail with an AttributeError.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2', "
                    "'exp_range' when scale_fn is None; got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        # Per-batch record of 'lr', 'iterations' and every key found in the batch logs.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of ``clr_iterations``."""
        # 1-based index of the current full cycle (two half-cycles of step_size).
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the cycle's peak, in units of step_size (0 at the peak, 1 at the ends).
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's lr: base_lr on a fresh cycle, otherwise the current cyclic value."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Advance the iteration counters, record history, and set the next lr.

        The first positional argument keeps its original name ``epoch``; this
        hook runs at the end of every batch.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Record the lr that was in effect for the batch that just finished.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
def thresh_search(pred_val_y, y):
    """Find the decision threshold that maximises F1 on the given predictions.

    Precision and recall are taken from ``precision_recall_curve``; F1 is
    computed for every returned point and the threshold of the best point is
    selected. The result is printed.

    Args:
        pred_val_y: array of predicted scores.
        y: array of true binary labels, aligned with ``pred_val_y``.

    Returns:
        ``(max_thresh, max_predict)`` where ``max_thresh`` is the selected
        threshold and ``max_predict`` is the 0/1 integer array obtained by
        applying it to ``pred_val_y``.
    """
    precision, recall, thresholds = precision_recall_curve(y, pred_val_y)
    # precision/recall carry one trailing point (precision=1, recall=0) that
    # has no threshold; pad with a value above any probability so the three
    # arrays can be indexed together.
    thresholds = np.append(thresholds, 1.001)

    # F1 = 2PR / (P + R), defined as 0 where P + R == 0. This avoids the
    # divide-by-zero that 1/precision + 1/recall hits on the trailing point.
    denom = precision + recall
    F = np.divide(2 * precision * recall, denom,
                  out=np.zeros_like(denom, dtype=float), where=denom > 0)

    best = np.argmax(F)
    max_score = F[best]
    max_thresh = thresholds[best]
    print("\n MAX F1 score at threshold {0} is {1}".format(max_thresh, max_score))

    # precision_recall_curve evaluates a threshold as "score >= threshold",
    # so the comparison must be inclusive for the predictions to match the
    # reported F1.
    max_predict = (pred_val_y >= max_thresh).astype(int)
    return max_thresh, max_predict


In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Cross-validated training with a cyclical learning rate.

skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=5325)

all_history = []
# Out-of-fold predictions for every training row, and the fold-averaged test predictions.
train_meta = np.zeros(train_Y.shape[0])
test_meta = np.zeros(test_X.shape[0])

for model_num, (train_index, val_index) in enumerate(skf.split(train_X, train_Y)):
    train_inp, train_out = train_X[train_index], train_Y[train_index]
    val_inp, val_out = train_X[val_index], train_Y[val_index]
    fold_model = all_models[model_num]

    # A fresh schedule per fold so every model starts its cycle from base_lr.
    clr = CyclicLR(base_lr=0.001, max_lr=0.002,
                   step_size=300., mode='exp_range',
                   gamma=0.99994)

    history = fold_model.fit(train_inp, train_out,
                             batch_size=512, epochs=4,
                             validation_data=(val_inp, val_out),
                             callbacks=[clr])

    pred_val_y = fold_model.predict([val_inp], batch_size=1024, verbose=1)
    test_preds = fold_model.predict([test_X], batch_size=1024, verbose=1)

    # Each training row is predicted exactly once, by the model that did not train on it.
    train_meta[val_index] = pred_val_y.reshape(-1)
    # Every fold contributes an equal share to the test prediction.
    test_meta += test_preds.reshape(-1) / SPLITS

    all_history.append(history)

# NOTE(review): the original author's note here says "5 epochs is optimal",
# yet epochs=4 is used above -- confirm which is intended.

In [None]:
# One figure per fold: training vs. validation loss by epoch.
for i, history in enumerate(all_history):
    history_dict = history.history
    acc = history_dict['acc']
    loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']

    # One x tick per recorded epoch, starting at 1.
    epochs = range(1, 1 + len(acc))

    plt.figure(i)
    plt.plot(epochs, loss_values, 'bo', label='Training loss')
    plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

plt.show()

In [None]:
# One figure per fold: training vs. validation accuracy by epoch.
for i, history in enumerate(all_history):
    history_dict = history.history
    acc_values = history_dict['acc']
    val_acc_values = history_dict['val_acc']

    # Derive the x axis from this cell's own data instead of a variable
    # left behind by another cell.
    epochs = range(1, len(acc_values) + 1)

    plt.figure(i)
    plt.plot(epochs, acc_values, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc_values, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

plt.show()

In [None]:
# Choose the F1-maximising cutoff on the out-of-fold training predictions,
# then apply it to the fold-averaged test predictions.
max_thresh, _ = thresh_search(train_meta, train_Y)
# Inclusive comparison: the cutoff comes from precision_recall_curve, which
# scores a threshold as "score >= threshold".
test_Y = (test_meta >= max_thresh).astype(int)

out_df = pd.DataFrame({"qid": test_df["qid"].values})
out_df['prediction'] = test_Y
out_df.to_csv("submission.csv", index=False)