In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) lists the entries of that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import json

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K

from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
# --- Text / embedding sizes ---
# Passed to Tokenizer(num_words=...), used as the row cap by the embedding
# loaders and as the input dimension of the Embedding layer.
MAX_VOCAB_SIZE = 100000
# Output dimension of the Embedding layer in build_model.
EMB_SIZE = 300
# pad_sequences(maxlen=...) for both the train and the test questions.
MAX_SEQUENCE_LENGTH = 70

# --- Layer widths ---
# Width of the convolutional branch in build_model (the Conv1D itself uses half of it).
CONV_FEATURE_DIM = 128
# Kernel size of the Conv1D layers.
CONV_WINDOW_SIZE = 5
# Units of every fully connected layer in build_model.
FC_FEATURE_DIM = 128

# --- Depth ---
# Number of residual convolution blocks stacked in build_model.
NUM_CONV_LAYERS = 2
# Number of Dense+Dropout layers in each fully connected stack.
NUM_FC_LAYERS = 3

# Predicted probabilities strictly above this value are labelled positive
# (validation F1, out-of-fold F1 and the submission all use it).
THRESHOLD = 0.32

In [None]:
DATA_DIR = '../input/'

# Raw competition frames.
dataset = pd.read_csv(DATA_DIR + 'train.csv')
test_dataset = pd.read_csv(DATA_DIR + 'test.csv')

# Question strings as plain Python lists.
question_text = dataset['question_text'].tolist()
test_question_text = test_dataset['question_text'].tolist()

# The vocabulary is fitted on the training questions only; the test questions
# are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(question_text)

sequence = tokenizer.texts_to_sequences(question_text)
test_sequence = tokenizer.texts_to_sequences(test_question_text)

# Both splits are padded identically: fixed length, padding appended at the end.
_PAD_OPTIONS = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_X = pad_sequences(sequence, **_PAD_OPTIONS)
test_X = pad_sequences(test_sequence, **_PAD_OPTIONS)

# Labels as an int64 vector aligned with the rows of train_X.
train_y = dataset['target'].values.astype(np.int64)

In [None]:
def _read_embedding_file(embedding_file, min_line_length=0, **open_kwargs):
    """Parse a space-separated text embedding file into ``{word: float32 vector}``.

    # Arguments
        embedding_file: path of the text file; each useful line is
            ``word v1 v2 ... vN`` separated by single spaces.
        min_line_length: lines whose raw length (newline included) is not
            strictly greater than this are skipped, e.g. a ``count dim`` header.
        **open_kwargs: forwarded to ``open`` (``encoding``, ``errors``, ...).

    # Returns
        dict mapping each word to a 1-D ``float32`` array. If a word occurs
        more than once, the last occurrence wins.
    """
    embeddings_index = {}
    # `with` closes the handle even if a line fails to parse.
    with open(embedding_file, **open_kwargs) as handle:
        for line in handle:
            if len(line) <= min_line_length:
                continue
            # Split on a literal space only, exactly like the vector files are laid out.
            word, *coefs = line.rstrip("\n").split(" ")
            if not coefs:
                # Blank line: there is no vector to store.
                continue
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index


def _build_embedding_matrix(word_index, embeddings_index, max_vocab_size):
    """Assemble the ``(nb_words, dim)`` matrix whose row ``i`` embeds the word with id ``i``.

    Rows of words missing from ``embeddings_index`` (and row 0) keep a random
    draw from a normal distribution with the mean / std of all parsed vectors.
    """
    if not embeddings_index:
        raise ValueError("no embedding vectors were parsed from the embedding file")

    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Word ids start at 1, so a vocabulary of N words needs N + 1 rows
    # (row 0 is never assigned a word). Without the + 1 the word with the
    # largest id overflowed the matrix whenever the vocabulary was smaller
    # than the cap.
    nb_words = min(max_vocab_size, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


def _load_embedding_matrix(word_index, embedding_file, max_vocab_size,
                           min_line_length=0, **open_kwargs):
    """Read ``embedding_file`` and turn it into a matrix aligned with ``word_index``."""
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB_SIZE
    embeddings_index = _read_embedding_file(embedding_file, min_line_length, **open_kwargs)
    return _build_embedding_matrix(word_index, embeddings_index, max_vocab_size)


def load_glove(word_index, embedding_file=None, max_vocab_size=None):
    """Build an embedding matrix for ``word_index`` from GloVe vectors.

    # Arguments
        word_index: dict mapping word -> integer id.
        embedding_file: path of the GloVe text file; defaults to the
            glove.840B.300d file under '../input/embeddings/'.
        max_vocab_size: maximum number of rows; defaults to MAX_VOCAB_SIZE.

    # Returns
        float array of shape ``(min(max_vocab_size, len(word_index) + 1), dim)``.

    # Raises
        ValueError: if no vector could be parsed from the file.
    """
    if embedding_file is None:
        embedding_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    # Explicit encoding so decoding does not depend on the machine's locale.
    return _load_embedding_matrix(word_index, embedding_file, max_vocab_size,
                                  encoding="utf8")


def load_fasttext(word_index, embedding_file=None, max_vocab_size=None):
    """Build an embedding matrix for ``word_index`` from fastText wiki-news vectors.

    Lines of 100 characters or fewer (such as a header line) are ignored.

    # Arguments
        word_index: dict mapping word -> integer id.
        embedding_file: path of the ``.vec`` file; defaults to the
            wiki-news-300d-1M file.
            NOTE(review): that default sits under
            '../input/quora-insincere-questions-classification/', unlike the
            GloVe default under '../input/embeddings/' -- confirm it matches
            the mounted dataset layout or pass ``embedding_file`` explicitly.
        max_vocab_size: maximum number of rows; defaults to MAX_VOCAB_SIZE.

    # Returns
        float array of shape ``(min(max_vocab_size, len(word_index) + 1), dim)``.

    # Raises
        ValueError: if no vector could be parsed from the file.
    """
    if embedding_file is None:
        embedding_file = '../input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    return _load_embedding_matrix(word_index, embedding_file, max_vocab_size,
                                  min_line_length=100, encoding="utf8")


def load_para(word_index, embedding_file=None, max_vocab_size=None):
    """Build an embedding matrix for ``word_index`` from Paragram vectors.

    The file is decoded as UTF-8 with undecodable bytes dropped, and lines of
    100 characters or fewer are ignored.

    # Arguments
        word_index: dict mapping word -> integer id.
        embedding_file: path of the Paragram text file; defaults to the
            paragram_300_sl999 file.
            NOTE(review): that default sits under
            '../input/quora-insincere-questions-classification/', unlike the
            GloVe default under '../input/embeddings/' -- confirm it matches
            the mounted dataset layout or pass ``embedding_file`` explicitly.
        max_vocab_size: maximum number of rows; defaults to MAX_VOCAB_SIZE.

    # Returns
        float array of shape ``(min(max_vocab_size, len(word_index) + 1), dim)``.

    # Raises
        ValueError: if no vector could be parsed from the file.
    """
    if embedding_file is None:
        embedding_file = '../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    return _load_embedding_matrix(word_index, embedding_file, max_vocab_size,
                                  min_line_length=100, encoding="utf8", errors='ignore')

In [None]:
def f1(y_true, y_pred):
    '''
    Batch-wise F1 score built from backend ops, adapted from
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Labels and predictions are clipped to [0, 1] and rounded before counting,
    and K.epsilon() is added to every denominator to avoid division by zero.
    '''
    eps = K.epsilon()

    # Counts over the current batch.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))

    # Share of predicted positives that are correct / share of positives found.
    batch_precision = hits / (flagged + eps)
    batch_recall = hits / (relevant + eps)

    return 2*((batch_precision*batch_recall)/(batch_precision+batch_recall+eps))

class CyclicLR(tf.keras.callbacks.Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.

    # Notes
        The iteration counter is not reset when training starts again, so
        reusing one instance across several `fit` calls continues the same
        schedule. Use a new instance (or `_reset()`) to start over.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction time: otherwise scale_fn / scale_mode stay
                # undefined and the first batch dies with an AttributeError.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Returns the learning rate for the current value of clr_iterations."""
        # 1-based index of the current cycle; one cycle spans 2 * step_size iterations.
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the peak of the cycle: 1 at the boundaries, 0 at the peak.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Sets the optimizer lr: base_lr on a fresh schedule, otherwise the current cyclic value."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Records lr, iteration count and batch logs in `history`, then applies the next lr."""
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [None]:
def build_model(embedding_matrix=None):
    """Build and compile the two-branch (convolutional + recurrent) classifier.

    Both branches read the same embedded, spatially dropped-out token sequence:

    * convolutional branch: a Dense projection followed by NUM_CONV_LAYERS
      residual convolution blocks, summarised by max pooling, average pooling
      and self-alignment attention;
    * recurrent branch: two stacked bidirectional CuDNNGRU layers, summarised
      by max pooling, average pooling and self-alignment attention over each
      GRU layer's output.

    Each summary goes through a Dense projection and a stack of NUM_FC_LAYERS
    Dense+Dropout layers; the two results are concatenated, pass through one
    more projection and stack, and end in a single sigmoid unit.

    # Arguments
        embedding_matrix: 2-D array of shape (vocab_size, emb_dim) used as
            frozen embedding weights. If None, a randomly initialised,
            trainable MAX_VOCAB_SIZE x EMB_SIZE embedding is used instead.

    # Returns
        A tf.keras.Model compiled with binary cross-entropy, the Adam
        optimizer and the `f1` metric.
    """

    # NOTE: every layer that owns weights is called directly on the tensors.
    # Creating Conv1D / Dense layers inside a Lambda wrapper hides their
    # variables from the model, so they would not be trained. Lambda is only
    # used for the stateless tensor ops.

    def conv_block(inputs):
        """Conv1D -> Dense projection, keeping the second half of the projection."""
        conv_layer = tf.keras.layers.Conv1D(int(CONV_FEATURE_DIM / 2),
                                            CONV_WINDOW_SIZE,
                                            padding='same')(inputs)
        glu_layer = tf.keras.layers.Dense(CONV_FEATURE_DIM * 2)(conv_layer)
        # The projection is split in two; the first half (formerly used as a
        # sigmoid gate) is discarded, so only the second half is returned.
        return tf.keras.layers.Lambda(
            lambda projected: tf.split(projected, 2, axis=-1)[1])(glu_layer)

    def self_alignment(inputs):
        """Attention pooling: softmax over the time axis of a learned per-step score."""
        activated_scores = tf.keras.layers.Dense(1, activation='tanh')(inputs)
        return tf.keras.layers.Lambda(
            lambda pair: tf.reduce_sum(tf.nn.softmax(pair[0], axis=1) * pair[1], axis=1)
        )([activated_scores, inputs])

    def fc_stack(inputs):
        """NUM_FC_LAYERS blocks of Dense(relu) followed by Dropout(0.2)."""
        outputs = inputs
        for _ in range(NUM_FC_LAYERS):
            outputs = tf.keras.layers.Dense(FC_FEATURE_DIM, activation='relu')(outputs)
            outputs = tf.keras.layers.Dropout(0.2)(outputs)
        return outputs

    # Keep the input length tied to the padding length used when encoding the text.
    features = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,))

    if embedding_matrix is None:
        embedding = tf.keras.layers.Embedding(MAX_VOCAB_SIZE, EMB_SIZE)
    else:
        embedding_matrix = np.asarray(embedding_matrix)
        # Take the dimensions from the matrix itself so the layer always
        # matches the weights it is given.
        vocab_size, emb_dim = embedding_matrix.shape
        embedding = tf.keras.layers.Embedding(vocab_size, emb_dim,
                                              weights=[embedding_matrix], trainable=False)
    embedding_layer = embedding(features)
    embedding_layer = tf.keras.layers.SpatialDropout1D(0.1)(embedding_layer)

    # ---- convolutional branch ----
    conv_output_layer = tf.keras.layers.Dense(CONV_FEATURE_DIM, activation='relu')(embedding_layer)
    for _ in range(NUM_CONV_LAYERS):
        block_output = conv_block(conv_output_layer)
        # Residual connection around the block.
        conv_output_layer = tf.keras.layers.Add()([conv_output_layer, block_output])
        conv_output_layer = tf.keras.layers.Dropout(0.2)(conv_output_layer)

    attention_layer = self_alignment(conv_output_layer)
    max_pool_layer = tf.keras.layers.GlobalMaxPooling1D()(conv_output_layer)
    avg_pool_layer = tf.keras.layers.GlobalAveragePooling1D()(conv_output_layer)

    conv_augmented_layer = tf.keras.layers.concatenate([max_pool_layer, avg_pool_layer, attention_layer], axis=-1)
    conv_augmented_layer = tf.keras.layers.Dense(FC_FEATURE_DIM)(conv_augmented_layer)
    conv_fc_output_layer = fc_stack(conv_augmented_layer)

    # ---- recurrent branch ----
    rnn_output_layer = tf.keras.layers.Bidirectional(tf.keras.layers.CuDNNGRU(64, return_sequences=True))(embedding_layer)
    rnn_attention_1_layer = self_alignment(rnn_output_layer)
    rnn_output_layer = tf.keras.layers.Bidirectional(tf.keras.layers.CuDNNGRU(64, return_sequences=True))(rnn_output_layer)

    rnn_attention_2_layer = self_alignment(rnn_output_layer)
    rnn_max_pool_layer = tf.keras.layers.GlobalMaxPooling1D()(rnn_output_layer)
    rnn_avg_pool_layer = tf.keras.layers.GlobalAveragePooling1D()(rnn_output_layer)

    rnn_augmented_layer = tf.keras.layers.concatenate([rnn_max_pool_layer, rnn_avg_pool_layer,
                                                       rnn_attention_1_layer, rnn_attention_2_layer], axis=-1)
    rnn_augmented_layer = tf.keras.layers.Dense(FC_FEATURE_DIM)(rnn_augmented_layer)
    rnn_fc_output_layer = fc_stack(rnn_augmented_layer)

    # ---- merge both branches ----
    augmented_layer = tf.keras.layers.concatenate([rnn_fc_output_layer, conv_fc_output_layer])
    augmented_layer = tf.keras.layers.Dense(FC_FEATURE_DIM)(augmented_layer)
    fc_output_layer = fc_stack(augmented_layer)

    # The layer is named 'logits' but, having a sigmoid activation, it emits probabilities.
    logits = tf.keras.layers.Dense(1, activation='sigmoid', name='logits')(fc_output_layer)
    model = tf.keras.Model(inputs=features, outputs=logits)

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam',
                  metrics=[f1])
    
    return model

In [None]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
def train_pred(model, train_X, train_y, val_X, val_y, epochs=2, callback=None):
    """Train one epoch at a time and keep the predictions of the best epoch.

    After every epoch the validation F1 is computed at THRESHOLD. Whenever it
    improves, the validation predictions are stored and the module-level
    `test_X` is scored with the model as it stands at that epoch.

    # Arguments
        model: compiled Keras model exposing `fit` and `predict`.
        train_X, train_y: training inputs and labels.
        val_X, val_y: validation inputs and labels.
        epochs: number of single-epoch `fit` calls.
        callback: list of Keras callbacks forwarded to every `fit` call
            (the same objects each time), or None.

    # Returns
        (best_val_y, best_test_y, best_score): raw predicted probabilities on
        the validation and test sets from the best epoch and the matching F1.
        The first two are None only when `epochs` is 0.
    """
    best_val_y, best_test_y, best_score = None, None, 0.
    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y), callbacks = callback, verbose=0)
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
        score = metrics.f1_score(val_y, (pred_val_y > THRESHOLD).astype(int))

        # `best_val_y is None` makes sure the first epoch is always recorded, so
        # a run whose F1 never rises above 0.0 still returns arrays, not None.
        if best_val_y is None or score > best_score:
            best_score = score
            best_val_y = pred_val_y
            # Test inference only for epochs that are actually kept.
            best_test_y = model.predict([test_X], batch_size=1024, verbose=0)
        
        print("Epoch: ", e, "-    Val F1 Score: {:.4f}".format(score))

    print('=' * 60)
    return best_val_y, best_test_y, best_score


In [None]:
# Word -> integer id mapping learned by the tokenizer fitted on the training questions.
word_index = tokenizer.word_index
# Only the GloVe matrix is built in this run.
embedding_matrix_1 = load_glove(word_index)
# Disabled experiment: average the GloVe matrix with the Paragram one.
# embedding_matrix_3 = load_para(word_index)
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_3], axis = 0)
# The matrix handed to build_model is therefore the GloVe one, unchanged.
embedding_matrix = embedding_matrix_1

In [None]:
DATA_SPLIT_SEED = 2019

# train_meta collects out-of-fold probabilities for every training row;
# test_meta accumulates the average of the per-fold test probabilities.
train_meta = np.zeros(train_y.shape)
test_meta = np.zeros(test_X.shape[0])
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_y))
for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train = train_X[train_idx]
    y_train = train_y[train_idx]
    X_val = train_X[valid_idx]
    y_val = train_y[valid_idx]

    # One schedule per fold. CyclicLR never resets its iteration counter on
    # its own, and in 'exp_range' mode the amplitude is gamma**iterations, so
    # a shared instance would hand every later fold an already-decayed cycle.
    clr = CyclicLR(base_lr=0.001, max_lr=0.002,
                   step_size=300., mode='exp_range',
                   gamma=0.99994)

    model = build_model(embedding_matrix)
    pred_val_y, pred_test_y, best_score = train_pred(model, X_train, y_train, X_val, y_val, epochs = 5, callback = [clr,])
    train_meta[valid_idx] = pred_val_y.reshape(-1)
    test_meta += pred_test_y.reshape(-1) / len(splits)

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
# Cast the boolean mask to 0/1 so the CSV holds integer labels rather than
# the strings "True"/"False"; item assignment sets the column explicitly.
sub['prediction'] = (test_meta > THRESHOLD).astype(int)
sub.to_csv("submission.csv", index=False)

In [None]:
# F1 of the out-of-fold predictions, thresholded the same way as the submission.
f1_score(train_y, train_meta > THRESHOLD)