In [0]:
import os
import re
import gc
import string
import unicodedata
import operator
import numpy as np
import pandas as pd

from sklearn import utils
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_curve, cohen_kappa_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers import Activation, Wrapper
from keras.engine.topology import Layer
from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional, LSTM,
                          CuDNNLSTM, Flatten, Concatenate, Dense, Conv1D, MaxPooling1D)
from keras.initializers import glorot_normal, orthogonal
from keras.models import Model
from keras.callbacks import (EarlyStopping, ModelCheckpoint,
                             ReduceLROnPlateau)

from keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNLSTM, GRU, Dense, Bidirectional, SpatialDropout1D, Conv1D
from tqdm import tqdm
tqdm.pandas()

import spacy
from spacy import displacy

from google.colab import drive
# Mount Google Drive once. `force_remount=True` already covers the case where
# the drive is mounted from an earlier session, so a second call is redundant.
drive.mount("/content/drive", force_remount=True)

# --- Text vectorisation ---
MAX_FEATURES = 250_000          # vocabulary cap passed to the tokenizer / embedding matrix
MAX_LEN = 75                    # length every sequence is padded or truncated to

# --- Cross-validation ---
NFOLDS = 5                      # number of stratified folds
SEED = 1                        # random_state of the fold splitter

# --- Network hyper-parameters ---
SPATIAL_DROPOUT = 0.24          # rate of the SpatialDropout1D layer
RNN_UNITS = 80                  # units per direction in the recurrent layer

# --- Checkpointing ---
MODEL_PATH = "weights_best.hdf5"

# --- Model selectors understood by get_model ---
CONST_LSTM = "lstm"
CONST_GRU = "gru"
CONST_CNN = "cnn"

## Load Data

In [0]:
def load_data(data_dir="./data"):
    """Read the training and test CSV files.

    Args:
        data_dir: directory containing ``train.csv`` and ``test.csv``.
            Defaults to ``./data``, the location used originally.

    Returns:
        ``(df_train, df_test)`` as two pandas DataFrames.
    """
    df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

    return df_train, df_test
# Show the working directory that the relative data paths resolve against.
# (`os` is already imported in the first cell, so it is not re-imported here.)
print(os.getcwd())

In [0]:
# Read both CSVs once; df_train / df_test are used by the cells below.
df_train, df_test = load_data()

## EDA

In [0]:
# Preview of the training frame.
print(df_train.head())

# Number of rows per target value, positive class (1) first.
for label in (1, 0):
    print(len(df_train[df_train['target'] == label]))

In [0]:
# Global word -> occurrence-count table filled in by to_dict.
mapping = dict()
def to_dict(x):
    """Count the lower-cased tokens of ``x`` into the global ``mapping``.

    ``x`` is split on single spaces, so consecutive spaces produce
    empty-string tokens that are counted as well. Returns ``None``.
    """
    for token in x.split(" "):
        key = token.lower()
        mapping[key] = mapping.get(key, 0) + 1

# Fill `mapping` from every training question; apply() is used only for
# to_dict's side effect (to_dict itself returns None for each row).
df_train['question_text'].apply(to_dict)


In [0]:
# Vocabulary size. The former bare `mapping` and `type(mapping.values())`
# lines were not the last expression of the cell, so they displayed nothing.
len(mapping)

In [0]:
# Two-column table (word, count) built from the counting dict.
freq_map = pd.DataFrame({
    'word': list(mapping.keys()),
    'count': list(mapping.values()),
})

In [0]:
# Show the first rows of the word/count table.
freq_map.head()

In [0]:
# Share of all token occurrences covered by the 250k most frequent words.
freq_map = freq_map.sort_values(by=["count"], ascending=False).reset_index(drop=True)
top_250k = freq_map[:250000]["count"].values.sum()
# Use the real total rather than a vocabulary size hard-coded from an earlier
# run (450690), which goes stale as soon as the data or the tokenisation changes.
total_count = freq_map["count"].values.sum()
float(top_250k) / total_count

In [0]:
# Index the table by word for direct `.loc[word, "count"]` lookups.
# set_index() consumes the 'word' column, so guard the call: without the
# check, running this cell a second time raises KeyError.
if 'word' in freq_map.columns:
    freq_map = freq_map.set_index('word')

In [0]:
# Spot-check: count stored for the word "the".
freq_map.loc["the", "count"]

In [0]:
# Number of missing question texts in the test frame.
df_test['question_text'].isna().sum()

In [0]:
def load_word_embedding(filepath, embed_dim=300):
    """Load a text embedding file into a ``{word: vector}`` dictionary.

    Each line is expected to look like ``word v1 v2 ... vN`` with single
    spaces as separators, e.g. ``{'word': array([0.1, 0.2, ...])}``.

    Args:
        filepath: path to the embeddings text file.
        embed_dim: expected vector length; lines with any other number of
            fields are skipped. Defaults to 300, the value previously
            hard-coded.

    Returns:
        dict mapping each word to a ``float32`` numpy vector of length
        ``embed_dim``. If a word occurs more than once the last line wins.

    Raises:
        ValueError: if a line has the right number of fields but one of the
            vector components is not a number.
    """
    print('load word embedding ......')
    word_embedding = {}
    # `with` closes the handle (the original left it open); undecodable bytes
    # are dropped, which matches the original fallback branch.
    with open(filepath, encoding="utf8", errors='ignore') as handle:
        for line in handle:
            parts = line.rstrip().split(' ')
            # Length check *before* conversion: header lines and tokens that
            # themselves contain spaces are skipped instead of crashing asarray.
            if len(parts) != embed_dim + 1:
                continue
            word_embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return word_embedding

In [0]:
#download link : http://nlp.stanford.edu/data/glove.42B.300d.zip
# The unzipped text file is expected next to the notebook (path is relative).
word_embedding = load_word_embedding("./glove.42B.300d.txt")

## Data Preprocessing

In [0]:

# Substitution table for words that GloVe does not know (quora vs. glove).
# Insertion order matters: it is the alternation order of the regex below,
# e.g. 'padmaavati' must be tried before its prefix 'padmaavat'.
_MISSPELL_TO_SUB = {
    'fortnite': 'video game ',
    'Swachh': 'swachh bharat mission campaign ',
    'Quorans': 'quoran',
    'Qoura ': 'quora ',
    'quoras': 'quora',
    'Quroa': 'quora',
    'QUORA': 'quora',
    'qoura': 'quora',
    'Fortnite': 'video game',
    'redmi': 'mobile phone',
    'oneplus': 'mobile phone',
    '₹': 'rupee',
    'upwork': 'job website',
    'unacademy': 'education website',
    'byju': 'education website',
    'padmaavati': 'bollywood movie',
    'padmaavat': 'bollywood movie',
    'bahubali': 'bollywood movie',
    'quorans': 'quoran'
}

# Compiled once instead of on every call (the function runs once per row).
# Keys are escaped so that a future key containing a regex metacharacter is
# still matched literally.
_MISSPELL_RE = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in _MISSPELL_TO_SUB))


def clean_misspell(text):
    """Replace known misspellings / unknown brand names in ``text``.

    reference: https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694 # noqa

    Args:
        text: input string.

    Returns:
        ``text`` with every occurrence of a key of the substitution table
        replaced by its value. Matching is case-sensitive and not limited
        to whole words.
    """
    # Every match is a table key by construction, so a plain lookup is safe.
    # (The previous `.get` + `except KeyError` could never hit the except.)
    return _MISSPELL_RE.sub(lambda match: _MISSPELL_TO_SUB[match.group(0)], text)


def spacing_misspell(text):
    """Surround selected sub-words with spaces so glued words get split.

    'deadbody' -> 'dead body' style fix: every match of one of the patterns
    below is replaced by the match wrapped in single spaces.

    Args:
        text: input string.

    Returns:
        The string with spaces inserted around each match.
    """
    # Raw strings: '\W' in a normal string literal is an invalid escape
    # sequence (warning today, error in a future Python release).
    misspell_list = [
        r'(F|f)uck',
        r'Trump',
        r'\W(A|a)nti',
        r'(W|w)hy',
        r'(W|w)hat',
        r'How',
        r'care\W',
        r'\Wover',
        r'gender',
        r'people',
    ]
    misspell_re = re.compile('(%s)' % '|'.join(misspell_list))
    # \1 is the outer group, i.e. the whole match.
    return misspell_re.sub(r" \1 ", text)


# LaTeX command name (without the leading backslash) -> English replacement.
# NOTE(review): '\int' is the integral sign, so ' integer ' looks like a slip
# for ' integral '; left unchanged here — confirm before editing.
_LATEX_COMMANDS = {
    'mathrm': ' LaTex math mode ',
    'mathbb': ' LaTex math mode ',
    'boxed': ' LaTex equation ',
    'begin': ' LaTex equation ',
    'end': ' LaTex equation ',
    'left': ' LaTex equation ',
    'right': ' LaTex equation ',
    'overbrace': ' LaTex equation ',
    'underbrace': ' LaTex equation ',
    'text': ' LaTex equation ',
    'vec': ' vector ',
    'var': ' variable ',
    'theta': ' theta ',
    'mu': ' average ',
    'min': ' minimum ',
    'max': ' maximum ',
    'sum': ' + ',
    'times': ' * ',
    'cdot': ' * ',
    'hat': ' ^ ',
    'frac': ' / ',
    'div': ' / ',
    'sin': ' Sine ',
    'cos': ' Cosine ',
    'tan': ' Tangent ',
    'infty': ' infinity ',
    'int': ' integer ',
    'in': ' in ',
}

# Longest names first so that e.g. '\infty' is not consumed as '\in' + 'fty'.
_LATEX_COMMAND_RE = re.compile(
    r'\\(%s)' % '|'.join(sorted(_LATEX_COMMANDS, key=len, reverse=True)))


def clean_latex(text):
    r"""Convert LaTeX markup such as "[math]\vec{x} + \vec{y}" to English.

    reference: https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694 # noqa

    Steps:
      1. ``[math]`` / ``[/math]`` tags become ' LaTex math '.
      2. Known commands (``\vec``, ``\frac``, ...) become their English
         replacement from the command table.
      3. Any backslash still left becomes ' LaTex '.

    Args:
        text: input string.

    Returns:
        The string with the substitutions above applied.
    """
    text = re.sub(r'\[math\]', ' LaTex math ', text)
    text = re.sub(r'\[\/math\]', ' LaTex math ', text)
    # Known commands must be handled BEFORE the generic backslash fallback.
    # The original replaced every backslash first, so none of the command
    # patterns could ever match and the whole table was dead code.
    text = _LATEX_COMMAND_RE.sub(
        lambda match: _LATEX_COMMANDS[match.group(1)], text)
    # Fallback for unknown commands / stray backslashes.
    text = re.sub(r'\\', ' LaTex ', text)
    return text

def decontracted(text):
    """Expand English contractions ("can't" -> "can not", "it's" -> "it is").

    Both the ASCII apostrophe and the typographic one (’) are recognised.
    The rules run in the listed order: the specific words first, then the
    generic suffixes.
    """
    rules = (
        # specific
        (r"[Ww]on['’]t", "will not"),
        (r"[Cc]an['’]t", "can not"),
        (r"[Yy]['’]all", "you all"),
        (r"[Yy]a['’]ll", "you all"),
        # general
        (r"[Ii]['’]m", "i am"),
        (r"[Aa]in['’]t", "is not"),
        (r"n['’]t", " not"),
        (r"['’]re", " are"),
        (r"['’]s", " is"),
        (r"['’]d", " would"),
        (r"['’]ll", " will"),
        (r"['’]t", " not"),
        (r"['’]ve", " have"),
    )
    for pattern, expansion in rules:
        text = re.sub(pattern, expansion, text)
    return text


# Symbols to isolate, in addition to string.punctuation.
_EXTRA_PUNCT = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '₹', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

_ALL_PUNCT = ''.join(sorted(set(string.punctuation) | set(_EXTRA_PUNCT)))

# The characters are escaped before going into the class. Unescaped, the
# sorted run `[\]^` made the regex read `\]` as an escaped bracket, so the
# backslash itself was silently left out of the class. Compiled once, not on
# every call.
_PUNCT_RE = re.compile('([%s])' % re.escape(_ALL_PUNCT))


def spacing_punctuation(text):
    """Add a space before and after every punctuation mark / symbol.

    Args:
        text: input string.

    Returns:
        The string with each listed character wrapped in single spaces
        (no whitespace collapsing is done here).
    """
    return _PUNCT_RE.sub(r' \1 ', text)


def spacing_digit(text):
    """Wrap every single digit of ``text`` in spaces ("12" -> " 1  2 ")."""
    return re.sub('([0-9])', r' \1 ', text)


def spacing_number(text):
    """Wrap every run of digits in ``text`` in spaces ("a12b" -> "a 12 b")."""
    return re.sub('([0-9]{1,})', r' \1 ', text)


def remove_number(text):
    """Mask every digit with '#' (numbers are not toxic).

    Args:
        text: input string.

    Returns:
        The string with each digit character replaced by '#'.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.sub(r'\d', '#', text)


def remove_space(text):
    """Collapse whitespace runs to one space and strip trailing whitespace.

    A single leading space, if present, is kept.

    Args:
        text: input string.

    Returns:
        The cleaned string.
    """
    # Raw strings: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+$', '', text)
    return text

def lower_casing(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def tokenizeSentences(df, spacy_nlp=None):
    """Build a table of the distinct tokens found in ``df["question_text"]``.

    Every sentence is lower-cased and run through a spaCy-style pipeline;
    one row per token is collected and duplicate rows are dropped.

    Args:
        df: DataFrame with a ``question_text`` column of strings.
        spacy_nlp: callable turning a string into an iterable of tokens that
            expose ``text``, ``lemma_``, ``pos_``, ``tag_``, ``dep_``,
            ``shape_``, ``is_alpha`` and ``is_stop``. When omitted, the
            module-level ``nlp`` object is used, as before.
            NOTE(review): no cell in this notebook section defines ``nlp``;
            it has to be created (e.g. with ``spacy.load``) before calling
            this function without ``spacy_nlp``.

    Returns:
        DataFrame indexed by token text with the columns text, lemma, pos,
        tag, dep, shape, is_alpha, is_stop_word.
    """
    COLUMNS = ["text","lemma","pos","tag","dep","shape","is_alpha","is_stop_word"]
    pipeline = nlp if spacy_nlp is None else spacy_nlp
    # Collect plain rows and build the frame once. Concatenating a one-row
    # frame per token copies the whole table each time (quadratic).
    rows = []
    for sent in df["question_text"].values.tolist():
        doc = pipeline(sent.lower()) #Tokenize using spacy
        for token in doc:
            rows.append([
                token.text,
                token.lemma_,
                token.pos_,
                token.tag_,
                token.dep_,
                token.shape_,
                token.is_alpha,
                token.is_stop,
            ])
    tokenizedWords = pd.DataFrame(rows, columns=COLUMNS,
                                  index=[row[0] for row in rows])
    #only unique words will exist
    tokenizedWords = tokenizedWords.drop_duplicates()
    return tokenizedWords

def preprocess(text, remove_num=True):
    """Run the full text-cleaning pipeline on one question.

    Args:
        text: raw question string.
        remove_num: when True (default) digits are masked with '#' after
            being spaced out; when False the digits are kept. The flag was
            previously accepted but ignored.

    Returns:
        The cleaned string.
    """
    # 1. de-contract
    text = decontracted(text)
    # 2. clean misspell
    text = clean_misspell(text)
    # 3. space misspell
    text = spacing_misspell(text)
    # 4. clean_latex
    text = clean_latex(text)
    # 5. space
    text = spacing_punctuation(text)
    # 6. handle number
    text = spacing_digit(text)
    if remove_num:
        text = remove_number(text)
    # 7. remove space
    text = remove_space(text)
    return text

In [0]:
def tokenize(df_text, max_features):
    """Clean the texts and turn them into integer sequences.

    Args:
        df_text: pandas Series of raw strings.
        max_features: passed to the Keras ``Tokenizer`` as ``num_words``.

    Returns:
        ``(sequences, tokenizer)``: the list of index sequences and the
        fitted tokenizer.
    """
    cleaned = df_text.progress_apply(preprocess)
    # No character filtering: preprocess() already isolated punctuation.
    tokenizer = Tokenizer(num_words=max_features, filters='', lower=True, split=' ')
    tokenizer.fit_on_texts(list(cleaned))
    return tokenizer.texts_to_sequences(cleaned), tokenizer

In [0]:
# Labels of the training rows.
y_train = df_train['target']
# Row position at which the test questions start in the combined series.
train_test_cut = len(df_train)
# Train and test questions stacked so that one tokenizer sees all the text.
df_text = pd.concat(
    [df_train['question_text'], df_test['question_text']],
    axis=0, ignore_index=True)
sequences, tokenizer = tokenize(df_text, max_features=MAX_FEATURES)

In [0]:
# Left-pad short sequences, cut long ones at the end, then undo the stacking.
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='pre', truncating='post')  # noqa
X_train, X_test = X[:train_test_cut], X[train_test_cut:]

In [0]:
# Look at one padded sequence (row 5 of the training matrix).
X_train[5]

## Create Embeddings

In [0]:
def create_embedding_weights(word_index, word_embedding,
                             max_features, paragram=False, word_counts=None):
    """Create the weight matrix for the embedding layer.

    Rows are word indices and columns are the embedding dimensions. Rows
    without a pre-trained vector (including row 0, which ``word_index``
    never uses) keep a random draw from a normal distribution with the
    mean / std of the pre-trained vectors.

    Args:
        word_index: dict word -> integer index (1-based, as produced by the
            Keras tokenizer).
        word_embedding: dict word -> 1-D vector; all vectors must share one
            length.
        max_features: upper bound on the number of rows.
        paragram: when True the lower-cased word is used for the lookup.
        word_counts: DataFrame indexed by word with a ``count`` column, used
            only for the "not found" report. Defaults to the module-level
            ``freq_map``, which was the previous behaviour.

    Returns:
        numpy array of shape ``(min(max_features, len(word_index) + 1), dim)``.
    """
    print('create word embedding weights ......')
    if word_counts is None:
        word_counts = freq_map
    # get entire embedding matrix
    # (np.stack needs a real sequence; a dict view is rejected by newer numpy)
    mat_embedding = np.stack(list(word_embedding.values()))
    # get shape
    # +1 because indices start at 1: with a vocabulary smaller than
    # max_features the highest index would otherwise fall outside the matrix.
    a, b = min(max_features, len(word_index) + 1), mat_embedding.shape[1]
    print('embedding weights matrix with shape: ({}, {})'.format(a, b))
    # init embedding weight matrix
    embedding_mean, embedding_std = mat_embedding.mean(), mat_embedding.std()
    embedding_weights = np.random.normal(embedding_mean, embedding_std, (a, b))
    # mapping
    not_in_embedding = 0
    not_in_embedding_map = []
    for word, idx in word_index.items():
        if idx >= a:
            continue
        if paragram:
            word_vec = word_embedding.get(word.lower(), None)
        else:
            word_vec = word_embedding.get(word, None)
        if word_vec is not None:
            embedding_weights[idx] = word_vec
        else:
            not_in_embedding += 1
            # corpus frequency of the missing word, 0 when unknown
            count = 0
            if word in word_counts.index:
                count = word_counts.loc[word, "count"]
            not_in_embedding_map.append((word, count))

    # report the most frequent words that have no pre-trained vector
    print("total not found in embeddings")
    print(not_in_embedding)
    sorted_by_second = sorted(not_in_embedding_map, key=lambda tup: tup[1], reverse=True)
    print(sorted_by_second[:100])

    return embedding_weights


In [0]:
# Embedding matrix for the tokenizer vocabulary; paragram=False means words
# are looked up exactly as they appear in tokenizer.word_index.
glove_weights = create_embedding_weights(tokenizer.word_index, word_embedding, MAX_FEATURES, False)

In [0]:
# Spot-check: vector stored for the token '93' (KeyError if it is absent).
word_embedding['93']

## Model Layers

In [0]:
def get_callbacks():
    """Return the training callbacks, all of them watching ``val_loss``.

    The list holds, in order: early stopping, a best-only checkpoint written
    to ``MODEL_PATH``, and a learning-rate reduction on plateau.
    """
    watched = 'val_loss'
    return [
        EarlyStopping(monitor=watched, min_delta=0.0001, patience=2,
                      verbose=2, mode='auto'),
        ModelCheckpoint(filepath=MODEL_PATH, monitor=watched,
                        save_best_only=True, mode='min', verbose=2),
        ReduceLROnPlateau(monitor=watched, min_lr=0.0001, factor=0.6,
                          patience=1, verbose=2),
    ]


class Attention(Layer):
    """Attention pooling over axis 1 of a rank-3 input.

    A score per step is computed as ``tanh(x . W + b)``, exponentiated,
    optionally masked and normalised over the steps; the output is the
    score-weighted sum of the input along axis 1.

    Args:
        step_dim: size of axis 1 of the input; used to reshape the scores.
        W_regularizer, b_regularizer: optional regularizers for W and b.
        W_constraint, b_constraint: optional constraints for W and b.
        bias: whether to add the per-step bias ``b``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # set in build() from the last input dimension
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (one weight per feature) and, if enabled, b (one per step)."""
        assert len(input_shape) == 3

        # name / shape are passed by keyword: passing the shape as the first
        # positional argument only works through a legacy compatibility shim.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """The step axis is summed away, so no mask is passed on."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # one raw score per (sample, step): flatten, project on W, fold back
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # zero the weight of masked steps before normalising
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (first input dimension, features_dim)."""
        return input_shape[0],  self.features_dim


def squash(x, axis=-1):
    """Divide ``x`` by its L2 norm along ``axis`` (epsilon added under the root)."""
    norm = K.sqrt(K.sum(K.square(x), axis, keepdims=True) + K.epsilon())
    return x / norm


class Capsule(Layer):
    """Capsule layer with iterative routing.

    Projects the input vectors with a learned kernel into
    ``num_capsule * dim_capsule`` features, then runs ``routings`` rounds of
    routing to produce ``num_capsule`` output vectors of size ``dim_capsule``.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3,
                 kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        """Store the configuration.

        ``activation='default'`` selects the module-level ``squash``; any
        other value is wrapped in a Keras ``Activation`` layer.
        ``kernel_size`` is stored but not used by build() or call().
        """
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel ``W``.

        With shared weights the kernel has a leading dimension of 1;
        otherwise there is one kernel slice per input position
        (``input_shape[-2]``).
        """
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),   # noqa
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),   # noqa
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs and run the routing iterations."""
        # projection: shared kernel -> conv1d, per-position kernel -> local_conv1d
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))    # noqa
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]  # noqa

        # routing logits start at zero
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]  # noqa
        # NOTE(review): `outputs` is only assigned inside the loop, so
        # routings=0 would fail at the return statement.
        for i in range(self.routings):
            # softmax acts on the last axis, so num_capsule is moved there
            # for the softmax and moved back afterwards
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]    # noqa
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))    # noqa
            # the logits are refreshed on every round except the last one
            if i < self.routings - 1:
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])
        return outputs

    def compute_output_shape(self, input_shape):
        """Output shape is (None, num_capsule, dim_capsule)."""
        return (None, self.num_capsule, self.dim_capsule)


class DropConnect(Wrapper):
    """Wrapper that applies dropout to the wrapped layer's kernel and bias.

    ``prob`` is the dropout rate; it only has an effect when strictly
    between 0 and 1.
    """

    def __init__(self, layer, prob, **kwargs):
        self.prob = prob
        self.layer = layer
        super(DropConnect, self).__init__(layer, **kwargs)
        # flag the layer as behaving differently in training and inference
        if 0. < self.prob < 1.:
            self.uses_learning_phase = True

    def build(self, input_shape):
        """Build the wrapped layer if it has not been built yet."""
        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        # note: the parent build() is called without input_shape
        super(DropConnect, self).build()

    def compute_output_shape(self, input_shape):
        """Delegate to the wrapped layer."""
        return self.layer.compute_output_shape(input_shape)

    def call(self, x):
        """Swap in dropped-out kernel/bias for the training phase, then run the layer."""
        if 0. < self.prob < 1.:
            # NOTE(review): this rebinds the wrapped layer's `kernel` and
            # `bias` attributes to derived tensors on every call — confirm
            # this is intended for the Keras version in use.
            self.layer.kernel = K.in_train_phase(
                K.dropout(self.layer.kernel, self.prob),
                self.layer.kernel)
            self.layer.bias = K.in_train_phase(
                K.dropout(self.layer.bias, self.prob),
                self.layer.bias)
        return self.layer.call(x)    
    
def get_embedding_layer(embed_weights=None):
    """Build a frozen Keras ``Embedding`` layer from a weight matrix.

    Args:
        embed_weights: 2-D array; the first dimension is the vocabulary
            size and the second the embedding dimension.

    Returns:
        An ``Embedding`` layer named 'embedding', initialised with
        ``embed_weights`` and with ``trainable=False``.

    Raises:
        ValueError: if ``embed_weights`` is None (the default exists only
            for signature compatibility).
    """
    if embed_weights is None:
        # fail with a clear message instead of "'NoneType' has no attribute 'shape'"
        raise ValueError("embed_weights is required to build the embedding layer")
    # The former `del` of the locals plus gc.collect() released nothing: the
    # caller and the layer both still reference the matrix.
    return Embedding(
        input_dim=embed_weights.shape[0],
        output_dim=embed_weights.shape[1],
        weights=[embed_weights],
        trainable=False,
        name='embedding'
    )

## **CNN Model Implementation**

In [0]:
def get_conv_pool(x_input, sufix, n_grams=(3, 4, 5), feature_maps=100):
    """Build one Conv1D -> MaxPooling1D -> Flatten branch per n-gram size.

    Args:
        x_input: tensor the branches are applied to.
        sufix: string inserted in the layer names to keep them unique.
        n_grams: kernel sizes, one branch each. A tuple is used as default
            instead of a list so the default cannot be mutated between calls.
        feature_maps: number of filters of every convolution.

    Returns:
        List of flattened branch outputs, in the order of ``n_grams``.
    """
    branches = []
    for n in n_grams:
        branch = Conv1D(filters=feature_maps, kernel_size=n, activation='relu', name='Conv_'+sufix+'_'+str(n))(x_input)
        branch = MaxPooling1D(pool_size=2, strides=None, padding='valid', name='MaxPooling_'+sufix+'_'+str(n))(branch)
        branch = Flatten(name='Flatten_'+sufix+'_'+str(n))(branch)
        branches.append(branch)
    return branches


def get_cnn_model(embed_weights):
    """Build and compile the multi-branch CNN classifier.

    Frozen embedding -> parallel convolution/pooling branches -> concatenation
    -> single sigmoid unit. Compiled with binary cross-entropy and Adam.
    """
    tokens_in = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')
    embedded = get_embedding_layer(embed_weights)(tokens_in)

    # one branch per kernel size, merged into a single feature vector
    merged = concatenate(get_conv_pool(embedded, 'static'), axis=-1)

    # prediction layer
    prediction = Dense(1, activation='sigmoid', name='output')(merged)

    cnn = Model(inputs=tokens_in, outputs=prediction)
    cnn.compile(loss={'output': 'binary_crossentropy'}, optimizer='adam')
    return cnn

## LSTM Model Implementation

In [0]:
def get_lstm_model(embed_weights):
    """Build and compile the bidirectional-LSTM classifier.

    Frozen embedding -> spatial dropout -> bidirectional LSTM, followed by
    two heads on the LSTM sequence (capsule and attention) that are
    concatenated and fed to a single sigmoid unit. Compiled with binary
    cross-entropy and Adam.
    """
    tokens_in = Input(shape=(MAX_LEN, ), name='input')

    # embedding + dropout
    embedded = get_embedding_layer(embed_weights)(tokens_in)
    dropped = SpatialDropout1D(rate=SPATIAL_DROPOUT)(embedded)

    # recurrent encoder
    recurrent_cell = LSTM(RNN_UNITS, return_sequences=True,
                          kernel_initializer=glorot_normal(seed=1029),
                          recurrent_initializer=orthogonal(gain=1.0, seed=1029))
    encoded = Bidirectional(layer=recurrent_cell, name='bidirectional_lstm')(dropped)

    # capsule head
    capsule_head = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(encoded) # noqa
    capsule_head = Flatten()(capsule_head)
    capsule_head = DropConnect(Dense(32, activation="relu"), prob=0.01)(capsule_head)

    # attention head
    attention_head = Attention(step_dim=MAX_LEN, name='attention')(encoded)
    attention_head = DropConnect(Dense(16, activation="relu"), prob=0.05)(attention_head)

    # merge + sigmoid output
    merged = Concatenate(axis=-1)([capsule_head, attention_head])
    prediction = Dense(units=1, activation='sigmoid', name='output')(merged)

    lstm_model = Model(inputs=tokens_in, outputs=prediction)
    lstm_model.compile(loss='binary_crossentropy', optimizer='adam')
    return lstm_model

## GRU Model Implementation

In [0]:

def get_gru_model(embed_weights):
    """Build and compile the bidirectional-GRU classifier.

    Same architecture as the LSTM model, with a GRU as the recurrent cell:
    frozen embedding -> spatial dropout -> bidirectional GRU -> capsule and
    attention heads -> concatenation -> single sigmoid unit.

    Args:
        embed_weights: embedding matrix passed to ``get_embedding_layer``.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy, Adam).
    """
    input_layer = Input(shape=(MAX_LEN, ), name='input')
    # 1. embedding layer
    embedding_layer = get_embedding_layer(embed_weights)
    x = embedding_layer(input_layer)
    # 2. dropout
    x = SpatialDropout1D(rate=SPATIAL_DROPOUT)(x)
    # 3. bidirectional gru
    # (the layer was named 'bidirectional_lstm', a copy-paste leftover that
    # mislabels it in model.summary())
    x = Bidirectional(
        layer=GRU(RNN_UNITS, return_sequences=True,
                        kernel_initializer=glorot_normal(seed=1029),
                        recurrent_initializer=orthogonal(gain=1.0, seed=1029)),
        name='bidirectional_gru')(x)
    # 4. capsule layer
    capsul = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x) # noqa
    capsul = Flatten()(capsul)
    capsul = DropConnect(Dense(32, activation="relu"), prob=0.01)(capsul)

    # 5. attention layer
    atten = Attention(step_dim=MAX_LEN, name='attention')(x)
    atten = DropConnect(Dense(16, activation="relu"), prob=0.05)(atten)
    x = Concatenate(axis=-1)([capsul, atten])

    # 6. output (sigmoid)
    output_layer = Dense(units=1, activation='sigmoid', name='output')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [0]:
def get_model(embed_weights=None, model_name=CONST_LSTM):
    """Build the model selected by ``model_name``.

    ``CONST_CNN`` and ``CONST_GRU`` select their models; every other value
    falls back to the LSTM model.
    """
    if model_name == CONST_CNN:
        return get_cnn_model(embed_weights)
    if model_name == CONST_GRU:
        return get_gru_model(embed_weights)
    return get_lstm_model(embed_weights)

In [0]:
# Only one embedding source is in use, so take the GloVe matrix as is.
# Averaging it with itself returned the same values while allocating a
# stacked copy of the matrix.
embed_weights = glove_weights
# train models
kfold = StratifiedKFold(n_splits=NFOLDS, random_state=SEED, shuffle=True)
best_thres = []
y_submit = np.zeros((X_test.shape[0], ))

In [0]:
# Index labels of the first 100 positive and the first 100 negative rows.
y_train_1_idx = y_train[y_train == 1].head(100).index
y_train_0_idx = y_train[y_train == 0].head(100).index

# Matching feature rows, positives first.
X_train_1 = X_train[y_train_1_idx]
X_train_0 = X_train[y_train_0_idx]
X_train_sample = np.concatenate((X_train_1, X_train_0), axis=0)
y_train_sample = [1] * 100 + [0] * 100
X_train_sample.shape

## Calculating Performance Metrics

In [0]:
def calculate_metrics(y_true, y_pred):
    """Print and return classification metrics for binary predictions.

    Args:
        y_true: true labels (0/1).
        y_pred: predicted labels (0/1), already thresholded.

    Returns:
        dict with the keys ``f1``, ``confusion_matrix``, ``precision``,
        ``recall``, ``roc_auc`` and ``kappa``. (The function used to return
        None, which callers then printed.)
    """
    # local import: roc_auc_score is not part of the notebook's import cell
    from sklearn.metrics import roc_auc_score

    metrics = {
        "f1": f1_score(y_true, y_pred),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        # roc_curve returns (fpr, tpr, thresholds) arrays, not a score; the
        # scalar that belongs under this label is the area under the curve
        "roc_auc": roc_auc_score(y_true, y_pred),
        "kappa": cohen_kappa_score(y_true, y_pred),
    }

    print("F1 score " + str(metrics["f1"]))
    print("Confusion metrics " + str(metrics["confusion_matrix"]))
    print("Precision " + str(metrics["precision"]))
    print("Recall " + str(metrics["recall"]))
    print("ROC AUC score " + str(metrics["roc_auc"]))
    print("Kappa score " + str(metrics["kappa"]))

    return metrics
  

In [0]:
 # Hold out 33% of the labelled data, stratified by target.
 # NOTE(review): this rebinds X_train / y_train to the reduced split and
 # X_test to the held-out part, replacing the X_test built from df_test in an
 # earlier cell. Re-running the cell splits the already reduced data again.
 X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42, stratify=y_train)

## Training Models

In [0]:
# Labels as a plain array. The fold indices are positional, whereas y_train
# may be a pandas Series whose index was shuffled by train_test_split;
# `y_train[idx]` would then select by label and misalign (or fail).
y_train_values = np.asarray(y_train)
# probability above which a question is classified as positive
DECISION_THRESHOLD = 0.51

for i, (idx_train, idx_val) in enumerate(kfold.split(X_train, y_train_values)):
    # data
    X_t = X_train[idx_train]
    y_t = y_train_values[idx_train]
    X_v = X_train[idx_val]
    y_v = y_train_values[idx_val]
    model = get_model(embed_weights, CONST_CNN)

    if i == 0:
        # summary() prints by itself and returns None
        model.summary()
    # get class weight
    # Keyword arguments are required by recent scikit-learn, and Keras wants
    # a {class: weight} dict rather than an array.
    classes = np.unique(y_t)
    class_weight_values = utils.class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=y_t)
    weights = dict(zip(classes.tolist(), class_weight_values.tolist()))
    # train
    model.fit(
        X_t, y_t,
        batch_size=512, epochs=5,
        validation_data=(X_v, y_v),
        verbose=2, callbacks=get_callbacks(),
        class_weight=weights)
    # reload best model (checkpoint written by the ModelCheckpoint callback)
    model.load_weights(MODEL_PATH)
    # validation metrics at the fixed decision threshold
    y_proba = model.predict(X_v, batch_size=1024, verbose=2)
    print("Metrics....")
    # calculate_metrics prints its own report
    calculate_metrics(np.squeeze(y_v), (np.squeeze(y_proba) > DECISION_THRESHOLD).astype(int))

## Testing models

In [0]:
# Evaluate the model left over from the last fold on the held-out split.
y_pred = model.predict(X_test, batch_size=1024, verbose=2)
# The report holds several metrics, not accuracy (label was "accurracy....").
print("Test metrics....")
# calculate_metrics prints its own report, so its result is not printed again
calculate_metrics(np.squeeze(y_test), (np.squeeze(y_pred) > 0.51).astype(int))

In [0]:
# Class balance of the current training labels.
# Note: despite the `_idx` suffix these hold the label values, not indices.
y_train_1_idx = y_train[y_train == 1]
y_train_0_idx = y_train[y_train == 0]
print("Toxic count " + str(len(y_train_1_idx)))
# "Intoxic" is not the opposite of toxic; label the negative class plainly.
print("Non-toxic count " + str(len(y_train_0_idx)))