# Data Augmentation

We're going to experiment with augmenting the data. We'll do this by adding noise to the embedding vectors as they go into the model.

In [65]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, Concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.optimizers import Adam, RMSprop
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

In [66]:
# Read the train and test CSV files and print the shape of each DataFrame.
train_path, test_path = "../input/train.csv", "../input/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [67]:
## Split into train and validation sets.
# The split is bound to a new name so that `train_df` keeps the full training
# frame. Rebinding `train_df` here made the cell non-idempotent: running it a
# second time carved another 8% off an already-split frame.
train_split_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)

## Some config values
embed_size = 300  # how big is each word vector
max_features = 95000  # how many unique words to use (i.e. num rows in the embedding matrix)
maxlen = 70  # max number of words in a question to use

## Fill up the missing values
train_text = train_split_df["question_text"].fillna("_##_").values
val_text = val_df["question_text"].fillna("_##_").values
test_text = test_df["question_text"].fillna("_##_").values

## Tokenize the sentences (the vocabulary is fitted on the training split only)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_text))

## Convert the text to integer id sequences and pad them to `maxlen`
train_X = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=maxlen)
val_X = pad_sequences(tokenizer.texts_to_sequences(val_text), maxlen=maxlen)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=maxlen)

## Get the target values
train_y = train_split_df['target'].values
val_y = val_df['target'].values

**Attention Layer:** https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

In [69]:
class Attention(Layer):
    """Attention-weighted pooling over the step axis of a 3D input.

    Every time step is scored by a dot product with a learned weight vector
    (plus an optional learned per-step bias). The scores go through ``tanh``
    and ``exp``, are normalised over the steps, and are used to form a
    weighted sum of the input along the step axis. The output has shape
    ``(batch, features)``.

    Arguments:
        step_dim: number of time steps of the input (its second axis).
        W_regularizer: optional regularizer for the scoring weight vector.
        b_regularizer: optional regularizer for the per-step bias.
        W_constraint: optional constraint for the scoring weight vector.
        b_constraint: optional constraint for the per-step bias.
        bias: whether to add the learned per-step bias to the scores.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector W (one entry per feature) and the optional bias b (one entry per step)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword rather than as the first positional
        # argument, so the call does not depend on the legacy positional form.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return no mask: the step axis is summed away, so nothing is propagated downstream."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every time step: flatten to (batch * steps, features), project
        # onto W, then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the steps; epsilon keeps the denominator non-zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (batch, features): the step axis is reduced."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [70]:
# List what is available under the embeddings input directory.
!ls ../input/embeddings/

glove.840B.300d			paragram_300_sl999
GoogleNews-vectors-negative300	wiki-news-300d-1M


# Load Some Embeddings

We have four different types of embeddings.
 * GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
 * glove.840B.300d - https://nlp.stanford.edu/projects/glove/
 * paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
 * wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html
 
 A very good explanation of the different types of embeddings is given in this [kernel](https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge). Please refer to it for more details.

**Glove Embeddings:**

In this section, let us use the Glove embeddings with LSTM model.

In [71]:
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')


# Parse the embedding file into {word: vector}. The `with` block closes the
# file handle as soon as the dictionary has been built.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

# np.stack expects a sequence of arrays, so materialise the dict view first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Token ids start at 1 (row 0 is the padding id), so a vocabulary smaller than
# max_features needs len(word_index) + 1 rows to cover its highest id.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pretrained vector keep a random vector drawn from the same
# mean / standard deviation as the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Bound by the matrix's actual row count so the assignment cannot go out of range.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
# Peek at the first padded training sequence (integer token ids).
train_X[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,    15, 27975,  1469, 24011], dtype=int32)

In [14]:
# Indexing the embedding matrix with a block of id sequences looks up one vector per token id.
embedding_matrix[train_X[0:10]].shape

(10, 70, 300)

# Data Augmentation

The data augmentation strategy is to add p noisy copies of every training example, where each copy is obtained by adding random noise to the example's embedding vectors. With m original examples the model therefore sees (p+1)\*m examples per pass: the m originals plus p\*m noisy copies.

We could additionally try translations on all the embedding vectors (based on word analogy rationale).

First, let's get a matrix of training examples.

In [15]:
# Number of training sequences and their padded length.
train_X.shape

(1105501, 70)

In [19]:
# Now let's write a generator function that manually converts train data to embedding matrix

def x_generator(x_data, y_data, embedding_matrix, max_features, batch_size = 512):
    """Endlessly yield ``(embedded_x_batch, y_batch)`` pairs in dataset order.

    Arguments:
        x_data: 2D integer array of token ids, one row per example.
        y_data: array of targets aligned with the rows of ``x_data``.
        embedding_matrix: array indexed by token id; ``embedding_matrix[x_batch]``
            replaces every id with its embedding vector.
        max_features: unused; kept so existing callers keep working.
        batch_size: number of rows per batch. The last batch of a pass may be
            smaller when the dataset size is not a multiple of ``batch_size``.

    Yields:
        Tuples ``(x_batch_embeddings, y_batch)``. After the last row the
        generator wraps around to the first row and continues.
    """
    n_samples = x_data.shape[0]

    # lower index of the current batch
    batch_lower = 0

    while True:
        # clip the final batch of a pass to the end of the data
        batch_upper = min(batch_lower + batch_size, n_samples)

        x_batch = x_data[batch_lower:batch_upper, :]
        y_batch = y_data[batch_lower:batch_upper]
        x_batch_embeddings = embedding_matrix[x_batch]

        batch_lower += batch_size

        # Wrap around once the data is exhausted. `>=` (not `>`) matters when
        # the dataset size is an exact multiple of batch_size: otherwise the
        # next slice would start at the very end and produce an empty batch.
        if batch_lower >= n_samples:
            batch_lower = 0

        yield x_batch_embeddings, y_batch

In [20]:
#modifying the generator to augment the data by duplicating the batch and adding noise
def x_generator_augment(x_data, y_data, embedding_matrix, emb_std, max_features, batch_size=512, #
                    augment_factor=4, noise_scale=0.1, shuffle=False):
    """Endlessly yield embedded batches extended with noisy copies of themselves.

    Every yielded batch holds the embedded original rows followed by
    ``augment_factor`` copies of them with Gaussian noise added, so it contains
    ``(augment_factor + 1)`` times as many rows as the underlying slice. The
    targets are repeated to match.

    Arguments:
        x_data: 2D integer array of token ids, one row per example.
        y_data: array of targets aligned with the rows of ``x_data``.
        embedding_matrix: array indexed by token id.
        emb_std: the standard deviation of the embedding matrix.
        max_features: the number of tokenized words (unused; kept so existing
            callers keep working).
        batch_size: the size of the training batch to augment.
        augment_factor: number of noisy copies appended to each batch
            (0 disables augmentation).
        noise_scale: how many standard deviations to scale the noise by.
        shuffle: when True, rows are drawn in a random order that is redrawn
            on every pass over the data. Defaults to False (dataset order).

    Yields:
        Tuples ``(x_augmented, y_augmented)``.
    """
    n_samples = x_data.shape[0]

    # Seed NumPy's global RNG so the noise (and the shuffle order) is
    # reproducible; note that this resets the global random state.
    np.random.seed(0)

    # Only build a permutation when shuffling was requested; previously a
    # permutation and shuffled copies of the data were created on every pass
    # and then never used.
    order = np.random.permutation(n_samples) if shuffle else None

    # lower index of the current batch
    batch_lower = 0

    while True:
        # clip the final batch of a pass to the end of the data
        batch_upper = min(batch_lower + batch_size, n_samples)

        if shuffle:
            rows = order[batch_lower:batch_upper]
            x_batch = x_data[rows]
            y_batch = y_data[rows]
        else:
            x_batch = x_data[batch_lower:batch_upper, :]
            y_batch = y_data[batch_lower:batch_upper]
        batch_embeddings = embedding_matrix[x_batch]

        # the clean batch comes first, followed by its noisy copies
        augmented_batches = [batch_embeddings]
        y_batches = [y_batch]

        for _ in range(augment_factor):
            noise = np.random.normal(0, emb_std * noise_scale, batch_embeddings.shape)
            augmented_batches.append(batch_embeddings + noise)
            y_batches.append(y_batch)

        x_augmented = np.vstack(augmented_batches)
        y_augmented = np.hstack(y_batches)

        batch_lower += batch_size

        # Wrap around once the data is exhausted. `>=` (not `>`) matters when
        # the dataset size is an exact multiple of batch_size: otherwise the
        # next slice would start at the very end and produce an empty batch.
        if batch_lower >= n_samples:
            batch_lower = 0
            if shuffle:
                order = np.random.permutation(n_samples)

        yield x_augmented, y_augmented
    

In [22]:
# Let's test out the generator by looking at the shapes of the data it outputs.
# A throwaway generator is built here because `train_generator` is only created
# in a later cell (so this cell would fail on a fresh kernel), and drawing from
# the real training generator would consume one of its batches.
shape_check_generator = x_generator_augment(train_X, train_y, embedding_matrix, emb_std, max_features,
                                            batch_size=512, augment_factor=0)
x, y = next(shape_check_generator)
print (x.shape, y.shape)

(512, 70, 300) (512,)


In [23]:
# ORIGINAL MODEL CODE        
#inp = Input(shape=(maxlen,))
#x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
#x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
#x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
#x = Attention(maxlen)(x)
#x = Dense(64, activation="relu")(x)
#x = Dense(1, activation="sigmoid")(x)
#model = Model(inputs=inp, outputs=x)
#model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

Let's try a modification of the LSTM attention model, where we also feed the internal states of the LSTMs into the fully connected layers. Note that each LSTM has two internal states (c and s) for each of the forward and backward directions. This results in quite a lot of units in the Concatenate layer, so there is an extra fully connected layer to reduce the number of units more gradually on the way to the final sigmoid classifier.

In [24]:
def build_attention_model(embed_matrix):
    """Build, compile and summarise the two-layer BiLSTM + attention classifier.

    The model takes already-embedded sequences of shape ``(maxlen, embed_size)``
    (both read from the notebook's globals) rather than token ids, so there is
    no Embedding layer and ``embed_matrix`` is not used.

    The attention-pooled output of the second BiLSTM is concatenated with the
    final states of both BiLSTMs and passed through two ReLU layers to a single
    sigmoid unit.

    Returns:
        The compiled Keras model (binary cross-entropy loss, RMSprop optimizer).
    """
    inp = Input(shape=(maxlen, embed_size))

    # With return_state=True each Bidirectional LSTM returns its sequence output
    # followed by four state tensors. Per the Keras documentation the order is
    # forward h, forward c, backward h, backward c; the names below follow that.
    # The position of each tensor in the concatenation is unchanged.
    lstm_1 = Bidirectional(CuDNNLSTM(128, return_sequences=True, return_state=True))
    seq_1, h1_fwd, c1_fwd, h1_bwd, c1_bwd = lstm_1(inp)

    lstm_2 = Bidirectional(CuDNNLSTM(64, return_sequences=True, return_state=True))
    seq_2, h2_fwd, c2_fwd, h2_bwd, c2_bwd = lstm_2(seq_1)

    attended = Attention(maxlen)(seq_2)

    # Fully connected head: attention output plus the states of both LSTMs.
    merged = Concatenate()([attended,
                            h1_fwd, c1_fwd, h1_bwd, c1_bwd,
                            h2_fwd, c2_fwd, h2_bwd, c2_bwd])
    hidden = Dense(256, activation="relu")(merged)
    hidden = Dense(64, activation="relu")(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=inp, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-3), metrics=['accuracy'])
    model.summary()

    return model

And code to evaluate the model (F1 scores at various thresholds) on the validation set

In [25]:
from sklearn import metrics

def calc_f1_scores(model, dev_x, dev_y):
    """Predict on a held-out set and scan decision thresholds for the best F1.

    ``dev_x`` (token ids) is embedded with the notebook-level
    ``embedding_matrix`` before prediction. Thresholds from 0.1 to 0.5 in steps
    of 0.01 are tried; the F1 score at each one is printed.

    Returns:
        ``(best_thresh, best_f1, predictions)``. ``best_thresh`` stays -1 and
        ``best_f1`` stays 0 if no threshold gives an F1 score above 0.
    """
    pred_glove_dev_Y = model.predict([embedding_matrix[dev_x]], batch_size=1024, verbose=1)

    best_thresh, best_f1 = -1, 0

    # Round the grid so the printed thresholds are clean two-decimal values.
    for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
        hard_labels = (pred_glove_dev_Y > thresh).astype(int)
        f1 = metrics.f1_score(dev_y, hard_labels)
        print("F1 score at threshold {0} is {1}".format(thresh, f1))
        if f1 > best_f1:
            best_thresh, best_f1 = thresh, f1

    print("Best F1 score was at threshold {0}, {1}".format(best_thresh, best_f1))
    return (best_thresh, best_f1, pred_glove_dev_Y)

Compare the models. First, the untuned model

In [26]:
# Build a newly initialised model for the first (un-augmented) training run.
model = build_attention_model(embedding_matrix)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 70, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 70, 256), (N 440320      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) [(None, 70, 128), (N 164864      bidirectional_1[0][0]            
__________________________________________________________________________________________________
attention_1 (Attention)         (None, 128)          198         bidirectional_2[0][0]            
__________________________________________________________________________________________________
concatenat

In [28]:
# Configure the generators for the un-augmented run.
batch_size = 1024
num_train_batches = math.ceil(len(train_X) / batch_size)
num_val_batches = math.ceil(len(val_X) / batch_size)
print("num train batches:", num_train_batches)
print("num val batches:", num_val_batches)

# augment_factor=0 appends no noisy copies, so the training batches are just
# the embedded originals.
train_generator = x_generator_augment(
    train_X, train_y, embedding_matrix, emb_std, max_features,
    batch_size=batch_size, augment_factor=0, noise_scale=0.05,
)

val_generator = x_generator(val_X, val_y, embedding_matrix, max_features, batch_size=batch_size)

num train batches: 1080
num val batches: 94


# Testing the data augmentation

Model built, data augmentation algorithm built into the generator, let's test out whether we can get an improvement in prediction accuracy/F1 score by comparing the model trained on the original dataset (trained until it starts to overfit) with the same model trained on the augmented data.

In [29]:
# Train for 3 epochs. Both generators are endless, so the step counts tell
# Keras how many batches make up one pass over the training / validation data.
model.fit_generator(train_generator, steps_per_epoch=num_train_batches, epochs=3, 
                    validation_data=val_generator, validation_steps=num_val_batches)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f71285a48d0>

In [63]:
# Score the model on the validation split and predict the test set.
# NOTE(review): `model` is rebound when the augmented model is built further
# down. This cell's execution count (63) is higher than that of the augmented
# training cell (62), and the F1 values recorded below are identical to the
# ones under "Augmented model" - so this output appears to come from the
# augmented model, not the original one. Re-run the notebook top to bottom
# to confirm.
print ("Original model")
(best_thresh, best_f1, pred_glove_val_y) = calc_f1_scores (model, val_X, val_y)
pred_glove_test_y = model.predict([embedding_matrix[test_X]], batch_size=1024, verbose=1)

Original model
F1 score at threshold 0.1 is 0.5705485635388455
F1 score at threshold 0.11 is 0.5796503420566836
F1 score at threshold 0.12 is 0.5882287679591724
F1 score at threshold 0.13 is 0.5963323522753
F1 score at threshold 0.14 is 0.6032881453706375
F1 score at threshold 0.15 is 0.6103468547912992
F1 score at threshold 0.16 is 0.6155685758699032
F1 score at threshold 0.17 is 0.6214654910307085
F1 score at threshold 0.18 is 0.6262526289743907
F1 score at threshold 0.19 is 0.6311094358587762
F1 score at threshold 0.2 is 0.6364448984803984
F1 score at threshold 0.21 is 0.6400465959099145
F1 score at threshold 0.22 is 0.6439697666776207
F1 score at threshold 0.23 is 0.6474034620505993
F1 score at threshold 0.24 is 0.6503812672919901
F1 score at threshold 0.25 is 0.6523135807531953
F1 score at threshold 0.26 is 0.6541176470588235
F1 score at threshold 0.27 is 0.6565847511027095
F1 score at threshold 0.28 is 0.6584951112370696
F1 score at threshold 0.29 is 0.6604017216642755
F1 score a

## Augmented model

In [60]:
# Rebuild the model so the augmented run starts from newly created layers.
# NOTE(review): this rebinds `model`; the previously trained model is no longer
# reachable under that name, so it must be evaluated before this cell runs.
model = build_attention_model(embedding_matrix)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 70, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional [(None, 70, 256), (N 440320      input_6[0][0]                    
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional [(None, 70, 128), (N 164864      bidirectional_11[0][0]           
__________________________________________________________________________________________________
attention_6 (Attention)         (None, 128)          198         bidirectional_12[0][0]           
__________________________________________________________________________________________________
concatenat

In [61]:
# Configure the generators for the augmented run.
batch_size = 128
num_train_batches = math.ceil(len(train_X) / batch_size)
num_val_batches = math.ceil(len(val_X) / batch_size)
print("num train batches:", num_train_batches)
print("num val batches:", num_val_batches)

# augment_factor=1 appends one noisy copy to every batch, so each training
# batch holds twice as many rows as the slice it was built from.
train_generator = x_generator_augment(
    train_X, train_y, embedding_matrix, emb_std, max_features,
    batch_size=batch_size, augment_factor=1, noise_scale=0.15,
)

val_generator = x_generator(val_X, val_y, embedding_matrix, max_features, batch_size=batch_size)

num train batches: 8637
num val batches: 752


In [62]:
# Train the rebuilt model for 3 epochs on the augmenting generator; validation
# uses the plain (noise-free) generator.
model.fit_generator(train_generator, steps_per_epoch=num_train_batches, epochs=3, 
                    validation_data=val_generator, validation_steps=num_val_batches)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f71383315c0>

In [64]:
# Score the augmented model on the validation split and predict the test set.
# NOTE(review): best_thresh, best_f1 and pred_glove_val_y are reassigned here,
# replacing the values stored by the "Original model" cell; only the test
# predictions get a name of their own.
print ("Augmented model")
(best_thresh, best_f1, pred_glove_val_y) = calc_f1_scores (model, val_X, val_y)
pred_augmented_test_y = model.predict([embedding_matrix[test_X]], batch_size=1024, verbose=1)

Augmented model
F1 score at threshold 0.1 is 0.5705485635388455
F1 score at threshold 0.11 is 0.5796503420566836
F1 score at threshold 0.12 is 0.5882287679591724
F1 score at threshold 0.13 is 0.5963323522753
F1 score at threshold 0.14 is 0.6032881453706375
F1 score at threshold 0.15 is 0.6103468547912992
F1 score at threshold 0.16 is 0.6155685758699032
F1 score at threshold 0.17 is 0.6214654910307085
F1 score at threshold 0.18 is 0.6262526289743907
F1 score at threshold 0.19 is 0.6311094358587762
F1 score at threshold 0.2 is 0.6364448984803984
F1 score at threshold 0.21 is 0.6400465959099145
F1 score at threshold 0.22 is 0.6439697666776207
F1 score at threshold 0.23 is 0.6474034620505993
F1 score at threshold 0.24 is 0.6503812672919901
F1 score at threshold 0.25 is 0.6523135807531953
F1 score at threshold 0.26 is 0.6541176470588235
F1 score at threshold 0.27 is 0.6565847511027095
F1 score at threshold 0.28 is 0.6584951112370696
F1 score at threshold 0.29 is 0.6604017216642755
F1 score 