# Attention based model


Based on the this post and google paper:
https://explosion.ai/blog/deep-learning-formula-nlp



## Import packages

In [61]:
%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json
import keras 


from keras.models import Sequential,Model
from keras.layers import Embedding, Dense, Dropout, Reshape, Merge, BatchNormalization, Lambda ,merge,SpatialDropout1D
from keras.layers import Conv1D , Flatten, Input
from keras.layers.pooling import MaxPooling1D
from keras.layers.convolutional import ZeroPadding1D
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import LSTM

from keras.layers.merge import * 
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import Adam

from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint



from keras import backend as K

from keras.initializers import he_normal



from sklearn.model_selection import train_test_split


## Initialize global variables

In [203]:


path = '/home/ubuntu/quora/'
data_home = path +"data/"

Q1_TRAINING_DATA_FILE = data_home+'cache/q1_train.npy'
Q2_TRAINING_DATA_FILE = data_home+'cache/q2_train.npy'
LABEL_TRAINING_DATA_FILE = data_home+'cache/label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = data_home+'cache/word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = data_home+'cache/nb_words.json'
Q1_TESTING_DATA_FILE = 'q1_test.npy'
Q2_TESTING_DATA_FILE = 'q2_test.npy'


MODEL_WEIGHTS_FILE = path+'weights/conv_weights_v1.h5'
MAX_SEQUENCE_LENGTH = 35
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25

nr_hidden = 128
drop_out=0.2

## Load the dataset, embedding matrix and word count

In [204]:
q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

In [205]:
q1_data.shape,q2_data.shape

((404290, 35), (404290, 35))

## Partition the dataset into train and test sets

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=2019)


In [6]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
# Q1_train = X_train[:,0]
# Q2_train = X_train[:,1]
# Q1_test = X_test[:,0]
# Q2_test = X_test[:,1]

In [7]:
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]



TRAIN: [337176  17039  75113 ...,  41001 336218 115328] TEST: [283851 377233  35357 ..., 392228 325350 150065]


In [8]:
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

## Define the model

#### Pretrained AND tuneable embeddings (skipped)

In [67]:
def create_embedder():
    
    embedder = Sequential()
    embedder.add(Embedding(nb_words + 1, 
                     EMBEDDING_DIM, 
                     weights=[word_embedding_matrix], 
                     input_length=MAX_SEQUENCE_LENGTH, 
                     trainable=False,
                    input_shape=(MAX_SEQUENCE_LENGTH,)))
    embedder.add(TimeDistributed(
                        Dense(
                            200, #reducing dimensionality according to paper
                            activation=None,
                            use_bias=False,
                            name='project')))
    
    return embedder
    

In [70]:
def create_tuneable_embedder():
    
    embedder = Sequential()
    
    # projection embeddings ; 
    #not sure about their effectiveness but including them anyways; more details in the paper
    
    nr_tune = nb_words/2
    
    embedder.add(Lambda(lambda sent: sent % (nr_tune-1)+1,
                     input_shape=(MAX_SEQUENCE_LENGTH,),
                     output_shape=(MAX_SEQUENCE_LENGTH,)))
    
    embedder.add(Embedding(
                    nr_tune,
                    200,
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=None,
                    name='tune',
                    trainable=True))
    embedder.add(SpatialDropout1D(drop_out))
    
    return embedder
    

In [102]:
def create_bilstm():    
    

    encoder = Sequential()
        
    encoder.add(Bidirectional(LSTM(nr_hidden, return_sequences=True,
                                             dropout=drop_out, recurrent_dropout=drop_out)
                             ,input_shape=(MAX_SEQUENCE_LENGTH,200)
                             ))

    encoder.add(TimeDistributed(Dense(nr_hidden, activation='relu', kernel_initializer= he_normal())))
    encoder.add(TimeDistributed(Dropout(drop_out)))
    encoder.add(TimeDistributed(BatchNormalization()))

 
        
    
    return encoder


In [103]:

q1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
q2 = Input(shape=(MAX_SEQUENCE_LENGTH,))#,EMBEDDING_DIM

default_embedder = create_embedder()
tuneable_embedder = create_tuneable_embedder()

q1_pretrained_embed = default_embedder(q1)
q1_tuneable_embed = tuneable_embedder(q1)
q1_embed = merge([q1_pretrained_embed, q1_tuneable_embed], mode='sum')

q2_pretrained_embed = default_embedder(q2)
q2_tuneable_embed = tuneable_embedder(q2)
q2_embed = merge([q2_pretrained_embed, q2_tuneable_embed], mode='sum')


encoder = create_bilstm()

q1_encoded = encoder(q1_embed)
q2_encoded = encoder(q2_embed)





#### Pretrained Embeddings only (CURRENT)


In [206]:
def create_bilstm():
    
    
    Q = Sequential()
    
    # Embed
    Q.add(Embedding(nb_words + 1, 
                     EMBEDDING_DIM, 
                     weights=[word_embedding_matrix], 
                     input_length=MAX_SEQUENCE_LENGTH, 
                     trainable=False,
                    input_shape=(MAX_SEQUENCE_LENGTH,)))

    #ToDo: projection

    # Encode 
    Q.add(Bidirectional(LSTM(nr_hidden, return_sequences=True,
                                             dropout=drop_out, recurrent_dropout=drop_out)))

    Q.add(TimeDistributed(Dense(nr_hidden, activation='relu', kernel_initializer= he_normal())))#, init='he_normal'
    Q.add(TimeDistributed(Dropout(drop_out)))
    Q.add(TimeDistributed(BatchNormalization(),name='encoded'))

 
        
    
    return Q


In [207]:

q1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
q2 = Input(shape=(MAX_SEQUENCE_LENGTH,))#,EMBEDDING_DIM

encoder = create_bilstm()

q1_encoded = encoder(q1)
q2_encoded = encoder(q2)


### Attend

In [208]:
#Attention model

attention_model = Sequential()
attention_model.add(Dropout(drop_out,input_shape=(nr_hidden,)))

attention_model.add(Dense(nr_hidden, name='attend1',kernel_initializer= he_normal(),
            activation='relu'))
attention_model.add(BatchNormalization())


attention_model.add(Dropout(drop_out))
attention_model.add(Dense(nr_hidden, name='attend2',kernel_initializer= he_normal(),
            activation='relu'))
attention_model.add(BatchNormalization())

attention_model = TimeDistributed(attention_model,name='attended')

In [209]:
attention1 = attention_model(q1_encoded)
attention2 = attention_model(q2_encoded)


In [210]:
def _outer(AB):
    att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
    return K.permute_dimensions(att_ji,(0, 2, 1))

co_attention = merge([attention1, attention2],
                     mode=_outer,
                     output_shape=(MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH))



### Align

In [211]:
def align(sentence, attention, transpose=False):
        def _normalize_attention(attmat):
            att = attmat[0]
            mat = attmat[1]
            if transpose:
                att = K.permute_dimensions(att,(0, 2, 1))
            # 3d softmax
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            return K.batch_dot(sm_att, mat)
        
        return merge([attention, sentence], mode=_normalize_attention,
                      output_shape=(MAX_SEQUENCE_LENGTH,nr_hidden))#, name='aligned') # Shape: (i, n)

In [212]:
align1 = align(q2_encoded, co_attention) #alignment is normalized attention for the other question
align2 = align(q1_encoded, co_attention, transpose=True)



### Compare

In [213]:
comparison_model = Sequential()
comparison_model.add(Dropout(drop_out, input_shape=(nr_hidden*2,)))

comparison_model.add(Dense(nr_hidden, name='compare1',kernel_initializer="he_normal",activation='relu'))
comparison_model.add(BatchNormalization())
comparison_model.add(Dropout(drop_out))

comparison_model.add(Dense(nr_hidden, name='compare2',kernel_initializer="he_normal",activation='relu'))
comparison_model.add(BatchNormalization())

comparison_model = TimeDistributed(comparison_model,name='compared')

In [214]:
def get_features_by_comparison( sent, align):

    result = comparison_model(merge([sent, align], mode='concat')) # Shape: (i, n)
    avged = GlobalAveragePooling1D()(result)
    maxed = GlobalMaxPooling1D()(result)
    merged = merge([avged, maxed])
    result = BatchNormalization()(merged)
    
    return result

In [215]:


feats1 = get_features_by_comparison(q1_encoded, align1)
feats2 = get_features_by_comparison(q2_encoded, align2)

  app.launch_new_instance()


### Regression

In [216]:
regressor_model = Sequential()

regressor_model.add(Dropout(drop_out, input_shape=(nr_hidden*2,)))
regressor_model.add(Dense(200, name='entail1',kernel_initializer='he_normal',activation='relu'))
regressor_model.add(BatchNormalization())

regressor_model.add(Dropout(drop_out))
regressor_model.add(Dense(200, name='entail2', kernel_initializer='he_normal',activation='relu'))
regressor_model.add(BatchNormalization())

regressor_model.add(Dropout(drop_out))
regressor_model.add(Dense(200, name='entail3', kernel_initializer='he_normal',activation='relu'))
regressor_model.add(BatchNormalization())


regressor_model.add(Dropout(drop_out))
regressor_model.add(Dense(200, name='entail4', kernel_initializer='he_normal',activation='relu'))
regressor_model.add(BatchNormalization())

regressor_model.add(Dense(1, name='entail_out', activation='sigmoid'))

In [217]:
scores = regressor_model(merge([feats1, feats2], mode='concat'))

  if __name__ == '__main__':


In [218]:
model = Model(inputs=[q1, q2], outputs=[scores])
model.compile(loss='binary_crossentropy', 
              optimizer='nadam', 
              metrics=['accuracy'])#, 'precision', 'recall', 'fbeta_score'])

In [219]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_100 (InputLayer)           (None, 35)            0                                            
____________________________________________________________________________________________________
input_101 (InputLayer)           (None, 35)            0                                            
____________________________________________________________________________________________________
sequential_76 (Sequential)       (None, 35, 150)       35915550                                     
____________________________________________________________________________________________________
attended (TimeDistributed)       (None, 35, 150)       46500                                        
___________________________________________________________________________________________

## Train the model, checkpointing weights with best validation accuracy

In [221]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()


MODEL_WEIGHTS_FILE = path+'weights/attention_v1.h5'


callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]

history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=15,
                    batch_size=1024,
                    validation_data = ([Q1_test, Q2_test],y_test),
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2017-04-25 20:30:54.419606
Train on 363861 samples, validate on 40429 samples
Epoch 1/15

KeyboardInterrupt: 

## Plot training and validation accuracy

In [None]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['acc'],
                    'validation': history.history['val_acc']})
ax = acc.ix[:,:].plot(x='epoch', figsize={5,8}, grid=True)
ax.set_ylabel("accuracy")
ax.set_ylim([0.0,1.0]);

In [None]:


print("Starting training at", datetime.datetime.now())
t0 = time.time()

model.optimizer.lr = 1e-4
MODEL_WEIGHTS_FILE2 = path+'weights/attention_v1_lr1e4.h5'


callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE2, monitor='val_acc', save_best_only=True)]

history2 = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=15,
                    batch_size=64,
                    validation_split=VALIDATION_SPLIT,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

## Print best validation accuracy and epoch

without cleaning: Maximum accuracy at epoch 8 = 0.8227
with cleaning: Maximum accuracy at epoch 15 = 0.8268
with cleaning and stratified: 0.8275

In [45]:
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_acc']))
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 10 = 0.8167


## Evaluate the model with best validation accuracy on the test partition

In [46]:
model.metrics_names

['loss', 'acc']

In [47]:
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test) #, precision, recall, fbeta_score
print('')
print('loss      = {0:.4f}'.format(loss))
print('accuracy  = {0:.4f}'.format(accuracy))
# print('precision = {0:.4f}'.format(precision))
# print('recall    = {0:.4f}'.format(recall))
# print('F         = {0:.4f}'.format(fbeta_score))

loss      = 0.4085
accuracy  = 0.8144


In [48]:
preds = model.predict([Q1_test, Q2_test])



In [50]:
from sklearn.metrics import log_loss


print (log_loss(y_test,preds))
print (log_loss(y_test,np.clip(preds,1-0.82,0.82)))
print (log_loss(y_test,np.clip(preds,1-0.90,0.90)))
print (log_loss(y_test,np.clip(preds,1-0.93,0.93)))




0.408477160842
0.443975123093
0.418641389755
0.412511057188


## Making predictions on test set

In [51]:
q1_test_data = np.load(open(data_home+"cache/"+Q1_TESTING_DATA_FILE, 'rb'))
q2_test_data = np.load(open(data_home+"cache/"+Q2_TESTING_DATA_FILE, 'rb'))

In [52]:
q1_test_data[0],q1_test_data[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    5,   22,    1, 1990,  802, 1222,   13,  352,   29,
        1611,  802], dtype=int32),
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    5,   22,    1, 1990,  802, 1222,   13,  352,   29,
        1611,  802], dtype=int32))

In [53]:
%%time
preds = model.predict([q1_test_data,q2_test_data], batch_size=128)

CPU times: user 5min 55s, sys: 1min 15s, total: 7min 11s
Wall time: 7min 11s


In [55]:
preds.shape

(2345796, 1)

In [54]:
clip =0.90

submission_name = "subm/deep_conv_v1_c90.csv"
flattend = preds.flatten()

clipped = np.clip(flattend,1-clip,clip)

In [56]:
df_test = pd.read_csv(data_home+'test.csv')

sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': clipped})
sub.head(2)

Unnamed: 0,is_duplicate,test_id
0,0.1,0
1,0.449301,1


In [57]:
sub.to_csv(path+submission_name, index=False)

In [58]:
from IPython.lib.display import FileLink

FileLink(submission_name)