# Part B)

In [79]:
#import statements
import pandas as pd
import numpy as np
import keras
import nltk
from nltk.tokenize import word_tokenize
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras import layers

from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_colwidth', -1)
from sklearn.model_selection import train_test_split

import functions.utils
from functions.utils import *
from functions.utils import clean_text
from functions.utils import load_embedding, create_embedding_weights, max_seq_len
from keras.utils import to_categorical, plot_model
import keras.backend as K
import matplotlib as plt
import models.models as models

import functions.model_evaluation
from functions.model_evaluation import *

import functions.data_manipulation
from functions.data_manipulation import *
from keras.regularizers import l2


In [7]:
# loading datasets: every split is read from a tab-separated text file under data/
train, test, test_ann, trial = [
    pd.read_csv(path, sep = '\t')
    for path in ('data/train.txt', 'data/test.txt', 'data/test_anno.txt', 'data/trial.txt')
]

## Data Preprocessing

#### Data cleaning


In [8]:
#Cleaning the data: run clean_text over both sentence columns of train, test and trial
for frame in (train, test, trial):
    for column in ('sentence_A', 'sentence_B'):
        frame[column] = frame[column].apply(clean_text)

In [9]:
#encoding the target feature
# Fit the encoder ONCE on the training labels and reuse the fitted mapping for the other
# splits. Re-fitting per split (fit_transform) rebuilds the class->integer mapping from
# whatever labels that split happens to contain, so the codes are not guaranteed to agree.
lbl_enc = LabelEncoder()
train['entailment_encoded'] = lbl_enc.fit_transform(train['entailment_judgment'])
trial['entailment_encoded'] = lbl_enc.transform(trial['entailment_judgment'])
# Encode the labels of the annotated test frame itself (previously read from `test`, a
# different frame than the one being written to).
# NOTE(review): assumes test_ann carries an 'entailment_judgment' column — confirm.
test_ann['entailment_encoded'] = lbl_enc.transform(test_ann['entailment_judgment'])

### Tokenizing data

In [10]:
# Path to the pre-trained word-vector file (the name suggests GloVe 6B, 300-d — TODO confirm).
file_name = 'word_embeddings/glove.6B.300d.txt'
# load_embedding is a project helper imported from functions.utils; its result is later
# measured with len(...) and passed to create_embedding_weights.
embeddings = load_embedding(file_name)

In [16]:
# Vocabulary cap given to the Keras Tokenizer: one slot per loaded embedding entry
NUM_WORDS = len(embeddings)

# Collect every sentence of every split (train, test, trial; column A before column B)
# so the tokenizer's word index covers all of them.
sentences = []
for frame in (train, test, trial):
    sentences.extend(frame['sentence_A'])
    sentences.extend(frame['sentence_B'])

tokenize = Tokenizer(num_words = NUM_WORDS)
tokenize.fit_on_texts(sentences)

# Turn each sentence column into lists of integer word indices
sent1_word_seq, sent2_word_seq = (
    tokenize.texts_to_sequences(train[column]) for column in ('sentence_A', 'sentence_B'))
sent1_word_seq_test, sent2_word_seq_test = (
    tokenize.texts_to_sequences(test[column]) for column in ('sentence_A', 'sentence_B'))
sent1_word_seq_trial, sent2_word_seq_trial = (
    tokenize.texts_to_sequences(trial[column]) for column in ('sentence_A', 'sentence_B'))

# word -> index mapping learned by the tokenizer
word_index = tokenize.word_index

In [17]:
#Matrix with the embedding weights
# Size of each word vector; the embedding file name loaded earlier ends in "300d".
embedding_dim = 300
# create_embedding_weights is a project helper imported from functions.utils. Its result is
# passed as the initial `weights` of the Embedding layers built later in this notebook.
embedding_weights = create_embedding_weights(embeddings, embedding_dim, word_index, NUM_WORDS)

In [18]:
# extracting the maximum sequence length across all six tokenized sentence lists.
# The first call seeds the running value; each later call passes the running value back in.
max_seq_length = max_seq_len(sent1_word_seq)
for seqs in (sent2_word_seq,
             sent1_word_seq_test, sent2_word_seq_test,
             sent1_word_seq_trial, sent2_word_seq_trial):
    max_seq_length = max_seq_len(seqs, max_seq_length)

In [19]:
# padding the sequences: every split is padded with the same maxlen (max_seq_length)
sent1_data, sent2_data = (
    pad_sequences(seqs, maxlen = max_seq_length)
    for seqs in (sent1_word_seq, sent2_word_seq))

sent1_data_trial, sent2_data_trial = (
    pad_sequences(seqs, maxlen = max_seq_length)
    for seqs in (sent1_word_seq_trial, sent2_word_seq_trial))

sent1_data_test, sent2_data_test = (
    pad_sequences(seqs, maxlen = max_seq_length)
    for seqs in (sent1_word_seq_test, sent2_word_seq_test))

In [20]:
# Rebind NUM_WORDS to the length of the embedding matrix; the Embedding layers below use
# NUM_WORDS as input_dim together with embedding_weights as their weights.
NUM_WORDS = len(embedding_weights)
# Bare expression so the notebook displays the value as the cell output
NUM_WORDS

2307

## Model

### Siamese LSTM

In [22]:
# Build the model factory for the relatedness task.
# `models` is bound to the module models.models by the notebook's import line
# (`import models.models as models`), so the class has to be reached as models.models;
# calling the module object directly raises "TypeError: 'module' object is not callable".
m1 = models.models(embedding_dim = embedding_dim,
                   NUM_WORDS = NUM_WORDS,
                   embedding_weights = embedding_weights,
                   max_seq_length = max_seq_length,
                   task = 'relatedness',
                   dropout = .1,
                   l2_reg=.0001
                  )

In [23]:
# Build the siamese network through the factory's siames() method; the cell output that
# follows is the resulting model summary (two inputs sharing an embedding and an LSTM).
model = m1.siames()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 32, 300)      692100      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 300)          721200      embedding_1[0][0]          

In [26]:
# Train on the padded training sentence pairs against train['relatedness_score'];
# the trial split (pairs + scores) is passed as validation data. `hist` keeps fit()'s return value.
hist = model.fit([sent1_data, sent2_data], train['relatedness_score'], batch_size = 10, 
                 epochs = 15,
                 validation_data = ([sent1_data_trial, sent2_data_trial], trial['relatedness_score']))

Train on 4500 samples, validate on 500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


#### Model Evaluation

In [27]:
# Predict relatedness for the padded test sentence pairs; the next cell reads element 0 of each row of k.
k = model.predict([sent1_data_test, sent2_data_test])

In [33]:
# Gold relatedness scores come from the annotated test frame
y_true = test_ann['relatedness_score']
# Flatten the prediction rows into a plain list by taking the first element of each row
y_pred = [row[0] for row in k]

In [34]:
# evaluate_relatedness is a project helper that returns three values; the next cell indexes
# the first two with [0] and prints the third as a percentage.
pearson, spearman, mean_abs_deviation = evaluate_relatedness(y_true, y_pred)

In [35]:
# Report the metrics: element 0 of the pearson/spearman results, and the deviation with a "% " suffix.
# sep="" keeps the output text identical to plain string concatenation.
print("pearson: ", pearson[0], sep="")
print("spearman: ", spearman[0], sep="")
print("mean_abs_deviation: ", mean_abs_deviation, "% ", sep="")

pearson: 0.8321203296074714
spearman: 0.7837459793551984
mean_abs_deviation: 15.854661095719488% 


In [45]:
# map_relatedness is a project helper (in scope via one of the star imports). The warning printed
# below this cell shows it executes `test['relatedness_score']= relatedness_result` on the frame
# it receives; presumably it returns the requested cols plus the predicted score — verify in the helper.
siamese_df = map_relatedness(test,y_pred,cols = ['pair_ID']  )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['relatedness_score']= relatedness_result


In [46]:
# Export the siamese LSTM predictions. index = False is passed for consistency with the other
# output_csv calls in this notebook (cnn, deeprnn, bidirectional), so all result files share one layout.
# NOTE(review): output_csv is a project helper; confirm its default for `index`.
output_csv(siamese_df, file_name ='entailment_relatedness/relatedness/siamese_lstm_relatedness.csv', index = False)

### Bidirectional LSTM

### Siamese CNN

In [152]:
import models.models as models

In [153]:
# Re-create the model factory (class `models` inside the module imported as `models`) for the
# relatedness task. This rebinds m1; the arguments are the same as for the siamese LSTM factory.
m1 = models.models(embedding_dim = embedding_dim,
                  NUM_WORDS = NUM_WORDS,
                   embedding_weights = embedding_weights,
                   max_seq_length = max_seq_length,
                   task = 'relatedness',
                   dropout = .1,
                   l2_reg=.0001
                  )

In [154]:
# Build the siamese CNN through the factory's cnn() method.
# The result is bound to `siamese_cnn`, the name the training and prediction cells below use
# (the previous spelling `siames_cnn` left `siamese_cnn` undefined on a fresh kernel).
siamese_cnn = m1.cnn()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
sequential_7 (Sequential)       (None, 250)          731550      input_15[0][0]                   
                                                                 input_16[0][0]                   
__________________________________________________________________________________________________
lambda_10 (Lambda)              (None, 250)          0           sequential_7[1][0]        

In [155]:
# Train the CNN on the padded training sentence pairs against train['relatedness_score'];
# the trial split is passed as validation data. `hist` keeps fit()'s return value.
hist = siamese_cnn.fit([sent1_data, sent2_data], train['relatedness_score'], batch_size = 10, 
                 epochs = 25,
                 validation_data = ([sent1_data_trial, sent2_data_trial], trial['relatedness_score']))

Train on 4500 samples, validate on 500 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [156]:
# Keep a handle on the trained CNN. This must point at siamese_cnn (the network fitted in this
# section); `model` is a different network and is rebound again by later sections.
cnn = siamese_cnn

In [157]:
# Predict relatedness for the padded test sentence pairs with the CNN; the next cell reads
# element 0 of each row of k.
k = siamese_cnn.predict([sent1_data_test, sent2_data_test])

In [158]:
# Gold relatedness scores come from the annotated test frame
y_true = test_ann['relatedness_score']
# Flatten the prediction rows into a plain list by taking the first element of each row
y_pred = [row[0] for row in k]

In [161]:
# evaluate_relatedness is a project helper that returns three values; the next cell indexes
# the first two with [0] and prints the third as a percentage.
pearson, spearman, mean_abs_deviation = evaluate_relatedness(y_true, y_pred)

In [162]:
# Report the metrics: element 0 of the pearson/spearman results, and the deviation with a "% " suffix.
# sep="" keeps the output text identical to plain string concatenation.
print("pearson: ", pearson[0], sep="")
print("spearman: ", spearman[0], sep="")
print("mean_abs_deviation: ", mean_abs_deviation, "% ", sep="")

pearson: 0.42540778986934513
spearman: 0.356766587848538
mean_abs_deviation: 29.027313147387307% 


In [163]:
# map_relatedness is a project helper (in scope via one of the star imports); it is given the
# test frame, the predictions and the columns to keep. Presumably it returns those columns plus
# the predicted score — verify in the helper.
cnn_df = map_relatedness(test,y_pred,cols = ['pair_ID']  )

In [164]:
# Export the CNN predictions through the project helper output_csv, passing index = False.
output_csv(cnn_df, file_name ='entailment_relatedness/relatedness/cnn_relatedness.csv', index = False)

### Deep RNN (time-distributed dense encoder + max-pooling; this variant contains no recurrent layer)

In [138]:
# Explicit imports for the layer/optimizer names used in this cell. None of them appear in the
# notebook's visible import lines (they may only be in scope through the `import *` lines).
# TODO: move these into the top import cell.
from keras.layers import BatchNormalization, Dropout, Lambda, TimeDistributed, concatenate
from keras.optimizers import Adam

# One integer-sequence input per sentence of the pair
input_1 = Input(shape=(max_seq_length,))
input_2 = Input(shape=(max_seq_length,))


def _encode_sentence(sentence_input):
    """Embed one padded sentence and reduce it to a single 300-d vector.

    Pipeline: frozen pre-trained Embedding -> Dense(300, relu) applied at every time step
    -> maximum over axis 1 (the sequence axis of the embedding output).
    New layers are created on every call, so the two sentences do NOT share weights.
    """
    embedded = Embedding(input_dim=NUM_WORDS,
                         output_dim=embedding_dim,  # was hard-coded to 300 for one of the towers
                         weights=[embedding_weights],
                         input_length=max_seq_length,
                         trainable=False)(sentence_input)
    projected = TimeDistributed(Dense(300, activation='relu'))(embedded)
    return Lambda(lambda x: K.max(x, axis=1), output_shape=(300,))(projected)


left_input = _encode_sentence(input_1)
right_input = _encode_sentence(input_2)

# Joint regressor over both sentence vectors: four identical Dense -> Dropout -> BatchNorm stages
x = concatenate([left_input, right_input])
for stage in range(4):
    x = Dense(200, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = BatchNormalization()(x)

# Single-unit output holding the predicted relatedness score
out = Dense(1, activation='selu')(x)

model = Model(inputs=[input_1, input_2], outputs=out)
model.compile(loss='mse', optimizer=Adam(0.0001))

In [139]:
# Train on the padded training sentence pairs against train['relatedness_score'];
# the trial split is passed as validation data. `hist` keeps fit()'s return value.
hist = model.fit([sent1_data, sent2_data], train['relatedness_score'], batch_size = 10, 
                 epochs = 25,
                 validation_data = ([sent1_data_trial, sent2_data_trial], trial['relatedness_score']))

Train on 4500 samples, validate on 500 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [146]:
# Predict relatedness for the padded test sentence pairs; the next cell reads element 0 of each row of k.
k = model.predict([sent1_data_test, sent2_data_test])

In [147]:
# Gold relatedness scores come from the annotated test frame
y_true = test_ann['relatedness_score']
# Flatten the prediction rows into a plain list by taking the first element of each row
y_pred = [row[0] for row in k]

In [148]:
# evaluate_relatedness is a project helper that returns three values; the next cell indexes
# the first two with [0] and prints the third as a percentage.
pearson, spearman, mean_abs_deviation = evaluate_relatedness(y_true, y_pred)

In [149]:
# Report the metrics: element 0 of the pearson/spearman results, and the deviation with a "% " suffix.
# sep="" keeps the output text identical to plain string concatenation.
print("pearson: ", pearson[0], sep="")
print("spearman: ", spearman[0], sep="")
print("mean_abs_deviation: ", mean_abs_deviation, "% ", sep="")

pearson: 0.22068082651994758
spearman: 0.18324760312426733
mean_abs_deviation: 32.146169094015534% 


In [150]:
# map_relatedness is a project helper (in scope via one of the star imports); it is given the
# test frame, the predictions and the columns to keep. Presumably it returns those columns plus
# the predicted score — verify in the helper.
deep_rnn = map_relatedness(test,y_pred,cols = ['pair_ID']  )

In [151]:
# Export this model's predictions through the project helper output_csv, passing index = False.
output_csv(deep_rnn, file_name ='entailment_relatedness/relatedness/deeprnn_relatedness.csv', index = False)

### Bidirectional LSTM

In [117]:
from keras.layers import Concatenate

In [124]:
# Input width of the single-input model: sentence A and sentence B are each padded to
# max_seq_length and then joined side by side, so the width is twice max_seq_length.
# Derived rather than hard-coded (was 64) so it stays correct if the data or padding changes.
max_seq = 2 * max_seq_length

In [125]:
# Explicit imports for the layer/optimizer names used in this cell. None of them appear in the
# notebook's visible import lines (they may only be in scope through the `import *` lines).
# TODO: move these into the top import cell.
from keras.layers import LSTM, Dropout, Flatten
from keras.optimizers import Adam

# Single integer input of width max_seq
main_input = Input(shape=(max_seq,), dtype='int32', name='main_input')

# Frozen pre-trained embeddings -> (batch, max_seq, embedding_dim)
x = Embedding(
            NUM_WORDS,
            embedding_dim,
            weights = [embedding_weights],
            input_length = max_seq,
            trainable = False)(main_input)

drop_out = Dropout(0.3, name='dropout')(x)
# Two independent 125-unit LSTMs reading the sequence in opposite directions,
# each returning its full output sequence: (batch, max_seq, 125)
lstm_fwd = LSTM(125, return_sequences=True, name='lstm_fwd')(drop_out)
lstm_bwd = LSTM(125, return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out)
# Joined on the last axis -> (batch, max_seq, 250).
# NOTE(review): per the Keras docs go_backwards returns the sequence reversed, so position t of
# the two halves refers to different tokens. Everything is flattened into one Dense layer below,
# so this only permutes features here, but re-align first if per-step outputs are ever used.
bilstm = Concatenate()([lstm_fwd, lstm_bwd])
drop_out = Dropout(0.1, name="d_bilstm")(bilstm)
flat_h_star = Flatten(name="flat_h_star")(drop_out)
# Single-unit output holding the predicted relatedness score
out = Dense(1, activation='selu')(flat_h_star)

# Use the inputs=/outputs= keywords (same as the other functional model in this notebook)
# instead of the legacy `output=` spelling.
model = Model(inputs=[main_input], outputs=out)
model.summary()
model.compile(loss='mse',optimizer=Adam())

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 64, 300)      692100      main_input[0][0]                 
__________________________________________________________________________________________________
dropout (Dropout)               (None, 64, 300)      0           embedding_16[0][0]               
__________________________________________________________________________________________________
lstm_fwd (LSTM)                 (None, 64, 125)      213000      dropout[0][0]                    
___________________________________________________________________________________________



In [126]:
# Join the padded sentence-A and sentence-B matrices column-wise so each row holds both
# sentences of a pair (train split, then trial split).
sent = np.hstack((sent1_data, sent2_data))
sent_trial = np.hstack((sent1_data_trial, sent2_data_trial))

In [127]:
# Train on the joined sentence pairs (`sent`) against train['relatedness_score'];
# the joined trial split is passed as validation data. `hist` keeps fit()'s return value.
hist = model.fit(sent, train['relatedness_score'], batch_size = 32, 
                 epochs = 15,
                 validation_data = (sent_trial, trial['relatedness_score']))

Train on 4500 samples, validate on 500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [130]:
# Join the padded test sentence-A and sentence-B matrices column-wise, one row per pair
sent_test = np.hstack((sent1_data_test, sent2_data_test))

In [131]:
# Predict relatedness for the joined test sentence pairs; the next cell reads element 0 of each row of k.
k = model.predict(sent_test)

In [132]:
# Gold relatedness scores come from the annotated test frame
y_true = test_ann['relatedness_score']
# Flatten the prediction rows into a plain list by taking the first element of each row
y_pred = [row[0] for row in k]

In [133]:
# Score THIS model's predictions before printing. Without this call the prints below report
# whatever values an earlier model's evaluation cell left in pearson / spearman /
# mean_abs_deviation (or fail on a fresh kernel if no earlier evaluation ran).
pearson, spearman, mean_abs_deviation = evaluate_relatedness(y_true, y_pred)

# Element 0 of the pearson/spearman results is printed; the deviation is shown as a percentage.
print("pearson: "+str(pearson[0]))
print("spearman: "+str(spearman[0]))
print("mean_abs_deviation: "+str(mean_abs_deviation)+ "% ")

pearson: 0.27552558822749795
spearman: 0.20059533062058138
mean_abs_deviation: 31.209168436051787% 


In [134]:
# map_relatedness is a project helper (in scope via one of the star imports). The warning printed
# below this cell shows it executes `test['relatedness_score']= relatedness_result` on the frame
# it receives; presumably it returns the requested cols plus the predicted score — verify in the helper.
bidir_df = map_relatedness(test,y_pred,cols = ['pair_ID']  )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['relatedness_score']= relatedness_result


In [137]:
# Export the bidirectional LSTM predictions through the project helper output_csv, passing index = False.
output_csv(bidir_df, file_name ='entailment_relatedness/relatedness/bidirectional_lstm_relatedness.csv', index = False)