In [78]:
import pandas as pd
from sklearn import preprocessing
import keras
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import Input, Dense, LSTM, Embedding
from keras.models import Model
from keras.optimizers import Adam
from keras import regularizers

# Data Processing

In [79]:
# Location of the SICK corpus file, relative to this notebook.
filepath = '../../Datasets/SICK/SICK.txt'
# The file is tab-delimited; the first row supplies the column names.
data = pd.read_csv(filepath, delimiter='\t')

In [80]:
# Preview the first five rows to check that the columns parsed as expected.
data.head(n=5)

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN
3,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRIAL
4,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.4,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN


In [81]:
# One row per premise/hypothesis pair.
num_sentences = len(data)
# Width of the softmax output layer; the preview above shows the three
# entailment labels NEUTRAL, ENTAILMENT and CONTRADICTION.
num_labels = 3

We will extract the Premise and Hypothesis along with the Entailment Label to construct our dataset.

In [82]:
# Inputs: the two sentence columns of every pair.
X = data.loc[:, ['sentence_A', 'sentence_B']]
# Targets: the entailment label, still in its string form at this point.
Y = data.loc[:, 'entailment_label']

The labels can be encoded right away, but the sentences must first be tokenized into integer sequences before the network can consume them.

In [83]:
# Encode straight from the source column instead of from `Y`. `Y` is overwritten
# below with the one-hot matrix, so reading it here made the cell fail on a
# second run (LabelEncoder rejects a 2-D, multi-column input).
entailment_labels = data['entailment_label']
le = preprocessing.LabelEncoder().fit(entailment_labels)
# String label -> integer class id -> one-hot row.
Y = np_utils.to_categorical(le.transform(entailment_labels))
print(Y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


sentence_A holds the Premise while sentence_B holds the Hypothesis data.

To tokenize our corpus we will need to stack these two columns into a single series, so that one tokenizer (and therefore one vocabulary) covers both the premises and the hypotheses.

In [84]:
# All premises first, then all hypotheses, laid end to end in a single Series.
stacked = pd.concat([X.sentence_A, X.sentence_B])

In [85]:
# Turn every sentence into a fixed-length integer sequence for the LSTM.
vocab_size = 2048       # vocabulary cap handed to the Tokenizer and the Embedding layer
sequence_length = 32    # target length for pad_sequences and the model's Input shape

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(stacked)
sequences = tokenizer.texts_to_sequences(stacked)
x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

# `stacked` lists all premises followed by all hypotheses, so cutting the
# padded matrix at the row count of `data` recovers the two halves.
pair_count = len(data)
premise_sequences, hypothesis_sequences = x_text[:pair_count], x_text[pair_count:]

# Neural Network

In [86]:
# Training hyperparameters consumed by the compile/fit cell below.
epochs, batch_size = 100, 64
learning_rate = 2.5e-3

In [None]:
# A single embedding table is applied to both sentences of a pair.
embedding_layer = Embedding(vocab_size, 128)

# Premise branch: the LSTM also returns its final hidden and cell state.
premise_inputs = Input(shape=(sequence_length,))
premise_embedding = embedding_layer(premise_inputs)
premise_outputs, state_h, state_c = LSTM(128, return_state=True)(premise_embedding)

# Hypothesis branch: a second LSTM that starts from the premise's final state.
hypothesis_inputs = Input(shape=(sequence_length,))
hypothesis_embedding = embedding_layer(hypothesis_inputs)
hypothesis_outputs = LSTM(128)(hypothesis_embedding, initial_state=[state_h, state_c])

# Classifier head: four L2-regularised ReLU layers, then a softmax over the labels.
dense = hypothesis_outputs
for units in (256, 256, 128, 128):
    dense = Dense(units, activation='relu', kernel_regularizer=regularizers.l2(0.006))(dense)
outputs = Dense(num_labels, activation='softmax', kernel_regularizer=regularizers.l2(0.006))(dense)

In [None]:
# Wire the two input branches and the classifier head into one trainable model.
model = Model(inputs=[premise_inputs, hypothesis_inputs], outputs=outputs)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

model.compile(optimizer=Adam(lr=learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Per the Keras documentation, validation_split holds out the *last* fraction of
# the arrays, taken before any shuffling.
# NOTE(review): confirm the file's row order does not skew that tail slice.
# The returned History object is kept so per-epoch metrics can be inspected later.
history = model.fit([premise_sequences, hypothesis_sequences], Y,
                    batch_size=batch_size, epochs=epochs,
                    shuffle=True, validation_split=0.2)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 32, 128)      262144      input_15[0][0]                   
                                                                 input_16[0][0]                   
__________________________________________________________________________________________________
lstm_15 (LSTM)                  [(None, 128), (None, 131584      embedding_8[0][0]                
__________

Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100