In [266]:
import pandas as pd
from sklearn import preprocessing
import keras
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# Data Processing

In [267]:
# Location of the SICK corpus, relative to this notebook.
filepath = '../../Datasets/SICK/SICK.txt'
# The file is tab-delimited with a header row; each row is one sentence pair.
data = pd.read_csv(filepath, delimiter='\t')

In [268]:
# Preview the first five rows to confirm the columns parsed as expected.
data.head(n=5)

Unnamed: 0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN
3,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRIAL
4,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.4,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN


In [269]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(9840, 12)

We will extract the Premise and Hypothesis along with the Entailment Label to construct our dataset.

In [270]:
# Model inputs: the two sentence columns of each pair.
X = data.loc[:, ['sentence_A', 'sentence_B']]
# Target: the entailment class assigned to each pair.
Y = data.loc[:, 'entailment_label']

The labels can be one-hot encoded right away, but the sentences are still raw text and must be tokenized before they can be encoded.

In [271]:
# Fit on the raw label column instead of on `Y`: this cell rebinds `Y` to the
# one-hot matrix, so using `Y` as the input would make a second run of the
# cell hand LabelEncoder an already-encoded 2-D array and fail.
le = preprocessing.LabelEncoder().fit(data['entailment_label'])
# Integer class ids -> one-hot rows; column order follows `le.classes_`.
Y = np_utils.to_categorical(le.transform(data['entailment_label']))
print(Y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


sentence_A holds the Premise while sentence_B holds the Hypothesis data.

To tokenize our corpus with a single shared vocabulary, we stack these two columns into one series (all premises first, followed by all hypotheses).

In [272]:
# One Series holding every premise followed by every hypothesis. The order
# matters: the padded sequences are later split back at the original row count.
stacked = pd.concat([X[column] for column in ('sentence_A', 'sentence_B')], axis=0)

In [275]:
# Encode the premise/hypothesis sentences as fixed-length integer sequences for the LSTM
sequence_length = 32  # every sentence is padded or truncated to this many tokens
vocab_size = 2048     # passed to the tokenizer as `num_words` (vocabulary cap)

# A single tokenizer is fitted on both sentence columns so they share word indices.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(stacked)

sequences = tokenizer.texts_to_sequences(stacked)
x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

# `stacked` is premises followed by hypotheses, so split at the row count of `data`.
premise_sequences = x_text[:len(data)]
hypothesis_sequences = x_text[len(data):]

# Neural Network Definition

In [274]:
# The layer classes are reached through the already-imported `keras` package:
# the bare names Input / Embedding / LSTM are not imported in the visible
# notebook, so a fresh-kernel run of this cell would raise NameError.
# Input: one integer token sequence of length `sequence_length` per sample.
text_inputs = keras.layers.Input(shape=(sequence_length,), name='text_input')
# Map each token index (< vocab_size) to a 128-dimensional embedding vector.
text_x = keras.layers.Embedding(vocab_size, 128)(text_inputs)
# Run the embedded sequence through a 128-unit LSTM.
text_x = keras.layers.LSTM(128)(text_x)