This notebook attempts to replicate Bowman et al.'s implementation of NLI on their SNLI dataset with a neural network architecture, as well as the neural attention based model seen in Rocktaschel et al.

Unfortunately, due to memory constraints, only a subset of the dataset could be used.

These are the main steps to completing this process:
1. Import SNLI dataset
 - This may require importing only a portion of the dataset in order to save memory
2. Perform an exploratory data analysis
 - Vocabulary size
 - Maximum observation length
 - Distribution of observation lengths based on set type (training, testing, validation)
 - Word frequencies
 - Label frequencies
 - N-grams
3. Preprocess data by
 - Clean -1 labels
 - Tokenize data
 - Pad data
 - Perform embedding
     - We will compute sentence embeddings with MPnet for the basic Bowman et al. model
     - We will do word embeddings for the neural attention model
 - DATA CLEANING DOES NOT INVOLVE REMOVING STOP WORDS, AS THEY WILL HAVE AN ACTUAL EFFECT ON THE ENTAILMENT OF TWO SENTENCES
4. Create, train, and evaluate models
 - Basic Bowman et al. model with MPnet sentence embeddings
 - Basic Bowman et al. model with LSTM sentence embeddings
 - Rocktaschel et al.'s conditional LSTM model and word-by-word attention model, both fed with GloVe word embeddings

## 1. Import SNLI dataset

In [None]:
#Import modules for step 1
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
#Actually import dataset into RAM

# Pull each SNLI split and slice it with [:] so the split is held as a plain
# in-memory mapping of columns (the bigram analysis later iterates over it
# with .items()).
dataset_train, dataset_test, dataset_valid = (
    load_dataset("snli", split=split_name)[:]
    for split_name in ("train", "test", "validation")
)

# Re-wrap the materialised splits so the Dataset API (filter, concatenate, ...)
# is still available under a single DatasetDict.
dataset = DatasetDict(
    {
        "train": Dataset.from_dict(dataset_train),
        "test": Dataset.from_dict(dataset_test),
        "validation": Dataset.from_dict(dataset_valid),
    }
)


## 2. Exploratory Data Analysis

In this section, we attempt to find
 - Vocabulary size
 - Maximum observation length
 - Distribution of observation lengths based on set type (training, testing, validation)
 - Word frequencies
 - Label frequencies
 - N-grams

In [None]:
#Import modules for step 2
from tqdm import tqdm
import statistics
import numpy as np
from datasets import concatenate_datasets
import spacy
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from nltk import ngrams
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 58182256

### Vocabulary size

In [None]:
# Pool all three splits so the vocabulary covers the whole corpus.
full_corpus = concatenate_datasets(
    [dataset[split_name] for split_name in ("train", "test", "validation")]
)
# One flat list of sentences: every premise followed by every hypothesis.
text_full_corpus = full_corpus["premise"] + full_corpus["hypothesis"]
text_full_corpus_string = " ".join(text_full_corpus).lower()
# A "token" here is a lower-cased, whitespace-delimited chunk, so punctuation
# stays attached to the neighbouring word.
vocab_size = len({*text_full_corpus_string.split()})
print("Vocabulary size:", vocab_size)

### Max observation length

In [None]:
# Longest sentence in the pooled corpus, measured in whitespace-delimited
# tokens; stays at 0 when the corpus is empty.
max_len = max(
    (len(sentence.split()) for sentence in tqdm(text_full_corpus)),
    default=0,
)
print(max_len)

### Text length distribution

In [None]:
def text_length_distribution(data, dataset_name:str):
    """Plot and summarise sentence lengths for one dataset split.

    Lengths are counted in whitespace-delimited tokens. Hypothesis and premise
    lengths are drawn as two overlaid, labelled histograms; the mean, median,
    max and min over both columns together are then printed.

    Args:
        data: Mapping-like object exposing "hypothesis" and "premise" columns
            of strings.
        dataset_name: Human-readable split name used in the plot title.
    """
    hypothesis_lengths = [len(text.split()) for text in data["hypothesis"]]
    premise_lengths = [len(text.split()) for text in data["premise"]]

    # Label both series: without a legend the blue and red histograms cannot
    # be told apart by the reader.
    plt.hist(hypothesis_lengths, color="blue", alpha=0.5, bins=20, label="Hypothesis")
    plt.hist(premise_lengths, color="red", alpha=0.5, bins=20, label="Premise")
    plt.xlabel("Text Length")
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {dataset_name} Text Lengths')
    plt.legend()
    plt.show()

    # Build the combined list once instead of once per statistic.
    all_lengths = hypothesis_lengths + premise_lengths
    print(f"Mean: {statistics.mean(all_lengths)}")
    print(f"Median: {statistics.median(all_lengths)}")
    print(f"Max: {max(all_lengths)}")
    print(f"Min: {min(all_lengths)}")

In [None]:
# Same length analysis for every split, in train / test / validation order.
for split_key, split_title in (("train", "Train"), ("test", "Test"), ("validation", "Validation")):
    text_length_distribution(dataset[split_key], split_title)

### Word frequencies
This takes a long time with the whole dataset

In [None]:
def word_frequency_distribution(data, dataset_name:str):
    """Plot the 30 most frequent non-stop-word alphabetic tokens of a split.

    Hypotheses and premises are tokenised with the module-level spaCy pipeline
    `nlp`; tokens flagged as stop words or containing non-alphabetic characters
    are ignored. Counting is case-sensitive (it uses `token.text`).

    Args:
        data: Mapping-like object exposing "hypothesis" and "premise" columns
            of strings.
        dataset_name: Human-readable split name used in the plot title.
    """
    sentences = list(data["hypothesis"]) + list(data["premise"])

    # Stream the sentences through the pipeline one by one. Concatenating the
    # two joined columns directly would fuse the last hypothesis word with the
    # first premise word, and would push one enormous document through spaCy.
    freq_dist = Counter(
        token.text
        for doc in nlp.pipe(sentences)
        for token in doc
        if not token.is_stop and token.is_alpha
    )

    most_common = freq_dist.most_common(30)
    if not most_common:
        # zip(*[]) cannot be unpacked into two names; nothing to plot anyway.
        print(f"No words to plot for {dataset_name}")
        return

    x, y1 = zip(*most_common)
    plt.figure(figsize=(10,5))
    plt.bar(x, y1, alpha=0.5,color="red")
    plt.title(f'Word Frequency Distribution {dataset_name}')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=80)
    plt.show()

In [None]:
# Word-frequency plots per split, smallest split first.
for split_key, split_title in (("validation", "Validation"), ("train", "Training"), ("test", "Testing")):
    word_frequency_distribution(dataset[split_key], split_title)

### Label frequencies

In [None]:
def get_label_frequencies(data, dataset_name:str):
    """Print each label's share of a split and draw the label counts as bars.

    Args:
        data: Sized iterable of examples, each indexable by "label".
        dataset_name: Human-readable split name used in the plot title.
    """
    # Tally labels in first-seen order.
    label_counts = Counter(example["label"] for example in data)

    for label, count in label_counts.items():
        print(f"{label}:{100*(count/len(data))}%")

    plt.bar(list(label_counts.keys()), list(label_counts.values()))
    plt.title(f'Frequency of {dataset_name} Labels')
    plt.xlabel('Label')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Label balance of the raw (unfiltered) splits.
for split_key, split_title in (("validation", "Validation"), ("train", "Train"), ("test", "Test")):
    get_label_frequencies(dataset[split_key], split_title)

### Ngrams

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def top_bigrams(dataset, n=2, top_k=20):
    """Return the most frequent word n-grams across the text columns.

    Every column of `dataset` except "label" is treated as a list of
    sentences. Each sentence is lower-cased, split on whitespace and stripped
    of English stop words before its n-grams are counted. N-grams are built
    per sentence, so no n-gram ever spans the boundary between two unrelated
    sentences.

    Args:
        dataset: Mapping of column name -> list of sentence strings.
        n: Size of the n-grams (defaults to 2, i.e. bigrams).
        top_k: How many of the most common n-grams to return (defaults to 20).

    Returns:
        A list of (ngram_tuple, count) pairs, most frequent first.
    """
    ngram_counts = Counter()

    for key, sentences in dataset.items():
        # The label column holds class ids, not text.
        if key == "label":
            continue

        for sentence in sentences:
            words = [word for word in sentence.lower().split() if word not in stop_words]
            ngram_counts.update(ngrams(words, n))

    return ngram_counts.most_common(top_k)

In [None]:
# Build one newline-separated text column per split: a header line followed by
# one "(bigram, count)" line per top bigram.
training_bigrams = "Train\n"
for bigram, count in top_bigrams(dataset_train):
    training_bigrams += str((bigram, count))+"\n"
valid_bigrams = "Valid\n"
for bigram, count in top_bigrams(dataset_valid):
    valid_bigrams += str((bigram, count))+"\n"
test_bigrams = "Test\n"
for bigram, count in top_bigrams(dataset_test):
    test_bigrams += str((bigram, count))+"\n"
# splitlines() (unlike split("\n")) does not produce an empty final entry for
# the trailing newline, so no blank row is printed at the end of the table.
zipped = zip(training_bigrams.splitlines(), valid_bigrams.splitlines(), test_bigrams.splitlines())

# Emit the rows as a LaTeX table body: cells separated by "&", each row ended
# by "\\" and followed by an \hline rule.
for row in zipped:
    print(f"{row[0].ljust(25)}&{row[1].ljust(25)}&{row[2].ljust(25)}\\\\")
    print("\\hline")

## 3. Clean data

In [None]:
# Import modules for step 3
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils import to_categorical

from multiprocessing import Pool

Remove -1s from dataset

In [None]:
# Keep only examples whose label is not -1, in every split of the DatasetDict.
# NOTE(review): -1 presumably marks examples without an agreed gold label - confirm.
filtered_dataset = dataset.filter(lambda example: example['label'] != -1)

### Test new label frequencies

In [None]:
# Re-check the label balance now that the -1 examples are gone.
for split_key, split_title in (
    ("validation", "Filtered Validation"),
    ("train", "Filtered Train"),
    ("test", "Filtered Test"),
):
    get_label_frequencies(filtered_dataset[split_key], split_title)

### Find the new vocab size

In [None]:
# From here on only the filtered train and test splits are used, so the
# corpus (and therefore the vocabulary) is rebuilt from those two alone.
full_corpus = concatenate_datasets(
    [filtered_dataset[split_name] for split_name in ("train", "test")]
)
text_full_corpus = full_corpus["premise"] + full_corpus["hypothesis"]
text_full_corpus_string = " ".join(text_full_corpus).lower()
# Unique lower-cased, whitespace-delimited tokens.
vocab_size = len({*text_full_corpus_string.split()})
print("Vocabulary size:", vocab_size)

### New Max observation length

In [None]:
# Longest sentence (in whitespace-delimited tokens) of the filtered corpus.
# This value is later used as the padding length and as the models' sequence
# dimension; it stays at 0 when the corpus is empty.
max_len = max(
    (len(sentence.split()) for sentence in tqdm(text_full_corpus)),
    default=0,
)
print(max_len)

### Sentence embeddings from MPnet

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained all-mpnet-base-v2 sentence encoder, pinned to the "cuda" device.
# NOTE(review): presumably this fails on a machine without a CUDA GPU - confirm,
# and consider making the device configurable.
sbert_model = SentenceTransformer('all-mpnet-base-v2', device="cuda")
# Limit the encoder's input length to 100.
# NOTE(review): presumably this is counted in model tokens, not words - confirm.
sbert_model.max_seq_length = 100

In [None]:
# Encode every premise/hypothesis column of the filtered train and test splits
# into one MPnet sentence vector per example.
# NOTE: the GloVe section further down assigns to these same four names, so
# after running it they no longer hold the MPnet vectors.
train_prem, train_hyp, test_prem, test_hyp = (
    sbert_model.encode(filtered_dataset[split_name][column], show_progress_bar=True)
    for split_name in ("train", "test")
    for column in ("premise", "hypothesis")
)



### Word embeddings with GloVe

#### Tokenize and pad

In [None]:
# Fit the word -> integer-id vocabulary on all premise and hypothesis text in
# full_corpus (the filtered train + test splits built in the vocab-size cell).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(full_corpus["premise"]+full_corpus["hypothesis"])
# Turn every sentence into its list of word ids.
train_premise_sequences = tokenizer.texts_to_sequences(filtered_dataset["train"]["premise"])
train_hypothesis_sequences = tokenizer.texts_to_sequences(filtered_dataset["train"]["hypothesis"])
test_premise_sequences = tokenizer.texts_to_sequences(filtered_dataset["test"]["premise"])
test_hypothesis_sequences = tokenizer.texts_to_sequences(filtered_dataset["test"]["hypothesis"])

# Pad at the end ('post') so every sequence has exactly max_len ids.
# NOTE(review): max_len was measured with str.split(); the Tokenizer may split
# sentences differently, in which case longer sequences would be cut - confirm.
padded_train_premise_sequences = pad_sequences(train_premise_sequences, padding='post', maxlen=max_len)
padded_train_hypothesis_sequences = pad_sequences(train_hypothesis_sequences, padding='post', maxlen=max_len)

padded_test_premise_sequences = pad_sequences(test_premise_sequences, padding='post', maxlen=max_len)
padded_test_hypothesis_sequences = pad_sequences(test_hypothesis_sequences, padding='post', maxlen=max_len)

# Register id 0 under the placeholder word "NULL" so that padded positions can
# be looked up like any other id (assumes pad_sequences pads with 0 - TODO confirm).
tokenizer.word_index["NULL"] = 0
tokenizer.index_word[0] = "NULL"

GloVe

In [None]:
#FROM : https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
# Word -> float32 vector lookup read from the GloVe text file, where each line
# is a word followed by its whitespace-separated vector components.
embeddings_dict = {}

with open("glove.6B.100d.txt", 'r', encoding="utf-8") as glove_file:
    embeddings_dict.update(
        (fields[0], np.asarray(fields[1:], "float32"))
        for fields in map(str.split, glove_file)
    )

In [None]:
# Map every word known to the tokenizer to a 100-d vector: the pretrained
# GloVe vector when one exists, otherwise a random vector drawn uniformly
# from [0, 1).
embedding_dict_tokenized = {}
for token in tqdm(tokenizer.word_index):
    # Test membership on the dict itself (a hash lookup). Materialising
    # list(embeddings_dict.keys()) for every token rebuilt and scanned the
    # whole GloVe vocabulary on each iteration.
    if token in embeddings_dict:
        embedding_dict_tokenized[token] = embeddings_dict[token]
    else:
        embedding_dict_tokenized[token] = np.random.rand(100)
# The padding placeholder always maps to the all-zero vector.
embedding_dict_tokenized["NULL"] = np.zeros(100)

In [None]:
def embedded_sequence(sequences, embedding_dict_tokenized):
    """Replace every word id in `sequences` by its embedding vector.

    Ids are translated back to words through the module-level
    `tokenizer.index_word`, and each word is then looked up in
    `embedding_dict_tokenized`.

    Args:
        sequences: Iterable of sequences of integer word ids.
        embedding_dict_tokenized: Mapping of word -> embedding vector.

    Returns:
        A list with one inner list per input sequence, holding that
        sequence's embedding vectors in order.
    """
    return [
        [embedding_dict_tokenized[tokenizer.index_word[token_id]] for token_id in id_sequence]
        for id_sequence in tqdm(sequences)
    ]

In [None]:
# Embed the padded training sequences word by word.
# NOTE: this reuses the names train_hyp / train_prem that the MPnet cell
# assigned earlier, so the MPnet sentence vectors are replaced from here on.
train_hyp = embedded_sequence(padded_train_hypothesis_sequences, embedding_dict_tokenized)
train_prem = embedded_sequence(padded_train_premise_sequences, embedding_dict_tokenized)

In [None]:
# Embed the padded test sequences word by word.
# NOTE: this reuses the names test_hyp / test_prem that the MPnet cell
# assigned earlier, so the MPnet sentence vectors are replaced from here on.
test_hyp = embedded_sequence(padded_test_hypothesis_sequences, embedding_dict_tokenized)
test_prem = embedded_sequence(padded_test_premise_sequences, embedding_dict_tokenized)

### One hot encode labels

In [None]:
# One-hot encode the labels of the filtered splits into 3 classes, matching
# the 3-unit softmax output and categorical cross-entropy loss of the models below.
y_train = to_categorical(filtered_dataset["train"]["label"], num_classes=3)
y_test = to_categorical(filtered_dataset["test"]["label"], num_classes=3)

## 4. Create, train, and evaluate models

In [None]:
# Import modules for step 4
from tensorflow.keras.layers import Input, Dense, Concatenate, Embedding, LSTM, Activation, Layer
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping
import tensorflow
import keras.backend as K

#### Word-by-Word Attention Based Model

Create basic layers for trainable weights

In [None]:
class Trainable_Matrix(Layer):
    """Bias-free linear projection: multiplies its input by a trainable matrix.

    The weight matrix has shape (input_dim, units), where input_dim is the
    size of the last input axis, and is initialised from a random normal
    distribution.

    Args:
        units: Size of the last axis of the output.
        **kwargs: Standard Keras Layer arguments (e.g. name, dtype).
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create the weight matrix once the input's last dimension is known."""
        # add_weight registers the variable with the layer, so it is tracked
        # as a trainable weight and saved with the model.
        self.w = self.add_weight(
            name="w",
            shape=(input_shape[-1], self.units),
            initializer=tensorflow.random_normal_initializer(),
            trainable=True,
        )

    def call(self, inputs):
        """Return inputs @ w."""
        return tensorflow.matmul(inputs, self.w)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"units": self.units})
        return config



In [None]:
def create_wordword_attention_SNLI_model():
    """Build the word-by-word attention classifier.

    Both inputs are sequences of max_len word vectors of size 100. Each
    sentence is encoded by its own LSTM that returns one 64-d state per word.
    Every hypothesis state (query) then attends over all premise states
    (value/key); the attended sequence is flattened and fed to a 3-way softmax.

    Returns:
        An uncompiled Keras Model taking [premise, hypothesis] inputs.
    """
    input_premise = Input(shape=(max_len, 100))
    input_hypothesis = Input(shape=(max_len, 100))
    lstm_premise = LSTM(units=64, return_sequences=True)(input_premise) #key / value
    # The query must be a sequence as well: Attention expects
    # (batch, steps, dim) tensors, so the hypothesis LSTM returns all its states.
    lstm_hypothesis = LSTM(units=64, return_sequences=True)(input_hypothesis) #query
    # Attention takes its inputs as [query, value]: the hypothesis words query
    # the premise words.
    attn = tensorflow.keras.layers.Attention()([lstm_hypothesis, lstm_premise])
    attn_flat = tensorflow.keras.layers.Flatten()(attn)
    output = Dense(units=3, activation="softmax")(attn_flat)
    model = tensorflow.keras.models.Model(inputs=[input_premise, input_hypothesis], outputs=output)
    return model
# Instantiate the attention model and compile it with Adam (learning rate 0.001),
# categorical cross-entropy (labels are one-hot) and accuracy as the reported metric.
wordword_attention_model = create_wordword_attention_SNLI_model()
optimizer =  tensorflow.keras.optimizers.Adam(learning_rate=0.001)
wordword_attention_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
wordword_attention_model.summary()

Model fitting

In [None]:
# Stop once the *training* loss (no validation data is passed to fit) has not
# improved by at least 0.0005 for 2 epochs, and roll back to the best weights.
early_stop = EarlyStopping(monitor='loss', patience=2, min_delta=0.0005, restore_best_weights=True)
wordword_attention_model.fit(
    [np.array(train_prem), np.array(train_hyp)],
    y_train,
    epochs=10,
    callbacks=[early_stop],
)

Testing & Performance

In [None]:
from sklearn.metrics import classification_report

# test_prem / test_hyp are nested Python lists of word vectors; convert them
# to dense arrays exactly as the fit call does, so predict receives two array
# inputs rather than a deeply nested list structure.
wordword_pred = wordword_attention_model.predict([np.array(test_prem), np.array(test_hyp)])
# Collapse one-hot targets and softmax outputs to class ids for the report.
correct_labels = [np.argmax(x) for x in y_test]
wordword_pred_labels = [np.argmax(x) for x in wordword_pred]
print(classification_report(correct_labels, wordword_pred_labels))

#### Conditional LSTM model

In [None]:
def create_conditional_LSTM_SNLI_model():
    """Build the conditional-encoding LSTM classifier.

    The premise is read by a 64-unit LSTM whose final hidden and cell states
    initialise a second 64-unit LSTM that reads the hypothesis. The second
    LSTM's final output feeds a 3-way softmax.

    Returns:
        An uncompiled Keras Model taking [premise, hypothesis] inputs, each a
        sequence of max_len word vectors of size 100.
    """
    premise_in = Input(shape=(max_len, 100))
    hypothesis_in = Input(shape=(max_len, 100))

    # return_state=True yields (output, hidden_state, cell_state); only the two
    # states are handed on to the hypothesis reader.
    _, premise_h, premise_c = LSTM(units=64, return_state=True)(premise_in)
    hypothesis_encoding = LSTM(units=64)(hypothesis_in, initial_state=[premise_h, premise_c])

    class_probs = Dense(units=3, activation="softmax")(hypothesis_encoding)
    return tensorflow.keras.models.Model(inputs=[premise_in, hypothesis_in], outputs=class_probs)
# Instantiate the conditional LSTM model and compile it with Adam,
# categorical cross-entropy (labels are one-hot) and accuracy as the reported metric.
conditional_lstm_model = create_conditional_LSTM_SNLI_model()
# NOTE(review): learning_rate=0.05 is 50x the 0.001 used for the attention
# model's Adam optimizer - confirm this is intentional.
optimizer =  tensorflow.keras.optimizers.Adam(learning_rate=0.05)
conditional_lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
conditional_lstm_model.summary()

Model fitting

In [None]:
# Train for a fixed 10 epochs on the word-embedded training pairs
# (no early stopping and no validation data here).
conditional_lstm_model.fit(
    [np.array(train_prem), np.array(train_hyp)],
    y_train,
    epochs=10,
)

Testing & Performance

In [None]:
from sklearn.metrics import classification_report

# test_prem / test_hyp are nested Python lists of word vectors; convert them
# to dense arrays exactly as the fit call does, so predict receives two array
# inputs rather than a deeply nested list structure.
conditionalLSTM_pred = conditional_lstm_model.predict([np.array(test_prem), np.array(test_hyp)])
# Collapse one-hot targets and softmax outputs to class ids for the report.
correct_labels = [np.argmax(x) for x in y_test]
conditionalLSTM_pred_labels = [np.argmax(x) for x in conditionalLSTM_pred]
print(classification_report(correct_labels, conditionalLSTM_pred_labels))

#### Bowman et al. LSTM sentence embeddings

In [None]:
def create_bowman_LSTM_SNLI_model():
    """Build the Bowman-style classifier on top of LSTM sentence encoders.

    Premise and hypothesis (sequences of max_len word vectors of size 100) are
    each summarised by their own 100-unit LSTM. The two sentence vectors are
    concatenated and passed through three 200-unit tanh layers and a 3-way
    softmax.

    Returns:
        An uncompiled Keras Model taking [premise, hypothesis] inputs.
    """
    premise_in = Input(shape=(max_len, 100))
    hypothesis_in = Input(shape=(max_len, 100))

    premise_vec = LSTM(units=100)(premise_in)
    hypothesis_vec = LSTM(units=100)(hypothesis_in)

    hidden = Concatenate()([premise_vec, hypothesis_vec])
    # Stack of three identical tanh layers.
    for _ in range(3):
        hidden = Dense(units=200, activation="tanh")(hidden)

    class_probs = Dense(units=3, activation="softmax")(hidden)
    return tensorflow.keras.models.Model(inputs=[premise_in, hypothesis_in], outputs=class_probs)
# Instantiate the LSTM-encoder Bowman model and compile it with plain SGD
# (learning rate 0.1), categorical cross-entropy (labels are one-hot) and
# accuracy as the reported metric.
lstm_bowman_model = create_bowman_LSTM_SNLI_model()
optimizer =  tensorflow.keras.optimizers.SGD(learning_rate=0.1)
lstm_bowman_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
lstm_bowman_model.summary()

Training

In [None]:
# Train for a fixed 10 epochs on the word-embedded training pairs.
lstm_bowman_model.fit(
    [np.array(train_prem), np.array(train_hyp)],
    y_train,
    epochs=10,
)

Testing & Performance

In [None]:
from sklearn.metrics import classification_report

# Predict with the trained model instance; create_bowman_LSTM_SNLI_model is
# the factory function and has no .predict. The inputs are converted to dense
# arrays as in the fit call, and the results get their own names so they do
# not overwrite the attention model's predictions.
lstm_bowman_pred = lstm_bowman_model.predict([np.array(test_prem), np.array(test_hyp)])
# Collapse one-hot targets and softmax outputs to class ids for the report.
correct_labels = [np.argmax(x) for x in y_test]
lstm_bowman_pred_labels = [np.argmax(x) for x in lstm_bowman_pred]
print(classification_report(correct_labels, lstm_bowman_pred_labels))

#### Create the Bowman "basic" model with MPnet sentence embeddings

In [None]:
def create_bowman_SNLI_model():
    """Build the basic Bowman-style classifier on precomputed sentence vectors.

    Premise and hypothesis each arrive as one 768-d sentence embedding. The
    two vectors are concatenated and passed through three 200-unit tanh layers
    and a 3-way softmax.

    Returns:
        An uncompiled Keras Model taking [premise, hypothesis] inputs.
    """
    # shape must be a tuple: (768) is just the integer 768, (768,) is a
    # one-element shape.
    input_premise = Input(shape=(768,))
    input_hypothesis = Input(shape=(768,))
    concat_premise_hypothesis = Concatenate()([input_premise, input_hypothesis])
    tanh1 = Dense(units=200, activation="tanh")(concat_premise_hypothesis)
    tanh2 = Dense(units=200, activation="tanh")(tanh1)
    tanh3 = Dense(units=200, activation="tanh")(tanh2)
    output = Dense(units=3, activation="softmax")(tanh3)
    model = tensorflow.keras.models.Model(inputs=[input_premise, input_hypothesis], outputs=output)
    return model
# Instantiate the sentence-embedding Bowman model and compile it with plain SGD
# (learning rate 0.1), categorical cross-entropy (labels are one-hot) and
# accuracy as the reported metric.
bowman_model = create_bowman_SNLI_model()
optimizer =  tensorflow.keras.optimizers.SGD(learning_rate=0.1)
bowman_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
bowman_model.summary()

Training

In [None]:
# The GloVe section reuses the names train_prem / train_hyp for sequences of
# word vectors, so on a top-to-bottom run they no longer hold the 768-d MPnet
# sentence vectors this model takes as input. Re-encode the training sentences
# in that case instead of failing on an input-shape mismatch.
if np.shape(train_prem[0]) != (768,):
    train_prem = sbert_model.encode(filtered_dataset["train"]["premise"], show_progress_bar=True)
    train_hyp = sbert_model.encode(filtered_dataset["train"]["hypothesis"], show_progress_bar=True)
bowman_model.fit([np.array(train_prem), np.array(train_hyp)], y_train,epochs=10)

Testing & Performance

In [None]:
from sklearn.metrics import classification_report

# The GloVe section reuses the names test_prem / test_hyp for sequences of
# word vectors, so on a top-to-bottom run they no longer hold the 768-d MPnet
# sentence vectors this model takes as input. Re-encode the test sentences in
# that case instead of failing on an input-shape mismatch.
if np.shape(test_prem[0]) != (768,):
    test_prem = sbert_model.encode(filtered_dataset["test"]["premise"], show_progress_bar=True)
    test_hyp = sbert_model.encode(filtered_dataset["test"]["hypothesis"], show_progress_bar=True)
basic_pred = bowman_model.predict([np.array(test_prem), np.array(test_hyp)])
# Collapse one-hot targets and softmax outputs to class ids for the report.
correct_labels = [np.argmax(x) for x in y_test]
basic_pred_labels = [np.argmax(x) for x in basic_pred]
print(classification_report(correct_labels, basic_pred_labels))