### Natural Language Inference using Ensemble Architecture with Attention and Local Inference Modelling



In [120]:
# imports
import pandas as pd
import numpy as np
import tensorflow as tf

In [121]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [122]:
VOCAB_SIZE = 20000
MAX_LENGTH = 35

In [123]:
def load_data(csv_path):
    """Load dataset from CSV file"""
    df = pd.read_csv(csv_path)
    return df["premise"].astype(str).tolist(), df["hypothesis"].astype(str).tolist(), df["label"].values

In [None]:
train_data_path = input("Input the path to train data: ")
val_data_path = input("Input the path to validation data: ")

In [124]:
train_premises, train_hypotheses, train_labels = load_data(train_data_path)
val_premises, val_hypotheses, val_labels = load_data(val_data_path)

In [125]:
# Max length was chosen from this value
premise_lengths = [len(text.split()) for text in train_premises]
hypothesis_lengths = [len(text.split()) for text in train_hypotheses]

max_length = int(np.percentile(premise_lengths + hypothesis_lengths, 95))
max_length = round(max_length / 5) * 5
max_length

35

In [126]:
def tokenize_and_pad(texts, tokenizer, max_length):
    """Convert text to sequences and pad"""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

In [127]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_premises + train_hypotheses)

In [128]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [129]:
X_train_p = tokenize_and_pad(train_premises, tokenizer, MAX_LENGTH)
X_train_h = tokenize_and_pad(train_hypotheses, tokenizer, MAX_LENGTH)
y_train = to_categorical(train_labels, num_classes=2)

In [130]:
X_val_p = tokenize_and_pad(val_premises, tokenizer, MAX_LENGTH)
X_val_h = tokenize_and_pad(val_hypotheses, tokenizer, MAX_LENGTH)
y_val = to_categorical(val_labels, num_classes=2)

In [131]:
EMBEDDING_DIM = 300

In [132]:
def load_glove_embeddings(glove_file, embedding_dim):
    """
    Load the embeddings for every word in the file
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

In [None]:
glove_path = input("Input the path to glove embeddings file: ")
embeddings_index = load_glove_embeddings(glove_path, EMBEDDING_DIM)

In [134]:
def create_embedding_matrix(embedding_index, word_index, embedding_dim, vocab_size):
    """
    Creates an embedding matrix from the GloVe embeddings.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, index in word_index.items():
        if index >= vocab_size:
            break
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    
    return embedding_matrix

In [135]:
word_index = tokenizer.word_index
embedding_matrix = create_embedding_matrix(embeddings_index, word_index, EMBEDDING_DIM, VOCAB_SIZE)

In [136]:
from tensorflow.keras.layers import (Input, Embedding, Bidirectional, GRU, LSTM, Dense, 
                                    Concatenate, Subtract, Multiply, Dropout, Dot, Softmax)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [137]:
BATCH_SIZE = 32
EPOCHS = 20
HIDDEN_DIM = 128

In [138]:
def soft_attention(premise, hypothesis):
    attention = Dot(axes=-1)([premise, hypothesis])
    premise_attn = Softmax(axis=-1)(attention)
    hypothesis_attn = Softmax(axis=-2)(attention)

    premise_aligned = Dot(axes=1)([premise_attn, hypothesis])
    hypothesis_aligned = Dot(axes=1)([hypothesis_attn, premise])
    return premise_aligned, hypothesis_aligned

In [139]:
def build_nli_model(rnn_1, rnn_2, vocab_size, embedding_dim, max_length, hidden_dim, embedding_matrix):
    premise_input = Input(shape=(max_length,), name="premise_input")
    hypothesis_input = Input(shape=(max_length,), name="hypothesis_input")
    
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)
    
    premise_embedded = embedding_layer(premise_input)
    hypothesis_embedded = embedding_layer(hypothesis_input)
    
    # Context Representation using first RNN
    premise_encoded = rnn_1(premise_embedded)
    hypothesis_encoded = rnn_1(hypothesis_embedded)
    
    # Soft Attention
    premise_aligned, hypothesis_aligned = soft_attention(premise_encoded, hypothesis_encoded)
    
    # Local Inference Modeling using Difference & Element-wise Product
    premise_diff = Subtract()([premise_encoded, premise_aligned])
    hypothesis_diff = Subtract()([hypothesis_encoded, hypothesis_aligned])
    premise_mul = Multiply()([premise_encoded, premise_aligned])
    hypothesis_mul = Multiply()([hypothesis_encoded, hypothesis_aligned])
    
    premise_combined = Concatenate()([premise_encoded, premise_aligned, premise_diff, premise_mul])
    hypothesis_combined = Concatenate()([hypothesis_encoded, hypothesis_aligned, hypothesis_diff, hypothesis_mul])
    
    # Inference Composition using second RNN
    premise_composed = rnn_2(premise_combined)
    hypothesis_composed = rnn_2(hypothesis_combined)
    
    # Fully Connected Layers
    merged = Concatenate()([premise_composed, hypothesis_composed])
    dense = Dense(hidden_dim, activation="relu")(merged)
    dense = Dropout(0.5)(dense)
    dense = Dense(hidden_dim // 2, activation="relu")(dense)
    output = Dense(2, activation="softmax")(dense)
    
    model = Model(inputs=[premise_input, hypothesis_input], outputs=output)
    return model


In [150]:
models = {
    "LSTM": build_nli_model(
        LSTM(HIDDEN_DIM, return_sequences=True),
        LSTM(HIDDEN_DIM, return_sequences=False),
        VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH, HIDDEN_DIM, embedding_matrix
    ),
    "BiLSTM": build_nli_model(
        Bidirectional(LSTM(HIDDEN_DIM, return_sequences=True)),
        Bidirectional(LSTM(HIDDEN_DIM, return_sequences=False)),
        VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH, HIDDEN_DIM, embedding_matrix
    ),
    "GRU": build_nli_model(
        GRU(HIDDEN_DIM, return_sequences=True),
        GRU(HIDDEN_DIM, return_sequences=False),
        VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH, HIDDEN_DIM, embedding_matrix
    ),
    "BiGRU": build_nli_model(
        Bidirectional(GRU(HIDDEN_DIM, return_sequences=True)),
        Bidirectional(GRU(HIDDEN_DIM, return_sequences=False)),
        VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH, HIDDEN_DIM, embedding_matrix
    )
}

In [151]:
for name, model in models.items():
    optimizer = Adam(learning_rate=0.0001)
    model.compile(loss='binary_crossentropy', 
                  optimizer=optimizer, 
                  metrics=['accuracy'])

In [153]:
val_accuracies = {}

for name, model in models.items():
    checkpoint_callback = ModelCheckpoint(
        f"nli_{name}_model.keras", 
        monitor="val_accuracy",
        save_best_only=True, 
        mode="max",
        verbose=0
    )
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=4)
    
    history = model.fit(
        [X_train_p, X_train_h], y_train, 
        batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=([X_val_p, X_val_h], y_val),
        callbacks=[checkpoint_callback, early_stop_callback]
    )
    
    best_val_acc = max(history.history["val_accuracy"])
    val_accuracies[name] = best_val_acc

    model.load_weights(f"nli_{name}_model.keras")
    print()

Epoch 1/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - accuracy: 0.5597 - loss: 0.6766 - val_accuracy: 0.6444 - val_loss: 0.6217
Epoch 2/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.6556 - loss: 0.6153 - val_accuracy: 0.6599 - val_loss: 0.6083
Epoch 3/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.6680 - loss: 0.6021 - val_accuracy: 0.6620 - val_loss: 0.6006
Epoch 4/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.6819 - loss: 0.5850 - val_accuracy: 0.6672 - val_loss: 0.5953
Epoch 5/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.6903 - loss: 0.5705 - val_accuracy: 0.6709 - val_loss: 0.5917
Epoch 6/20
[1m764/764[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - accuracy: 0.7047 - loss: 0.5526 - val_accuracy: 0.6755 - val_loss: 0.5896
Epoch 7/20
[1m7

In [None]:
import json

total_acc = sum(val_accuracies.values())
weights = {name: acc/total_acc for name, acc in val_accuracies.items()}

with open("model_accuracy_weights.json", "w") as f:
    json.dump(weights, f)

In [None]:
from sklearn.metrics import classification_report

In [156]:
weighted_sum = np.zeros_like(y_val, dtype=np.float32)
    
for name, model in models.items():
    raw_preds = model.predict([X_val_p, X_val_h])

    # Apply weighting and accumulate
    weighted_preds = raw_preds * weights[name]
    weighted_sum += weighted_preds

# Calculate final predictions
y_pred = np.argmax(weighted_sum, axis=1)

[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step


In [157]:
print(classification_report(val_labels, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7185    0.6636    0.6900      3258
           1     0.7059    0.7565    0.7303      3478

    accuracy                         0.7115      6736
   macro avg     0.7122    0.7100    0.7101      6736
weighted avg     0.7120    0.7115    0.7108      6736



In [158]:
def save_predictions_only(y_pred, output_path):
    """Save only the predictions to a CSV file with a 'predictions' column."""
    df = pd.DataFrame({"predictions": y_pred.flatten()})
    df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

In [159]:
save_predictions_only(y_pred, "Group_47_B.csv.predict")

Predictions saved to Group_47_B.csv.predict
