# Import modules and dependencies

In [1]:
import torch
import json
from typing import List
from torch.utils.data import Dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    DistilBertModel,
    DistilBertPreTrainedModel,
    DistilBertConfig,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score

Helper functions for reading data

In [13]:
# Parse the input file from JSONL to a list of dictionaries.
def read_jsonl_lines(input_file: str) -> List[dict]:
    with open(input_file) as f:
        lines = f.readlines()
        return [json.loads(l.strip()) for l in lines]

# Function to read labels from a .lst file where each line is '1' or '2'
def read_labels(file_path):
    with open(file_path, 'r') as f:
        labels = [line.strip() for line in f if line.strip() in ['1', '2']]
    return labels

Custom dataset for processing the aNLI task

In [14]:
# Define a custom dataset class for the aNLI task
class aNLIDataset(Dataset):
    def __init__(self, data, labels_list, tokenizer, max_length=128):
        self.encodings = []
        self.labels = []
        for i, entry in enumerate(data):
            # Use the corresponding label from the file for this story.
            # If label is '1': hyp1 is correct (label 1), hyp2 is incorrect (label 0).
            # If label is '2': hyp1 is incorrect (label 0), hyp2 is correct (label 1).
            label_str = labels_list[i]
            if label_str == '1':
                label_hyp1 = 1
                label_hyp2 = 0
            else:  # label_str == '2'
                label_hyp1 = 0
                label_hyp2 = 1

            # Create input strings combining the observations and each hypothesis
            text1 = f"Observation 1: {entry['obs1']} Observation 2: {entry['obs2']} Hypothesis: {entry['hyp1']}"
            text2 = f"Observation 1: {entry['obs1']} Observation 2: {entry['obs2']} Hypothesis: {entry['hyp2']}"

            # Tokenize the texts
            enc1 = tokenizer(text1, truncation=True, padding='max_length', max_length=max_length)
            enc2 = tokenizer(text2, truncation=True, padding='max_length', max_length=max_length)

            self.encodings.append(enc1)
            self.labels.append(label_hyp1)
            self.encodings.append(enc2)
            self.labels.append(label_hyp2)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.encodings[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Helper predict function
def predict(obs1, obs2, hyp):
    # Combine observations and hypothesis into one input string
    input_text = f"Observation 1: {obs1} Observation 2: {obs2} Hypothesis: {hyp}"

    # Tokenize and return tensors
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # 3. Move inputs to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # 4. Run the model forward pass
    outputs = model(**inputs)

    # 5. Get softmax probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    return probs.detach().cpu().numpy()  # move back to CPU if needed for further processing

# Baseline Model

In [2]:
# Load the pre-trained DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
# Read the data
data = read_jsonl_lines('/content/anli_data/train.jsonl')

# Read the labels
labels = read_labels('/content/anli_data/train-labels.lst')

# Instantiate the training dataset
train_dataset = aNLIDataset(data, labels, tokenizer)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='/content/results',
    num_train_epochs=3,
    do_train=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    max_steps=2500
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
    # You can also add an evaluation dataset using eval_dataset=...
)

# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohitgsridhar[0m ([33mmohitgsridhar-dell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.6974
20,0.6904
30,0.7125
40,0.6885
50,0.7
60,0.6932
70,0.6951
80,0.7053
90,0.6956
100,0.6882


TrainOutput(global_step=2500, training_loss=0.614001502418518, metrics={'train_runtime': 587.9148, 'train_samples_per_second': 68.037, 'train_steps_per_second': 4.252, 'total_flos': 1324673986560000.0, 'train_loss': 0.614001502418518, 'epoch': 0.11788560381006272})

Example output

In [6]:
# Example usage
obs1 = "Chad went to get the wheel alignment measured on his car."
obs2 = "The mechanic provided a working alignment with new body work."
hyp1 = "Chad was waiting for his car to be washed."
hyp2 = "Chad was waiting for his car to be finished."

probs_hyp1 = predict(obs1, obs2, hyp1)
probs_hyp2 = predict(obs1, obs2, hyp2)

print("Hypothesis 1 probabilities:", probs_hyp1)
print("Hypothesis 2 probabilities:", probs_hyp2)

Hypothesis 1 probabilities: [[0.7841545  0.21584548]]
Hypothesis 2 probabilities: [[0.50128615 0.49871385]]


Evaluate on dev data

In [11]:
# Test on Dev Data
dev_data = read_jsonl_lines('/content/anli_data/dev.jsonl')
dev_labels = read_labels('/content/anli_data/dev-labels.lst')

In [10]:
def evaluate_dev_set(dev_data, dev_labels):
    correct = 0

    for i, entry in enumerate(dev_data):
        # Extract observations and hypotheses
        obs1 = entry['obs1']
        obs2 = entry['obs2']
        hyp1 = entry['hyp1']
        hyp2 = entry['hyp2']

        # Get softmax probabilities for each hypothesis
        probs_hyp1 = predict(obs1, obs2, hyp1)  # shape: [1, 2]
        probs_hyp2 = predict(obs1, obs2, hyp2)  # shape: [1, 2]

        # Compare the probability of label '1' for each hypothesis
        # (index 1 in the softmax is often considered the "positive" label)
        score_hyp1 = probs_hyp1[0][1]
        score_hyp2 = probs_hyp2[0][1]

        # Predicted hypothesis is whichever has higher score for label=1
        predicted_label = 1 if score_hyp1 > score_hyp2 else 2

        # Convert the ground-truth label from '1'/'2' to an integer
        gold_label = int(dev_labels[i])

        # Check correctness
        if predicted_label == gold_label:
            correct += 1

    # Calculate accuracy
    accuracy = correct / len(dev_labels)
    return accuracy

# Test on Dev Data
dev_data = read_jsonl_lines('/content/anli_data/dev.jsonl')
dev_labels = read_labels('/content/anli_data/dev-labels.lst')

# Now call evaluate_dev_set on your dev data
dev_accuracy = evaluate_dev_set(dev_data, dev_labels)
print("Dev Accuracy:", dev_accuracy)

Dev Accuracy: 0.5711488250652742


# Experiment #1: Adding Attention

In [6]:
import torch.nn as nn

In [9]:
# Define a custom model that adds a basic attention mechanism
class DistilBERTWithAttention(DistilBertPreTrainedModel):
    def __init__(self, config):
        super(DistilBERTWithAttention, self).__init__(config)
        self.num_labels = config.num_labels
        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        # A trainable attention layer: project each token’s hidden state to a scalar
        self.attention_layer = nn.Linear(config.hidden_size, 1)
        # Classifier takes the attention-pooled vector and outputs logits for each class
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
        # Get hidden states from DistilBERT
        outputs = self.distilbert(input_ids, attention_mask=attention_mask, head_mask=head_mask)
        hidden_states = outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

        # Compute attention weights for each token
        attn_weights = self.attention_layer(hidden_states)  # shape: (batch_size, seq_len, 1)
        attn_weights = torch.softmax(attn_weights, dim=1)  # normalize over sequence length

        # Compute the context vector as the weighted sum of hidden states
        context_vector = torch.sum(attn_weights * hidden_states, dim=1)  # shape: (batch_size, hidden_size)
        context_vector = self.dropout(context_vector)
        logits = self.classifier(context_vector)  # shape: (batch_size, num_labels)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits

In [12]:
# Load the pre-trained tokenizer and configuration, and modify the config for our task.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
config = DistilBertConfig.from_pretrained(model_name)
config.num_labels = 2  # binary classification

# Instantiate our custom model with attention
model = DistilBERTWithAttention.from_pretrained(model_name, config=config)

Some weights of DistilBERTWithAttention were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Read the data
data = read_jsonl_lines('./anli_data/train.jsonl')

# Read the labels
labels = read_labels('/content/anli_data/train-labels.lst')

# Instantiate the training dataset
train_dataset = aNLIDataset(data, labels, tokenizer)

# Set up training arguments (adjust as needed)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    max_steps=2500
    # You can also set max_steps if you want to limit training steps directly.
)

# Initialize the Trainer with our custom model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Optionally, add eval_dataset if available.
)

# Fine-tune the model with the new attention mechanism
trainer.train()

# Make sure to move the model to the appropriate device for inference:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohitgsridhar[0m ([33mmohitgsridhar-dell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.6639
20,0.729
30,0.7704
40,0.8528
50,0.6466
60,0.7557
70,0.8247
80,0.7399
90,0.7883
100,0.686


DistilBERTWithAttention(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [20]:
def predict(obs1, obs2, hyp):
    input_text = f"Observation 1: {obs1} Observation 2: {obs2} Hypothesis: {hyp}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = model(**inputs)
    # outputs is either logits or (loss, logits)
    if isinstance(outputs, tuple):
        logits = outputs[1]
    else:
        logits = outputs
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs.detach().cpu().numpy()

def evaluate_dev_set(dev_data, dev_labels):
    correct = 0

    for i, entry in enumerate(dev_data):
        # Extract observations and hypotheses
        obs1 = entry['obs1']
        obs2 = entry['obs2']
        hyp1 = entry['hyp1']
        hyp2 = entry['hyp2']

        # Get softmax probabilities for each hypothesis
        probs_hyp1 = predict(obs1, obs2, hyp1)  # shape: [1, 2]
        probs_hyp2 = predict(obs1, obs2, hyp2)  # shape: [1, 2]

        # Compare the probability of label '1' for each hypothesis
        # (index 1 in the softmax is often considered the "positive" label)
        score_hyp1 = probs_hyp1[0][1]
        score_hyp2 = probs_hyp2[0][1]

        # Predicted hypothesis is whichever has higher score for label=1
        predicted_label = 1 if score_hyp1 > score_hyp2 else 2

        # Convert the ground-truth label from '1'/'2' to an integer
        gold_label = int(dev_labels[i])

        # Check correctness
        if predicted_label == gold_label:
            correct += 1

    # Calculate accuracy
    accuracy = correct / len(dev_labels)
    return accuracy

# Test on Dev Data
dev_data = read_jsonl_lines('/content/anli_data/dev.jsonl')
dev_labels = read_labels('/content/anli_data/dev-labels.lst')

# Now call evaluate_dev_set on your dev data
dev_accuracy = evaluate_dev_set(dev_data, dev_labels)
print("Dev Accuracy:", dev_accuracy)

Dev Accuracy: 0.5019582245430809


In [21]:
import requests
obj = requests.get('http://api.conceptnet.io/c/en/example').json()
obj.keys()

dict_keys(['@context', '@id', 'edges', 'version', 'view'])

In [None]:
# --- Step 1: Define a dummy ConceptNet embedding function ---
def get_conceptnet_embedding(text, concept_dim=50):
    """
    Query ConceptNet to extract an embedding
    """

    return ...

# --- Step 2: Define a custom model that injects ConceptNet knowledge ---
class DistilBERTWithConceptNet(DistilBertPreTrainedModel):
    def __init__(self, config, concept_dim=50):
        super(DistilBERTWithConceptNet, self).__init__(config)
        self.num_labels = config.num_labels
        self.concept_dim = concept_dim
        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        # Basic attention mechanism on DistilBERT output
        self.attention_layer = nn.Linear(config.hidden_size, 1)
        # Project the ConceptNet embeddings to the same dimension as hidden_size
        self.concept_proj = nn.Linear(concept_dim, config.hidden_size)
        # Final classifier: fuse DistilBERT output and ConceptNet features
        self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, concept_embedding=None, labels=None):
        # Obtain DistilBERT representations
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Apply a basic attention over token representations
        attn_weights = self.attention_layer(hidden_states)  # (batch_size, seq_len, 1)
        attn_weights = torch.softmax(attn_weights, dim=1)
        context_vector = torch.sum(attn_weights * hidden_states, dim=1)  # (batch_size, hidden_size)
        context_vector = self.dropout(context_vector)

        # Process concept embeddings if provided
        if concept_embedding is not None:
            concept_vector = self.concept_proj(concept_embedding)  # (batch_size, hidden_size)
            concept_vector = torch.tanh(concept_vector)
        else:
            # Use a zero vector if no concept embedding is provided
            concept_vector = torch.zeros(context_vector.size()).to(context_vector.device)

        # Concatenate the text-based and commonsense representations
        combined = torch.cat([context_vector, concept_vector], dim=1)  # (batch_size, hidden_size*2)
        combined = self.dropout(combined)
        logits = self.classifier(combined)  # (batch_size, num_labels)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits