# **NLP Assignment 5**

## PRN: *22070126062*
## Name: *Manan Tandel*
## Class: *TY AIML A3*  

**Github Link: [GITHUB REPO LINK](https://github.com/manan3044/NLP_Assignment_5)**

In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import nltk

nltk.download('punkt')

import logging
logging.disable(logging.WARNING)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def load_coqa_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['data']

class CoQADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['story']
        question = item['questions'][0]['input_text']
        answer = item['answers'][0]['input_text']

        # Tokenize the input
        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Find the start and end positions of the answer in the tokenized input
        input_ids = inputs['input_ids'][0]
        answer_tokens = self.tokenizer.encode(answer, add_special_tokens=False)
        start_position = None
        end_position = None

        for i in range(len(input_ids) - len(answer_tokens) + 1):
            if input_ids[i:i+len(answer_tokens)].tolist() == answer_tokens:
                start_position = i
                end_position = i + len(answer_tokens) - 1
                break

        # If the answer is not found, use the CLS token position as a default
        if start_position is None:
            start_position = 0
            end_position = 0

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position),
            'answer': answer
        }

In [5]:
data = load_coqa_data('/kaggle/input/qna-dataset/coqa-train-v1.0.json')
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

In [6]:
# Initialize tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_dataset = CoQADataset(train_data, tokenizer)
val_dataset = CoQADataset(val_data, tokenizer)
test_dataset = CoQADataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass with DistilBERT
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions, end_positions=end_positions)

        # Get the loss from the output
        loss = outputs.loss

        # Average loss across GPUs (if using multiple)
        loss = loss.mean()

        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Update progress bar with current loss
        progress_bar.set_postfix({'loss': loss.item()})

    # Return the average loss over all batches
    return total_loss / len(train_loader)


In [8]:
# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')


# Set device and move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



In [9]:
def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass with DistilBERT
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)

In [10]:
# Test function
def test(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['answer']

            outputs = model(input_ids, attention_mask=attention_mask)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits

            for i in range(input_ids.shape[0]):
                start_index = torch.argmax(start_scores[i])
                end_index = torch.argmax(end_scores[i])
                prediction = tokenizer.decode(input_ids[i][start_index:end_index+1])
                all_predictions.append(prediction)
                all_answers.append(answers[i])

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score

In [11]:
# Training loop
num_epochs = 5
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader, optimizer, device)
    val_loss = validate(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 'distilbert_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)

Epoch 1/5


Training: 100%|██████████| 630/630 [04:48<00:00,  2.18it/s, loss=3.82]
Validating: 100%|██████████| 135/135 [00:27<00:00,  4.88it/s, loss=2.27]


Train Loss: 2.8866, Validation Loss: 2.1153
Model saved!
**************************************************
Epoch 2/5


Training: 100%|██████████| 630/630 [04:52<00:00,  2.16it/s, loss=0.981]
Validating: 100%|██████████| 135/135 [00:27<00:00,  4.87it/s, loss=2.02]


Train Loss: 1.4879, Validation Loss: 2.0351
Model saved!
**************************************************
Epoch 3/5


Training: 100%|██████████| 630/630 [04:52<00:00,  2.16it/s, loss=1.13]  
Validating: 100%|██████████| 135/135 [00:27<00:00,  4.87it/s, loss=2.6]  


Train Loss: 0.7604, Validation Loss: 2.4218
Validation Loss Increased. Model Not Saved.
**************************************************
Epoch 4/5


Training: 100%|██████████| 630/630 [04:52<00:00,  2.16it/s, loss=0.187] 
Validating: 100%|██████████| 135/135 [00:27<00:00,  4.86it/s, loss=2.65] 


Train Loss: 0.4304, Validation Loss: 2.9000
Validation Loss Increased. Model Not Saved.
**************************************************
Epoch 5/5


Training: 100%|██████████| 630/630 [04:52<00:00,  2.16it/s, loss=0.273] 
Validating: 100%|██████████| 135/135 [00:27<00:00,  4.88it/s, loss=3.63] 

Train Loss: 0.2623, Validation Loss: 3.3733
Validation Loss Increased. Model Not Saved.
**************************************************





In [12]:
# Calculate BLEU score
def calculate_bleu(predictions, references):
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))
    return sum(bleu_scores) / len(bleu_scores)


In [13]:
# Test the model
bleu_score = test(model, test_loader, tokenizer, device)
print(f"BLEU Score: {bleu_score:.4f}")


Testing: 100%|██████████| 135/135 [00:28<00:00,  4.80it/s]

BLEU Score: 0.1330



Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
