In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import nltk

nltk.download('punkt')

import logging
logging.disable(logging.WARNING)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_coqa_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['data']

# Custom dataset class
class CoQADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['story']
        question = item['questions'][0]['input_text']
        answer = item['answers'][0]['input_text']

        # T5 format: "question: <question> context: <context>"
        input_text = f"question: {question} context: {context}"
        target_text = answer

        # Tokenize the input and target texts
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels['input_ids'].flatten()  # Use labels for T5
        }

In [3]:
data = load_coqa_data('/kaggle/input/qna-dataset/coqa-train-v1.0.json')
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

In [4]:
# Initialize tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

train_dataset = CoQADataset(train_data, tokenizer)
val_dataset = CoQADataset(val_data, tokenizer)
test_dataset = CoQADataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]



In [5]:
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass with T5
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Get the loss from the output
        loss = outputs.loss

        # Average loss across GPUs (if using multiple)
        loss = loss.mean()

        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Update progress bar with current loss
        progress_bar.set_postfix({'loss': loss.item()})

    # Return the average loss over all batches
    return total_loss / len(train_loader)

In [6]:
# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = T5ForConditionalGeneration.from_pretrained('t5-base')


# Set device and move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [7]:
# Use DataParallel to parallelize across GPUs
import torch.nn as nn

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)

Using 2 GPUs!


In [8]:
def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass with T5
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Average loss across GPUs (if using multiple)
            loss = loss.mean()

            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)

In [9]:
def test2(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing")
    
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['labels'].to(device)  # Ensure answers are on the correct device
            
            # Generate outputs from the model
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

            # Decode predictions and append to the list
            for i in range(outputs.shape[0]):
                prediction = tokenizer.decode(outputs[i], skip_special_tokens=True)
                all_predictions.append(prediction)
                # Decoding the ground truth labels (answers) as well
                ground_truth = tokenizer.decode(answers[i], skip_special_tokens=True)
                all_answers.append(ground_truth)

    # Calculate BLEU score using the predictions and the ground truths
    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score


In [10]:
# Training loop
num_epochs = 5
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader, optimizer, device)
    val_loss = validate(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 't5_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)

Epoch 1/5


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Training: 100%|██████████| 630/630 [14:52<00:00,  1.42s/it, loss=0.034] 
Validating: 100%|██████████| 135/135 [01:18<00:00,  1.72it/s, loss=0.0303]


Train Loss: 0.4777, Validation Loss: 0.0359
Model saved!
**************************************************
Epoch 2/5


Training: 100%|██████████| 630/630 [14:44<00:00,  1.40s/it, loss=0.012]  
Validating: 100%|██████████| 135/135 [01:18<00:00,  1.72it/s, loss=0.0105] 


Train Loss: 0.0235, Validation Loss: 0.0113
Model saved!
**************************************************
Epoch 3/5


Training: 100%|██████████| 630/630 [14:41<00:00,  1.40s/it, loss=0.0184] 
Validating: 100%|██████████| 135/135 [01:18<00:00,  1.72it/s, loss=0.00881]


Train Loss: 0.0115, Validation Loss: 0.0096
Model saved!
**************************************************
Epoch 4/5


Training: 100%|██████████| 630/630 [14:42<00:00,  1.40s/it, loss=0.0077] 
Validating: 100%|██████████| 135/135 [01:18<00:00,  1.73it/s, loss=0.0081] 


Train Loss: 0.0092, Validation Loss: 0.0087
Model saved!
**************************************************
Epoch 5/5


Training: 100%|██████████| 630/630 [14:39<00:00,  1.40s/it, loss=0.00385]
Validating: 100%|██████████| 135/135 [01:18<00:00,  1.72it/s, loss=0.00792]


Train Loss: 0.0079, Validation Loss: 0.0082
Model saved!
**************************************************


In [11]:
# Calculate BLEU score
def calculate_bleu(predictions, references):
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))
    return sum(bleu_scores) / len(bleu_scores)


In [12]:
# Test function
def test(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)  # Ensure answers are on the correct device

            # Create decoder_input_ids from input_ids
            # This is typically done by prepending a start token. Adjust as needed.
            decoder_input_ids = torch.full((input_ids.shape[0], 1), tokenizer.pad_token_id, device=device)
            decoder_input_ids = torch.cat((decoder_input_ids, input_ids[:, :-1]), dim=1)  # Shift input_ids

            # Pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)
            
            # Get the logits from the model output
            logits = outputs.logits
            
            # Get the predicted token ids
            predicted_ids = torch.argmax(logits, dim=-1)

            for i in range(predicted_ids.shape[0]):
                prediction = tokenizer.decode(predicted_ids[i], skip_special_tokens=True)
                all_predictions.append(prediction)
                all_answers.append(tokenizer.decode(labels[i], skip_special_tokens=True))  # Decode your answers

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score


In [13]:
# Test the model
bleu_score = test(model, test_loader, tokenizer, device)
print(f"BLEU Score: {bleu_score:.4f}")


Testing: 100%|██████████| 135/135 [03:31<00:00,  1.57s/it]
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU Score: 0.1875
