<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/LLM_GPT2_QA_Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import transformers
print(transformers.__version__)

4.47.1


Finetuning:

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import os

import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tie lm_head weights if missing
if model.lm_head.weight.shape[0] != model.transformer.wte.weight.shape[0]:
    model.tie_weights()

# Add padding token to tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Example QA dataset
qa_data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Convert dataset to Huggingface Dataset object
dataset = Dataset.from_list(qa_data)
tokenized_dataset = dataset.map(preprocess_data, remove_columns=["question", "answer"])

# Define collation function
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Split dataset into training and validation sets
# train_size = int(0.8 * len(tokenized_dataset))
# train_dataset = tokenized_dataset.select(range(train_size))
# val_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Data loaders
batch_size = 2
train_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define optimizer, criterion, and device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, save_dir="./gpt2_qa_finetuned"):
    best_val_loss = float("inf")
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            print('input_ids', input_ids.shape)
            print('labels', labels.shape)
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Logits shape: (batch_size, seq_length, vocab_size)

            # Shift logits and labels for causal language modeling
            shift_logits = logits[..., :-1, :].contiguous()  # Remove the last token
            shift_labels = labels[..., 1:].contiguous()      # Remove the first token

            # Reshape for CrossEntropyLoss
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                # Shift logits and labels for causal language modeling
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()

                # Reshape for CrossEntropyLoss
                loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        # Save the model if validation loss improves
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            print(f"Saved best model with Validation Loss: {avg_val_loss:.4f}")

# Train the model
train_model(model, train_loader, val_loader, optimizer, criterion, device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch 1/10


Training:   0%|          | 0/1 [00:00<?, ?it/s]

input_ids torch.Size([2, 60])
labels torch.Size([2, 60])


Training: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]


Training Loss: 3.3350


Validating: 100%|██████████| 1/1 [00:00<00:00, 48.81it/s]


Validation Loss: 2.1785
Saved best model with Validation Loss: 2.1785
Epoch 2/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.07it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 2.3493


Validating: 100%|██████████| 1/1 [00:00<00:00, 53.59it/s]

Validation Loss: 1.7007





Saved best model with Validation Loss: 1.7007
Epoch 3/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.10it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.9128


Validating: 100%|██████████| 1/1 [00:00<00:00, 45.32it/s]

Validation Loss: 1.3272





Saved best model with Validation Loss: 1.3272
Epoch 4/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.50it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.5536


Validating: 100%|██████████| 1/1 [00:00<00:00, 48.31it/s]

Validation Loss: 1.0289





Saved best model with Validation Loss: 1.0289
Epoch 5/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.53it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 1.1126


Validating: 100%|██████████| 1/1 [00:00<00:00, 44.11it/s]

Validation Loss: 0.8119





Saved best model with Validation Loss: 0.8119
Epoch 6/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.16it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8002


Validating: 100%|██████████| 1/1 [00:00<00:00, 35.82it/s]

Validation Loss: 0.6255





Saved best model with Validation Loss: 0.6255
Epoch 7/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.66it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.8276


Validating: 100%|██████████| 1/1 [00:00<00:00, 50.92it/s]

Validation Loss: 0.4551





Saved best model with Validation Loss: 0.4551
Epoch 8/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.68it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.4953


Validating: 100%|██████████| 1/1 [00:00<00:00, 50.07it/s]

Validation Loss: 0.3170





Saved best model with Validation Loss: 0.3170
Epoch 9/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.78it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.5933


Validating: 100%|██████████| 1/1 [00:00<00:00, 49.32it/s]

Validation Loss: 0.2148





Saved best model with Validation Loss: 0.2148
Epoch 10/10


Training: 100%|██████████| 1/1 [00:00<00:00,  9.48it/s]


input_ids torch.Size([2, 60])
labels torch.Size([2, 60])
Training Loss: 0.3313


Validating: 100%|██████████| 1/1 [00:00<00:00, 48.59it/s]

Validation Loss: 0.1480





Saved best model with Validation Loss: 0.1480


#Inference: Greedy Search

In [None]:
import torch
import torch.nn.functional as F

def generate_answer(question, model, tokenizer, max_length=50, device="cuda"):
    """
    Generate an answer using greedy search.
    Args:
        question (str): The input question.
        model (GPT2LMHeadModel): The GPT-2 model.
        tokenizer (GPT2Tokenizer): The tokenizer.
        max_length (int): Maximum length of the generated text.
        device (str): Device to run the model on ('cuda' or 'cpu').

    Returns:
        str: The generated answer.
    """
    # Move the model to the correct device
    model.to(device)
    model.eval()  # Set model to evaluation mode

    # Prepare input
    input_text = f"Question: {question}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Initialize the generated token IDs with the input
    generated_ids = input_ids

    for _ in range(max_length):
        # Forward pass to get logits for the next token
        outputs = model(input_ids=generated_ids)
        logits = outputs.logits

        # Select the token with the highest probability (greedy search)
        next_token_id = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)

        # Append the predicted token to the sequence
        generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

        # Stop if the end-of-sequence (EOS) token is generated
        if next_token_id.item() == tokenizer.eos_token_id:
            break

    # Decode the generated token IDs to a text string
    answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return answer


# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Test the generate_answer function
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, device=device)
print(answer)

Question: What is the capital of France?
Answer: The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The


#Inference: Beam Search

In [None]:
import torch
import torch.nn.functional as F

def generate_answer(question, model, tokenizer, max_length=50, num_beams=5, device="cuda"):
    """
    Generate an answer using beam search for the fine-tuned GPT-2 model.

    Args:
        question (str): The input question.
        model (GPT2LMHeadModel): The fine-tuned model.
        tokenizer (GPT2Tokenizer): The tokenizer.
        max_length (int): Maximum length of the generated text.
        num_beams (int): Number of beams for beam search.
        device (str): The device to run the model on ('cuda' or 'cpu').

    Returns:
        str: The generated answer.
    """
    model.to(device)
    model.eval()

    # Prepare input text
    input_text = f"Question: {question}\nAnswer:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)

    # Extract input_ids and attention_mask
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Initialize beams
    beams = [(input_ids, attention_mask, 0.0)]  # List of (sequence, attention_mask, score)
    completed_sequences = []

    # Beam search loop
    for _ in range(max_length):
        new_beams = []

        for seq, mask, score in beams:
            # Stop expanding if sequence ends with EOS token
            if seq[0, -1] == tokenizer.eos_token_id:
                completed_sequences.append((seq, score))  # Append sequence and score
                continue

            # Forward pass
            with torch.no_grad():
                outputs = model(seq, attention_mask=mask)
                logits = outputs.logits[:, -1, :]  # Logits of the last token
                probs = F.log_softmax(logits, dim=-1)  # Convert to log probabilities

            # Get top-k tokens and their log probabilities
            top_k_probs, top_k_tokens = torch.topk(probs, num_beams, dim=-1)

            # Expand each beam
            for prob, token in zip(top_k_probs[0], top_k_tokens[0]):
                new_seq = torch.cat([seq, token.unsqueeze(0).unsqueeze(0)], dim=1)  # Append token
                new_mask = torch.cat([mask, torch.ones((1, 1), device=device)], dim=1)  # Update attention mask
                new_score = score + prob.item()  # Accumulate log probability as float
                new_beams.append((new_seq, new_mask, new_score))

        # Sort new beams by score and keep top-k
        new_beams = sorted(new_beams, key=lambda x: x[2], reverse=True)[:num_beams]
        beams = new_beams

        # Break if all beams end with EOS token
        if all(seq[0, -1] == tokenizer.eos_token_id for seq, _, _ in beams):
            break

    # Add remaining beams to completed sequences
    completed_sequences.extend([(seq, float(score)) for seq, _, score in beams])

    # Select the sequence with the highest score
    best_sequence, _ = max(completed_sequences, key=lambda x: x[1])

    # Decode the tokens to text
    answer = tokenizer.decode(best_sequence[0], skip_special_tokens=True)
    return answer


# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2_qa_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_qa_finetuned")

# Test the generate_answer function
question = "What is the capital of France?"
device = "cuda" if torch.cuda.is_available() else "cpu"
answer = generate_answer(question, model, tokenizer, max_length=50, num_beams=5, device=device)
print(answer)

Question: What is the capital of France?
Answer: The capital of France is Paris.
Answer: The capital of France is Paris.
Answer: The capital of France is Paris.
Answer: The capital of France is Paris.
Answer: The capital of France is Paris.
Answer:


Debugging:

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import os

import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tie lm_head weights if missing
if model.lm_head.weight.shape[0] != model.transformer.wte.weight.shape[0]:
    model.tie_weights()

# Add padding token to tokenizer and model
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Example QA dataset
qa_data = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

# Convert dataset to Huggingface Dataset object
dataset = Dataset.from_list(qa_data)
tokenized_dataset = dataset.map(preprocess_data, remove_columns=["question", "answer"])

# Define collation function
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Split dataset into training and validation sets
# train_size = int(0.8 * len(tokenized_dataset))
# train_dataset = tokenized_dataset.select(range(train_size))
# val_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Data loaders
batch_size = 1
train_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define optimizer, criterion, and device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=1, save_dir="./gpt2_qa_finetuned"):
    best_val_loss = float("inf")
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training phase
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            print('input_ids:', input_ids.shape,'\ntoken ids:',input_ids, '\ndecoded text:', tokenizer.decode(input_ids[0]))
            print('labels:', labels.shape,'\ntoken ids:',labels, '\ndecoded text:', tokenizer.decode(labels[0]))
            # print(tokenizer.decode(input_ids[0]))
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Logits shape: (batch_size, seq_length, vocab_size)

            # Shift logits and labels for causal language modeling
            shift_logits = logits[..., :-1, :].contiguous()  # Remove the last token
            shift_labels = labels[..., 1:].contiguous()      # Remove the first token
            print('shift_labels:', shift_labels.shape,'\ntoken ids:',shift_labels, '\ndecoded text:', tokenizer.decode(shift_labels[0]))

            # Reshape for CrossEntropyLoss
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            break

        avg_train_loss = train_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss:.4f}")


# Train the model
train_model(model, train_loader, val_loader, optimizer, criterion, device)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch 1/1


Training:   0%|          | 0/2 [00:00<?, ?it/s]

input_ids: torch.Size([1, 60]) 
token ids: tensor([[24361,    25,  1867,   318,   262,  3139,   286,  4881,    30,   198,
         33706,    25,   383,  3139,   286,  4881,   318,  6342,    13, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]],
       device='cuda:0') 
decoded text: Question: What is the capital of France?
Answer: The capital of France is Paris.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext


