In [2]:
import sys
import subprocess

def install_packages():
    packages = ["transformers", "datasets", "peft", "torch", "accelerate", "trl"]
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

install_packages()
print("Required packages installed successfully")



Required packages installed successfully


In [None]:
# 2. Load Libraries and Set Up Environment
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def load_model():
    model_name = "google/flan-t5-small"
    token = "hf-token" 
    
    try:
        # Load tokenizer and model with the token
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", token=token)
        
        print(f"Model loaded: {model_name}")
        print(f"Approximate parameters: {model.num_parameters() / 1_000_000:.1f}M")
        
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

# Call the function to load the model and tokenizer
model, tokenizer = load_model()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded: google/flan-t5-small
Approximate parameters: 77.0M


In [None]:
from datasets import load_dataset

def load_dataset_sample():
    token = "hf-token"
    
    try:
        # Load a small subset of SQuAD with the token
        dataset = load_dataset("squad", split="train[:1000]", token=token)
        print(f"Loaded {len(dataset)} examples from SQuAD dataset")
        
        # Show an example
        print("\nExample data:")
        example = dataset[0]
        print(f"Context: {example['context'][:150]}...")
        print(f"Question: {example['question']}")
        print(f"Answer: {example['answers']['text'][0]}")
        
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise

# Call the function to load the dataset
dataset = load_dataset_sample()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 87599/87599 [00:00<00:00, 1172462.24 examples/s]
Generating validation split: 100%|██████████| 10570/10570 [00:00<00:00, 962794.39 examples/s]

Loaded 1000 examples from SQuAD dataset

Example data:
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front o...
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer: Saint Bernadette Soubirous





In [None]:
# 5. Prepare the Data for Fine-tuning
def preprocess_function(examples):
    inputs = []
    targets = []
    
    for context, question, answer in zip(examples["context"], examples["question"], examples["answers"]):
        # Format: "Answer the question based on the context: <context> Question: <question>"
        input_text = f"Answer the question based on the context: {context} Question: {question}"
        target_text = answer["text"][0] if len(answer["text"]) > 0 else "I don't know."
        
        inputs.append(input_text)
        targets.append(target_text)
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(f"Processed {len(tokenized_dataset)} examples")

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4793.70 examples/s]

Processed 1000 examples





In [None]:
# 6. Set Up PEFT (Parameter-Efficient Fine-Tuning)
def create_peft_model(model):
    # Set up LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=8,                  # Rank of the update matrices
        lora_alpha=32,        # Alpha parameter for LoRA scaling
        lora_dropout=0.1,     # Dropout probability for LoRA layers
        target_modules=["q", "v"],  # Modules to apply LoRA to
    )
    
    # Create PEFT model
    peft_model = get_peft_model(model, peft_config)
    print("PEFT model created")
    
    # Count trainable parameters to show memory efficiency
    trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in peft_model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params / total_params:.2%} of total)")
    
    return peft_model

peft_model = create_peft_model(model)

PEFT model created
Trainable parameters: 344,064 (0.45% of total)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch

def train_model(peft_model, tokenized_dataset):
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./finetuned-squad-model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="no",  
        fp16=torch.cuda.is_available(), 
    )
    
    # Create data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=peft_model,
        padding=True,
        label_pad_token_id=-100,
    )
    
    # Create trainer
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    
    # Train the model
    print("Starting training...")
    trainer.train()
    
    return trainer

# Call the function to train the model
trainer = train_model(peft_model, tokenized_dataset)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,48.3354
20,47.1141
30,44.8993
40,40.7143
50,42.299
60,40.3524
70,36.8806
80,38.7033
90,38.4345



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in google/flan-t5-small.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in google/flan-t5-small.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in google/flan-t5-small.


In [None]:
# 8. Save the Fine-tuned Model
def save_model(peft_model, tokenizer):
    # Save the model
    output_dir = "./finetuned-squad-model-final"
    peft_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

save_model(peft_model, tokenizer)

Model saved to ./finetuned-squad-model-final



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in google/flan-t5-small.


In [None]:
# 9. Test the Fine-tuned Model with Dataset Examples
def test_model_with_dataset(peft_model, dataset):
    def answer_question(context, question):
        input_text = f"Answer the question based on the context: {context} Question: {question}"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        
        # Generate response
        with torch.no_grad():
            outputs = peft_model.generate(
                input_ids=inputs.input_ids,
                max_new_tokens=50,
                do_sample=False
            )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

    # Sample a few examples from the dataset
    test_examples = dataset.select(range(5))
    
    # Test each example
    for i, example in enumerate(test_examples):
        context = example["context"]
        question = example["question"]
        expected_answer = example["answers"]["text"][0]
        
        print(f"\nExample {i+1}:")
        print(f"Context (truncated): {context[:100]}...")
        print(f"Question: {question}")
        print(f"Expected answer: {expected_answer}")
        
        # Get model's answer
        model_answer = answer_question(context, question)
        print(f"Model's answer: {model_answer}")

test_model_with_dataset(peft_model, dataset)


Example 1:
Context (truncated): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden...
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Expected answer: Saint Bernadette Soubirous
Model's answer: Saint-Bélemic

Example 2:
Context (truncated): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden...
Question: What is in front of the Notre Dame Main Building?
Expected answer: a copper statue of Christ
Model's answer: the Grotto

Example 3:
Context (truncated): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden...
Question: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Expected answer: the Main Building
Model's answer: St. Bernadette Soubirous

Example 4:
Context (truncated): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden...
Que

In [None]:
# 10. Interactive Question Answering
def interactive_qa(peft_model):
    def answer_question(context, question):
        input_text = f"Answer the question based on the context: {context} Question: {question}"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        
        # Generate response
        with torch.no_grad():
            outputs = peft_model.generate(
                input_ids=inputs.input_ids,
                max_new_tokens=50,
                do_sample=False
            )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

    # Sample context from our dataset
    sample_context = dataset[10]["context"]
    print(f"Context:\n{sample_context}\n")
    
    # Interactive question answering
    while True:
        user_question = input("\nAsk a question about the context (or type 'quit' to exit): ")
        if user_question.lower() == "quit":
            break
            
        answer = answer_question(sample_context, user_question)
        print(f"Answer: {answer}")

# Uncomment to run interactive questioning:
interactive_qa(peft_model)

Context:
The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.

Answer: Cathedral
Answer: a Buechner Prize for Preaching
Answer: a crucifix


KeyboardInterrupt: Interrupted by user

In [None]:

from datasets import load_metric

# 5. Define Metric Computation

def compute_metrics(predictions, labels):
    squad_metric = load_metric("squad")
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Format for SQuAD metric
    formatted_predictions = [{"id": str(i), "prediction_text": pred} for i, pred in enumerate(decoded_preds)]
    formatted_references = [{"id": str(i), "answers": {"text": [label], "answer_start": [0]}} for i, label in enumerate(decoded_labels)]
    
    # Compute metrics
    results = squad_metric.compute(predictions=formatted_predictions, references=formatted_references)
    
    # Calculate accuracy (exact match of decoded strings)
    accuracy = sum(pred.strip().lower() == label.strip().lower() for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)
    
    return {
        "exact_match": results["exact_match"],
        "f1": results["f1"],
        "accuracy": accuracy
    }


ImportError: cannot import name 'load_metric' from 'datasets' (e:\GPT\venv\Lib\site-packages\datasets\__init__.py)

In [23]:
%pip install --upgrade evaluate

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import torch
import evaluate  # Use evaluate instead of datasets.load_metric
import numpy as np

# Use existing variables: peft_model, tokenizer, dataset, device
print(f"Using device: {device}")

# 2. Create a Small Test Set from Existing Dataset

def create_test_set(dataset, num_examples=50):
    # Use a subset of the existing dataset as a test set
    test_dataset = dataset.select(range(num_examples))
    print(f"Created test set with {len(test_dataset)} examples")
    return test_dataset

test_dataset = create_test_set(dataset)

Using device: cpu
Created test set with 50 examples


In [None]:

# 3. Preprocess Test Data (Reuse Existing Preprocess Function)

def preprocess_function(examples):
    inputs = []
    targets = []
    
    for context, question, answer in zip(examples["context"], examples["question"], examples["answers"]):
        input_text = f"Answer the question based on the context: {context} Question: {question}"
        target_text = answer["text"][0] if len(answer["text"]) > 0 else "I don't know."
        inputs.append(input_text)
        targets.append(target_text)
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=True)
    labels = tokenizer(targets, max_length=64, truncation=True, padding=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
print(f"Processed {len(tokenized_test_dataset)} test examples")

Map: 100%|██████████| 50/50 [00:00<00:00, 1541.76 examples/s]

Processed 50 test examples





In [None]:

# 4. Define Manual Metric Computation

def compute_metrics(predictions, labels):
    # wl: Manual computation due to evaluate.load("squad") errors
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Manual SQuAD metrics
    def compute_f1(pred, ref):
        pred_tokens = pred.lower().split()
        ref_tokens = ref.lower().split()
        common = len(set(pred_tokens) & set(ref_tokens))
        if common == 0:
            return 0.0
        precision = common / len(pred_tokens) if pred_tokens else 0.0
        recall = common / len(ref_tokens) if ref_tokens else 0.0
        return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    # Exact Match: Percentage of exact string matches
    exact_match = sum(pred.strip().lower() == ref.strip().lower() for pred, ref in zip(decoded_preds, decoded_labels)) / len(decoded_preds) * 100
    
    # F1: Average token overlap
    f1 = sum(compute_f1(pred, ref) for pred, ref in zip(decoded_preds, decoded_labels)) / len(decoded_preds) * 100
    
    # Accuracy: Same as EM (exact string match)
    accuracy = exact_match
    
    return {
        "exact_match": exact_match,
        "f1": f1,
        "accuracy": accuracy
    }

In [None]:

# 5. Evaluate Model on Test Set

def evaluate_model(peft_model, tokenized_test_dataset):
    def generate_predictions(dataset):
        predictions = []
        labels = []
        for example in dataset:
            input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = peft_model.generate(
                    input_ids=input_ids,
                    max_new_tokens=50,
                    do_sample=False
                )
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
            label = tokenizer.decode(example["labels"], skip_special_tokens=True)
            predictions.append(pred)
            labels.append(label)
        return predictions, labels
    
    print("Generating predictions...")
    predictions, labels = generate_predictions(tokenized_test_dataset)
    
    # Tokenize for metric computation
    pred_ids = tokenizer(predictions, max_length=64, truncation=True, padding=True)["input_ids"]
    label_ids = tokenizer(labels, max_length=64, truncation=True, padding=True)["input_ids"]
    
    metrics = compute_metrics(pred_ids, label_ids)
    
    print("\nPerformance Metrics on Test Set:")
    print(f"Exact Match (EM): {metrics['exact_match']:.2f}%")
    print(f"F1 Score: {metrics['f1']:.2f}%")
    print(f"Accuracy: {metrics['accuracy']:.2f}%")
    
    return metrics

metrics = evaluate_model(peft_model, tokenized_test_dataset)

Generating predictions...

Performance Metrics on Test Set:
Exact Match (EM): 54.00%
F1 Score: 67.53%
Accuracy: 54.00%


In [None]:

# 6. Re-evaluate Original Test Examples

def evaluate_original_examples(peft_model):
    test_cases = [
        {
            "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. The Grotto, a Marian place of prayer and reflection, is a replica of the grotto at Lourdes.",
            "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?",
            "expected": "Saint Bernadette Soubirous"
        },
        {
            "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
            "question": "What is in front of the Notre Dame Main Building?",
            "expected": "a copper statue of Christ"
        },
        {
            "context": "Architecturally, the school has a Catholic character. The Basilica of the Sacred Heart is beside the Main Building.",
            "question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?",
            "expected": "the Main Building"
        },
        {
            "context": "Architecturally, the school has a Catholic character. The Grotto, a Marian place of prayer and reflection, is a replica of the grotto at Lourdes.",
            "question": "What is the Grotto at Notre Dame?",
            "expected": "a Marian place of prayer and reflection"
        },
        {
            "context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
            "question": "What sits on top of the Main Building at Notre Dame?",
            "expected": "a golden statue of the Virgin Mary"
        }
    ]
    
    def answer_question(context, question):
        input_text = f"Answer the question based on the context: {context} Question: {question}"
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = peft_model.generate(
                input_ids=inputs.input_ids,
                max_new_tokens=50,
                do_sample=False
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    correct = 0
    print("\nEvaluating Original Test Examples:")
    for i, test in enumerate(test_cases):
        model_answer = answer_question(test["context"], test["question"])
        print(f"\nExample {i+1}:")
        print(f"Question: {test['question']}")
        print(f"Expected: {test['expected']}")
        print(f"Model: {model_answer}")
        if model_answer.strip().lower() == test["expected"].strip().lower():
            correct += 1
    accuracy = correct / len(test_cases) * 100
    print(f"\nOriginal Test Example Accuracy: {accuracy:.2f}%")

evaluate_original_examples(peft_model)


Evaluating Original Test Examples:

Example 1:
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Expected: Saint Bernadette Soubirous
Model: Mary

Example 2:
Question: What is in front of the Notre Dame Main Building?
Expected: a copper statue of Christ
Model: the main building

Example 3:
Question: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Expected: the Main Building
Model: Main Building

Example 4:
Question: What is the Grotto at Notre Dame?
Expected: a Marian place of prayer and reflection
Model: a replica of the grotto

Example 5:
Question: What sits on top of the Main Building at Notre Dame?
Expected: a golden statue of the Virgin Mary
Model: gold dome

Original Test Example Accuracy: 0.00%
