In [10]:
"""
DS 5110 Assignment 4: Part 2: Fine-tuning Script for the GPT model series

This script loads a pretrained GPT2 model checkpoint and fine-tunes it on a specific dataset
using the HuggingFace Transformers library's Trainer API.
"""

'''
DO NOT MODIFY THIS CELL.
This section contains essential imports for fine-tuning GPT models.
'''
import os
import torch
import argparse
import numpy as np
from collections import OrderedDict
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed
)
# Add safetensors import
try:
    from safetensors.torch import save_file
    SAFETENSORS_AVAILABLE = True
except ImportError:
    SAFETENSORS_AVAILABLE = False
    print("SafeTensors not available. Install with: pip install safetensors")

In [11]:
'''
DO NOT MODIFY THIS CELL.
This cell loads the model and tokenizer.
'''
def load_model_and_tokenizer(model_name):
    """
    Load a pretrained GPT-2 language model together with its tokenizer.

    Both objects are created with `from_pretrained(model_name)`; no local
    checkpoint handling is performed here.

    Args:
        model_name: Identifier handed to `from_pretrained` for both the
            tokenizer and the model (e.g. 'gpt2')

    Returns:
        tuple: (model, tokenizer) - The loaded model and tokenizer
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Padding is requested during tokenization later on, so a pad token must
    # exist; fall back to the end-of-sequence token when none is defined.
    pad_token_missing = gpt2_tokenizer.pad_token is None
    if pad_token_missing:
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    print("Load a pretrained GPT2 model")
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

    return gpt2_model, gpt2_tokenizer

In [12]:
'''
This cell prepares the dataset.
'''
def prepare_dataset(dataset_name, tokenizer, dataset_config=None, max_examples=None):
    """
    Load a Hugging Face dataset and tokenize it for causal-LM fine-tuning.

    The dataset must expose 'instruction' and 'output' columns. Each example
    is rendered as "Input: <instruction>\\nOutput: <output>" and tokenized to
    a fixed length of 256 tokens (truncated / padded).

    Args:
        dataset_name: Name of the Hugging Face dataset to use
        tokenizer: The tokenizer to use for preprocessing
        dataset_config: Specific configuration of the dataset. If loading with
            this configuration fails, the dataset is loaded without it.
        max_examples: Optional cap applied to BOTH the train and the eval
            split (Task 2.1). The cap is clamped to each split's size, so a
            small eval split never raises. None (default) keeps every example.

    Returns:
        tuple: (train_dataset, eval_dataset) - Tokenized datasets for training and evaluation

    Raises:
        ValueError: If `max_examples` is smaller than 1, or if the dataset
            does not provide the 'instruction' and 'output' columns.
    """
    if max_examples is not None and max_examples < 1:
        raise ValueError(f"max_examples must be >= 1 or None, got {max_examples}")

    if dataset_config is None:
        dataset = load_dataset(dataset_name)
        print(f"Dataset loaded without config: {dataset}")
    else:
        try:
            dataset = load_dataset(dataset_name, dataset_config)
            print(f"Dataset loaded with config: {dataset}")
        except Exception as err:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and report why the configured load failed before falling back.
            print(f"Loading with config {dataset_config!r} failed ({err}); retrying without config")
            dataset = load_dataset(dataset_name)
            print(f"Dataset loaded without config: {dataset}")

    # Prepare train and validation splits
    if 'validation' in dataset:
        train_dataset = dataset['train']
        eval_dataset = dataset['validation']
    else:
        # Create a validation split if none exists
        train_eval = dataset['train'].train_test_split(test_size=0.1)
        train_dataset = train_eval['train']
        eval_dataset = train_eval['test']

    # Task 2.1: optionally restrict both splits to the first N examples
    # (the assignment suggests 50 <= N <= 200).
    if max_examples is not None:
        train_dataset = train_dataset.select(range(min(max_examples, len(train_dataset))))
        eval_dataset = eval_dataset.select(range(min(max_examples, len(eval_dataset))))

    print(f"Training examples: {len(train_dataset)}")
    print(f"Evaluation examples: {len(eval_dataset)}")
    column_names = train_dataset.column_names
    print(f"Dataset columns: {column_names}")

    # Fail early with a readable message instead of an UnboundLocalError
    # raised from inside the tokenization callback.
    required_columns = ('instruction', 'output')
    missing_columns = [col for col in required_columns if col not in column_names]
    if missing_columns:
        raise ValueError(
            f"Dataset '{dataset_name}' is missing required column(s) {missing_columns}; "
            f"expected 'instruction' and 'output', found {column_names}"
        )

    def tokenize_function(examples):
        """Render a batch of instruction/output pairs as prompts and tokenize them."""
        texts = [
            f"Input: {inp}\nOutput: {out}"
            for inp, out in zip(examples['instruction'], examples['output'])
        ]
        # No return_tensors here: the mapped dataset stores plain columns and
        # batches are turned into tensors later by the data collator.
        return tokenizer(
            texts,
            truncation=True,
            max_length=256,
            padding="max_length"
        )

    # Tokenize the datasets
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

    return tokenized_train, tokenized_eval

In [13]:
"""
DO NOT MODIFY THIS CELL.
This cell: 
1. Sets up the TrainingArguments with appropriate parameters
2. Initializes the Trainer with the model, datasets, and training arguments
3. Configures a data collator for language modeling
"""
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir, learning_rate, batch_size, max_steps):
    """
    Build a HuggingFace Trainer configured for causal-LM fine-tuning.

    Args:
        model: The model to fine-tune
        tokenizer: The tokenizer used for preprocessing (given to the data collator)
        train_dataset: The training dataset
        eval_dataset: The evaluation dataset
        output_dir: Directory handed to TrainingArguments as `output_dir`
        learning_rate: Learning rate for fine-tuning
        batch_size: Per-device batch size, used for both training and evaluation
        max_steps: Maximum number of training steps

    Returns:
        Trainer: Configured Trainer instance
    """
    # mlm=False: causal language modeling, not masked LM.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Where the run writes to and how it is labelled / reported.
    run_options = dict(
        output_dir=output_dir,
        run_name="my-finetune-run",
        report_to="none",
    )
    # Optimisation settings; the same batch size is used for train and eval.
    optimisation_options = dict(
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        fp16=torch.cuda.is_available(),  # half precision only when CUDA is present
    )
    # Step-based schedule for stopping, evaluating, saving and logging.
    schedule_options = dict(
        max_steps=max_steps,
        eval_strategy="steps",
        eval_steps=100,
        save_steps=100,
        logging_steps=10,
        load_best_model_at_end=True,
    )

    args = TrainingArguments(**run_options, **optimisation_options, **schedule_options)

    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collator,
    )

In [14]:
"""
DO NOT MODIFY THIS CELL.
This cell: 
1. Starts the fine-tuning process using the trainer
2. Saves the fine-tuned model and tokenizer
3. Handles any interruptions gracefully
"""
def run_fine_tuning(trainer, output_dir, tokenizer):
    """
    Train with the given trainer, then write model and tokenizer to disk.

    No exceptions are caught here: anything raised by training or saving
    propagates to the caller.

    Args:
        trainer: The configured Trainer instance
        output_dir: Directory to save the fine-tuned model
        tokenizer: The tokenizer to save alongside the model

    Returns:
        bool: True once training and both save calls have returned
    """
    trainer.train()

    # Persist the model first, then the tokenizer, into the same directory.
    for persist in (trainer.save_model, tokenizer.save_pretrained):
        persist(output_dir)

    return True

In [15]:
"""
This cell: 
1. Load the fine-tuned model and tokenizer
2. Create a text generation pipeline
3. Generate sample text to verify the model's performance
You could tune the configurations for model.generate() to see how differently the model performs.
"""
def test_fine_tuned_model(model_path, prompt="Once upon a time, "):
    """
    Generate sample text with the fine-tuned model.
    
    Args:
        model_path: Path to the fine-tuned model
        prompt: Text prompt to start generation
        
    Returns:
        str: Generated text
    """
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    
    # Use GPU if available; but most likely it'll use CPU as 
    # our EC2 instance is CPU-only 
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generate output
    outputs = model.generate(
        **inputs,
        max_length=100,  # Set the max output token length
        temperature=0.7, # Set how random the output is
        top_p=0.9,
        do_sample=True
    )
    
    # Decode and return generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

In [16]:
'''
This cell defines the configurations of the fine-tuning process.
'''
# Identifier passed to from_pretrained() by load_model_and_tokenizer().
model_name = 'gpt2'    # By default, load pretrained GPT2 from Hugging Face
#model_name = 'EleutherAI/pythia-70m'
#dataset_name = 'wikitext'
dataset_name = 'Hypersniper/philosophy_dialogue' # Hugging Face dataset id; prepare_dataset() reads its 'instruction' and 'output' columns
dataset_config = None  # Optional dataset configuration name for load_dataset(); None requests no specific configuration

"""
TODO: Task 2.2
"""
# Hint: You need to change output_dir to a different directory so that 
# Hint: when you start a new fine-tuning, it won't overwrite your existing
# Hint: fine-tuned model

output_dir = 'fine-tuned-model-3' # The output directory where the fine-tuned model is saved
""" 
End of Task 2.2
"""
max_steps = 100        # Max number of training steps
batch_size = 4         # Batch size is set to 4 to bound your DRAM usage to ~60%; increasing this would lead to OOM (out of memory)
learning_rate = 1e-5   # Learning rate handed to setup_trainer()
seed = 42              # Random seed for reproducibility

In [17]:
'''
Run the entire fine-tuning process.
'''

"""
TODO: Task 1
"""
# Hint: Set seed for reproducibility by calling set_seed()
# Hint: Load model and tokenizer by calling load_model_and_tokenizer()
# Hint: Prepare dataset
# Hint: Set up trainer by calling setup_trainer()
# Hint: Start the fine-tuning process by calling run_fine_tuning()

# Hint: You should add reasonable print statements to help track the program running

# TODO: Your code here...
# Take the seed from the configuration cell so there is a single source of
# truth; SEED is kept as an alias for any code that refers to it.
SEED = seed
set_seed(SEED)

print(f"Loading model and tokenizer for '{model_name}'...")
model, tokenizer = load_model_and_tokenizer(model_name)

# Prepare the dataset, honouring the dataset_config chosen in the configuration cell.
print(f"Preparing dataset '{dataset_name}'...")
tokenized_train, tokenized_eval = prepare_dataset(dataset_name, tokenizer, dataset_config=dataset_config)
print("Setting up trainer...")
trainer = setup_trainer(model, tokenizer, tokenized_train, tokenized_eval, output_dir, learning_rate, batch_size, max_steps)
print("Starting fine-tuning...")
run_fine_tuning(trainer, output_dir, tokenizer)
"""
End of Task 1
"""

# Sanity-check the saved model by generating from a philosophical prompt.
print("Testing fine-tuned model...")
generated_text = test_fine_tuned_model(output_dir, prompt='What is the meaning of life?')
print(f"Generated text is {generated_text}")
print(f"Fine-tuning complete! Model saved to {output_dir}")

Load a pretrained GPT2 model
Dataset loaded with config: DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 395
    })
})
Training examples: 355
Evaluation examples: 40
Dataset columns: ['instruction', 'output']


Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Setting up trainer...
Starting fine-tuning...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
100,2.476,2.334355


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Testing fine-tuned model...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text is What is the meaning of life?
Life is not a simple question. It is an exploration of our lives, our emotions, our relationships, our relationships with others, and our understanding of ourselves. It is a journey that we must all share, not just for ourselves, but for all of us.

But it is not the only life. It is also a journey that we must all strive for, for we must strive for our own happiness, for we must strive for our own fulfillment
Fine-tuning complete! Model saved to fine-tuned-model-3
