In [None]:
!pip install -q -U torch==2.5.1+cu124 transformers==4.48.3 peft==0.14.0 bitsandbytes==0.45.2 datasets==3.3.0 trl==0.15.0 # python 3.11.11

In [None]:
# - transformers: Library for loading and fine-tuning transformer models
# - bitsandbytes: Enables low-memory 4-bit and 8-bit quantization for efficient training
# - peft: Parameter-efficient fine-tuning (PEFT) methods such as LoRA (Low-Rank Adaptation)
# - datasets: Provides access to Hugging Face datasets
# - accelerate: Optimizes model training and enables distributed training
# - trl: Transformer Reinforcement Learning library; provides the SFTTrainer used here for supervised fine-tuning

# Import essential libraries
import torch
from datasets import load_dataset
import re

# Importing transformers components
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
    logging
)

# Importing PEFT (Parameter-Efficient Fine-Tuning) components
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)

# Importing TRL (Transformers Reinforcement Learning) Trainer
from trl import SFTTrainer

import os

In [None]:
# Log in to the Hugging Face Hub so that gated/private models and datasets can be
# downloaded and the fine-tuned model can be pushed at the end of the notebook.
# `notebook_login()` is interactive: it prompts for an access token in the notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Hub identifier of the checkpoint to fine-tune:
# Mistral-7B-Instruct v0.2, an instruction-tuned 7B-parameter model.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"

# Access token for private or gated repositories, taken from the HF_TOKEN
# environment variable. The value is None when the variable is not set.
auth_token = os.environ.get("HF_TOKEN")

In [None]:
# Load the tokenizer that belongs to the base model.
# - `token` replaces the deprecated `use_auth_token` argument.
# - padding_side="right" pads after the real tokens.
# - add_eos_token=True asks the tokenizer to append the EOS token to every encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    token=auth_token,
    padding_side="right",
    add_eos_token=True,
)

# Reuse the EOS token as the padding token.
# NOTE(review): with pad == EOS, a collator that masks padding positions in the labels
# will presumably mask the genuine EOS token as well — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Configure 4-bit quantization using BitsAndBytes (bnb)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # Normalized Float 4 (NF4) quantization
    bnb_4bit_use_double_quant=False,
    # Compute in bfloat16 so the quantized layers use the same dtype as the rest of
    # the model (torch_dtype below) and as the bf16 training run configured later.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the pre-trained causal language model with the quantization settings.
# - `token` replaces the deprecated `use_auth_token` argument.
# - device_map="auto" lets the library decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    token=auth_token,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


In [None]:
# Dataset repository and configuration used for fine-tuning
dataset_name = "wikitext"
config = "wikitext-103-raw-v1"

# Both splits are carved out of the "train" split:
# the first 32,000 rows for training, the following 8,000 rows for evaluation.
train_dataset, eval_dataset = (
    load_dataset(dataset_name, config, split=split_spec)
    for split_spec in ("train[:32000]", "train[32000:40000]")
)

In [None]:
# Function to format the dataset samples for training
def generate_prompt(sample):
    """Build the training record for one dataset sample.

    Args:
        sample: Mapping that contains a ``"text"`` entry.

    Returns:
        A new dict with the single key ``"text"`` holding the sample's text.
    """
    text = sample["text"]
    return dict(text=text)

# Run the formatting function over every row of the training and evaluation splits
train_dataset, eval_dataset = (
    train_dataset.map(generate_prompt),
    eval_dataset.map(generate_prompt),
)

In [None]:
# Enable gradient checkpointing to reduce memory usage during training.
# This trades extra computation for lower memory use, which helps with large models.
model.gradient_checkpointing_enable()

# Prepare the quantized model for k-bit (low-precision) training.
# The returned object replaces `model`, so this cell must run before LoRA is applied.
# NOTE(review): peft's prepare_model_for_kbit_training appears to enable gradient
# checkpointing by default, which would make the explicit call above redundant — confirm.
model = prepare_model_for_kbit_training(model)

# Function to calculate and display the number of trainable parameters in the model
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields ``(name, param)``
            pairs, where each ``param`` has ``numel()`` and ``requires_grad``.

    Prints one summary line with the trainable count, the total count and the
    trainable percentage, or a notice when the model has no parameters.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)

    # Guard against dividing by zero for a model without parameters
    if not all_params:
        print("No parameters found.")
        return

    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.2f}")

# LoRA (Low-Rank Adaptation) settings for parameter-efficient fine-tuning
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.2,
    bias="none",
    # Modules that receive LoRA adapters
    target_modules=[
        # Attention: query, key, value and output projections
        *("q_proj", "k_proj", "v_proj", "o_proj"),
        # Feed-forward (MLP): gate, up and down projections
        *("gate_proj", "up_proj", "down_proj"),
    ],
)

# Wrap the prepared model with the LoRA adapters; `model` now refers to the wrapped model
model = get_peft_model(model, lora_config)

In [None]:
# Report how many parameters the LoRA-wrapped model will actually train.
# NOTE(review): the recorded total is far below the nominal 7B parameters, presumably
# because 4-bit quantized weights are stored in packed form — confirm before quoting the percentage.
print_trainable_parameters(model)

trainable params: 20971520 || all params: 3773042688 || trainable%: 0.56


In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run
training_arguments = TrainingArguments(
    # Where checkpoints are written; no external experiment tracker
    output_dir="./results",
    report_to="none",
    # Log, evaluate and checkpoint once per epoch
    logging_strategy="epoch",
    eval_strategy="epoch",
    do_eval=True,
    save_strategy="epoch",
    # 4 samples per device, gradients accumulated over 4 steps before each update
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimizer and schedule length
    optim="paged_adamw_32bit",  # paged AdamW keeping 32-bit optimizer state
    learning_rate=1e-4,
    weight_decay=0.001,
    num_train_epochs=2,
    # Train in bfloat16 mixed precision
    bf16=True,
)

In [None]:
# Custom processing class for supervised fine-tuning (SFT)
class SFTProcessing:
    """Tokenizer wrapper that truncates every encoded sample to a fixed length.

    Calling the instance tokenizes the input with ``truncation=True`` and
    ``max_length=max_seq_length``. Attributes that the wrapper does not define
    are looked up on the wrapped tokenizer, so the object can be used wherever
    the tokenizer itself is expected.
    """

    def __init__(self, tokenizer, max_seq_length=1024):
        """
        Initializes the processing class with a tokenizer and sequence length.

        Args:
            tokenizer: Callable tokenizer to wrap; it must accept the
                ``truncation`` and ``max_length`` keyword arguments.
            max_seq_length: Maximum number of tokens kept per encoded sample.
        """
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, sample):
        """
        Tokenizes ``sample`` with truncation to ``max_seq_length``.

        Args:
            sample: Either raw text, or a dict whose ``"text"`` entry holds the
                text. A dict without a ``"text"`` key is passed to the tokenizer as is.

        Returns:
            Whatever the wrapped tokenizer returns for the truncated encoding.
        """
        # If the sample is a dictionary, extract the "text" field; otherwise, use the sample itself
        if isinstance(sample, dict):
            text = sample.get("text", sample)
        else:
            text = sample

        return self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_seq_length,
        )

    def pad(self, *args, **kwargs):
        """
        Pads tokenized sequences to ensure uniform length.
        Uses the tokenizer's built-in padding functionality.
        """
        return self.tokenizer.pad(*args, **kwargs)

    def __getattr__(self, name):
        """
        Forwards attribute lookups that fail on the wrapper to the wrapped tokenizer.

        Raises:
            AttributeError: If the wrapper has no tokenizer yet, or the tokenizer
                does not provide ``name``.
        """
        # Python calls __getattr__ only after normal lookup has failed. Reading
        # `self.tokenizer` here would re-enter __getattr__ forever on an instance
        # whose __dict__ is still empty (as created by copy/pickle before the state
        # is restored), so the tokenizer is fetched from __dict__ directly.
        try:
            wrapped = self.__dict__["tokenizer"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        return getattr(wrapped, name)

# Wrap the tokenizer so that encoded samples are truncated to at most 1024 tokens
processing_class = SFTProcessing(tokenizer=tokenizer, max_seq_length=1024)

# Build the supervised fine-tuning trainer from the model, the training
# arguments, the LoRA configuration, the tokenizer wrapper and both dataset splits
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=lora_config,
    processing_class=processing_class,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Use non-reentrant gradient checkpointing
# - Assigning `torch.utils.checkpoint.use_reentrant = False` only creates an unused
#   attribute on the module; nothing reads it, so it does not change how checkpointing runs.
# - The flag has to be handed to the model's own gradient-checkpointing setup instead.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# Disable the key/value cache during training
# - The cache of past attention key/value pairs is only useful for generation.
# - It is turned off here because it is not needed alongside gradient checkpointing.
model.config.use_cache = False

# Start the training process
trainer.train()

In [None]:
# Repository name under which the fine-tuned model is published on the Hugging Face Hub
my_finetuned_model = "mistral-7B-wikitext-finetuned"

# Upload the trainer's model to the Hub (requires the login performed earlier).
# NOTE(review): since the model is LoRA-wrapped, this presumably uploads only the
# adapter weights rather than a merged full model — confirm.
trainer.model.push_to_hub(my_finetuned_model)