<a href="https://colab.research.google.com/github/pinzger/handsonllms/blob/main/Fine_tuning_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning GPTs

Example code covers:
   * Using QLoRA to fine-tune TinyLlama to follow instructions

Example adapted from Chapter 12 of [Hands-On Large Language Models](https://www.amazon.com/Hands-Large-Language-Models-Understanding/dp/1098150961).

---

💡 **NOTE**: For using a GPU in Google Colab, go to
**Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4**.

---

If you are viewing this notebook on Google Colab (or any other cloud vendor), you might need to **uncomment and run** the following codeblock to install the dependencies for this chapter:

In [None]:
# %%capture
# !pip install -q accelerate==0.31.0 peft==0.11.1 bitsandbytes==0.43.1 transformers==4.41.2 trl==0.9.4 sentencepiece==0.2.0
# NOTE(review): the Trainer cells in this notebook pass `processing_class=...`, which
# transformers' Trainer accepts only from v4.46 on (to be confirmed); the
# transformers==4.41.2 pin above predates that. Update the pins before uncommenting,
# or rely on the package versions preinstalled in Colab.

# Supervised Fine-Tuning (SFT)
## Data Preprocessing
We use the HuggingFaceH4/ultrachat_200k dataset for fine-tuning "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T".

The template of this model is (see also the book):<br>
\<|user|\><br>
Question\</s\> (EOS token)<br>
\<|assistant|\><br>
The answer is \</s\>

The goal is to fine-tune the model to generate "better" answers. For that, we use AutoModelForCausalLM, which adds a head that predicts the next token in a sequence of tokens (typically used for text generation).


In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset


# Load a tokenizer only for its chat template: format_prompt uses it to render
# conversations as plain text (tokenize=False), not to produce token ids.
template_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

def format_prompt(example):
    """Render one conversation as a single chat-template string.

    Args:
        example: A dataset row; its "messages" entry is handed to the chat
            template of `template_tokenizer`.

    Returns:
        A dict with the single key "text" holding the rendered, untokenized prompt.
    """
    rendered = template_tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,  # return the formatted string instead of token ids
    )
    return {"text": rendered}

# Build the fine-tuning corpus in one pipeline: take the "test_sft" split of
# UltraChat, shuffle it with a fixed seed, keep the first 3000 conversations
# (a small subset keeps training fast), and render each conversation into a new
# "text" column with format_prompt.
dataset = (
    load_dataset("HuggingFaceH4/ultrachat_200k", split="test_sft")
    .shuffle(seed=42)
    .select(range(3000))
    .map(format_prompt)
)

In [None]:
# Print the dataset object to inspect it after formatting
print(dataset)

In [None]:
# Example of a formatted prompt. Index the row first and the column second:
# `dataset["text"][576]` would load the entire "text" column just to read one entry.
print(dataset[576]["text"])

## Load the model and configure quantization
Quantization reduces the VRAM required for loading the model from roughly 4 GB to roughly 1 GB (the weights are stored in 4 bits instead of 16 bits; the saving is even larger compared to full 32-bit weights).

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint that gets fine-tuned
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Quantization settings -- the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # quantization type
    bnb_4bit_use_double_quant=True,         # apply nested quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype ("float16" is the alternative)
)

# Load the model for training on the GPU.
# Leave `quantization_config` out for regular (non-quantized) SFT.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Model settings for training
model.config.pretraining_tp = 1
model.config.use_cache = False  # do not use the cache when training/fine-tuning

# Tokenizer of the same checkpoint, padding on the left with a "<PAD>" token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = "<PAD>"


## Tokenize the training data

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch to a fixed length of 512 tokens.

    Args:
        examples: A batch of dataset rows; only its "text" entry is read.

    Returns:
        Whatever the module-level `tokenizer` returns for the texts, with
        truncation enabled and padding to the maximum length.
    """
    encode_options = {
        "padding": "max_length",  # pad every example up to max_length
        "truncation": True,       # cut off longer texts
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize the whole dataset; batched=True hands batches of rows to tokenize_function
tokenized_data = dataset.map(tokenize_function, batched=True)

In [None]:
# Hold out 20% of the tokenized examples for evaluation. The fixed seed makes the
# split reproducible, so a fresh run trains and evaluates on the same examples.
split_dataset = tokenized_data.train_test_split(test_size=0.2, seed=42)
training_data = split_dataset["train"]
evaluation_data = split_dataset["test"]

In [None]:
# print(evaluation_data[0])

## LoRA Configuration
Using the peft library.

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Layers that receive LoRA adapters; fewer layers are faster to train but might
# lose performance
lora_target_modules = [
    'k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj'
]

# LoRA configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # rank of the update matrices
    lora_alpha=32,  # LoRA scaling; note this is 0.5 x r here, not the often-quoted 2 x r
    lora_dropout=0.1,  # dropout for the LoRA layers
    bias="none",
    target_modules=lora_target_modules,
)

# Prepare the quantized model for training, then wrap it with the LoRA adapters
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# Show how many parameters are trainable
model.print_trainable_parameters()

## Training Configuration


In [None]:
from transformers import TrainingArguments

import torch  # used for the bf16 capability check below

output_dir = "./tinyllama-finetuned"

# Mixed precision: bf16 stays the default, but a CUDA GPU that reports no bf16
# support (the recommended T4 has no native bf16) falls back to fp16 instead of
# failing when the training arguments are validated.
use_fp16 = torch.cuda.is_available() and not torch.cuda.is_bf16_supported()

# Training arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    save_total_limit=1,  # keep the last checkpoint
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,  # 32 x 4 = 128 examples per optimizer step and device
    optim="paged_adamw_32bit",  # suggested by the original QLoRA paper
    learning_rate=2e-4,  # the QLoRA paper uses a lower rate for its largest (33B+) models
    lr_scheduler_type="cosine",  # cosine learning-rate schedule; no warmup is configured here
    num_train_epochs=1,  # typically low for fine-tuning GPTs
    logging_steps=10,
    bf16=not use_fp16,
    fp16=use_fp16,
    report_to="none",
    load_best_model_at_end=True,
    # gradient_checkpointing=True  # saves memory but costs more training time
)

## Training
Using the Trainer class.

In [None]:
from trl import SFTTrainer
from transformers import Trainer, DataCollatorForLanguageModeling

# Collator for causal language modeling (mlm=False, i.e. no masked-LM objective)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Plain Hugging Face Trainer working on the pre-tokenized splits;
# no compute_metrics function is passed
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=training_data,
    eval_dataset=evaluation_data,
)

# Run the fine-tuning
trainer.train()

#

In [None]:
# Save only the trained LoRA adapters into "<output_dir>/best-model-layers"
trainer.model.save_pretrained(f"{output_dir}/best-model-layers")

In [None]:
# For comparison: save the trained LoRA adapters together with the tokenizer and
# the training arguments into "<output_dir>/best-model"
trainer.save_model(f"{output_dir}/best-model")

## Merging the models
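Reload the model without the 4-bit quantization and merge the LoRA weights into the base weights. The cell below passes no `torch_dtype`, so the weights are loaded in the library's default precision; pass e.g. `torch_dtype=torch.float16` to load them explicitly in 16 bits.

**Note:** Only do this if you want to create a stand-alone model without LoRA adapters.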

In [None]:
from peft import AutoPeftModelForCausalLM

# Load the saved adapters together with their base model. This requires the base
# model to be in the adapter directory or to be specified in the adapter_config file.
model = AutoPeftModelForCausalLM.from_pretrained(
    f"{output_dir}/best-model-layers",
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Merge the LoRA weights into the base model -- only needed if you want a
# stand-alone model again
merged_model = model.merge_and_unload()

## Loading and Testing the QLoRA fine-tuned model
First, load the model and apply the trained LoRA adapters to it. Note that AutoPeftModelForCausalLM can be used if the base model is specified in the adapter_config file or is available in the directory that also contains the trained LoRA adapters. Otherwise, the base model has to be loaded first; PeftModel is then used to load and apply the trained LoRA adapters.

Furthermore, I figured that 4bit and 8bit quantization cannot be mixed. This means, that LoRA adapters trained on a 4bit base model cannot be applied to the same base model loaded with 8bit quantization.

Applying the 4bit trained LoRA adapters on the base model loaded with float16 or bfloat16, however, works.

Note: according to ChatGPT, LoRA with an 8-bit base model is more stable and has slightly better quality (~1–2% better), especially if the training dataset is small and noisy. This claim has not been verified here.

In [None]:
import torch
from transformers import Trainer, DataCollatorForLanguageModeling, AutoModelForCausalLM, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM, PeftModel

# Directory written by trainer.save_model(): LoRA adapters plus tokenizer
lora_path = f"{output_dir}/best-model"

# Quantization settings -- the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # quantization type
    bnb_4bit_use_double_quant=True,         # apply nested quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype ("float16" is the alternative)
)

tokenizer = AutoTokenizer.from_pretrained(lora_path, trust_remote_code=True)

# One-step loading: AutoPeftModelForCausalLM requires either the base model in the
# folder or a base_model_name_or_path entry in the adapter config file.
# Two-step alternative: load the base model with
#   AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
# and attach the adapters with PeftModel.from_pretrained(base_model, lora_path).
model = AutoPeftModelForCausalLM.from_pretrained(
    lora_path,
    device_map="auto",
    quantization_config=bnb_config,
)

# The cache was switched off for training; switch it on again
model.config.use_cache = True


## Evaluating the loaded model

In [None]:
# Collator for causal language modeling, built on the reloaded tokenizer
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Evaluation-only Trainer: reuses the training arguments and the held-out split;
# no train dataset and no compute_metrics function are passed
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    processing_class=tokenizer,
    eval_dataset=evaluation_data,
)

metrics = trainer.evaluate()
print(metrics)

## Using the model

In [None]:
def small_test(model, question="What is generative AI?", max_length=100):
    """Generate an answer to a single question with the given model.

    The prompt uses the chat layout the model was fine-tuned on: a "<|user|>"
    line, the question followed by "</s>", and an "<|assistant|>" line, each
    terminated by a newline and without any leading whitespace.

    Args:
        model: Model exposing `device` and `generate`.
        question: Text placed in the user turn of the prompt.
        max_length: Value forwarded as `max_length` to `model.generate` to
            limit the length of the output.

    Returns:
        The token ids returned by `model.generate`.
    """
    # Build the prompt explicitly: a triple-quoted string inside the function
    # body would carry the code indentation into the prompt lines.
    prompt = f"<|user|>\n{question}</s>\n<|assistant|>\n"

    # Move the inputs to the device the model lives on instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_length=max_length)

    return output_ids

In [None]:
# Generate with the fine-tuned model, decode the first returned sequence
# (special tokens are skipped), and print it
output_ids = small_test(model)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"{output_text} ...")

### Comparison with the original model

In [None]:
# Load the original checkpoint (no quantization config, no LoRA adapters) and
# run the same prompt through it for comparison
original_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", device_map="auto")
output_ids = small_test(original_model)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"{output_text} ...")