Fine-tune Llama 3.1 8B with Unsloth

In [None]:
!pip install -qqq "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --progress-bar off
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton --progress-bar off

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

## 1. Load model for PEFT

In [None]:
# Supervised fine-tuning (SFT) involves retraining base models on a smaller dataset of instructions and answers.
# The 3 most popular SFT techniques are full fine-tuning, LoRA and QLoRA.
#   1. Full fine-tuning. It involves retraining all parameters of a pre-trained model on an instruction dataset.
#   2. LoRA. Instead of retraining the entire model, it freezes the weights and introduces small adapters (low-rank matrices) at each targeted layer.
#   3. QLoRA. An extension of LoRA that offers even greater memory savings (33% additional memory reduction). We will use this because we are using Google Colab, which has some limitations.

In [None]:
# Load the base model together with its tokenizer.
# The checkpoint is the 4-bit build of Llama 3.1 8B (smaller and faster than the 16-bit version).
# The context length is capped at 2048 tokens here; Llama supports up to 128k tokens.
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    dtype=None,
    load_in_4bit=True,
    max_seq_length=max_seq_length,
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
)

# Prepare the model for PEFT by attaching LoRA adapters.
# Rank (r): inner dimension of the low-rank adapter matrices (each adapter is a pair
#   of matrices whose shared dimension is r = 16), not a 16x16 matrix.
# Alpha (lora_alpha): scaling factor for updates. Directly impacts the adapters' contribution.
# Target modules: model components that receive adapters and are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,  # rank-stabilized LoRA: scaling proportional to 1/√r instead of 1/r
    use_gradient_checkpointing="unsloth",
)
# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
model.print_trainable_parameters()
# trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
# We only train 42 million out of 8 billion parameters (0.5196%).

## 2. Prepare data and tokenizer

In [None]:
# Switch the tokenizer to the ChatML template (OpenAI), which adds two special tokens
# (<|im_start|> and <|im_end|>) to indicate who is speaking.
# The mapping ties the template's role/content fields to the "from"/"value" keys
# and the "human"/"gpt" speaker names used by the conversations in this notebook.
tokenizer = get_chat_template(
    tokenizer,
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
    chat_template="chatml",
)

def apply_template(examples):
    """Render every conversation in a batch to a single chat-formatted string.

    Reads the batch's "conversations" column and returns a dict with a "text"
    column holding one rendered string per conversation. The text is left
    untokenized and no generation prompt is appended.
    """
    def render(conversation):
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

    return {"text": [render(conversation) for conversation in examples["conversations"]]}

# Only the first 10,000 samples are used because of GPU limitations on Google Colab.
dataset = load_dataset(path="mlabonne/FineTome-100k", split="train[:10000]")
# Apply the chat template to every conversation, in batches.
dataset = dataset.map(function=apply_template, batched=True)

## 3. Training

In [None]:
import os

# Set the UNSLOTH_RETURN_LOGITS flag for this process before training starts.
# NOTE(review): presumably this makes Unsloth return full logits from the forward
# pass — confirm against the Unsloth documentation.
os.environ["UNSLOTH_RETURN_LOGITS"] = "1"

In [None]:
from trl import SFTConfig

# Training hyperparameters for supervised fine-tuning.
# Mixed precision: fp16 and bf16 are mutually exclusive (TrainingArguments raises
# a ValueError when both are True), so fp16 is enabled only when the GPU does
# not support bf16.
config = SFTConfig(
    output_dir="output",
    learning_rate=3e-4,
    lr_scheduler_type="linear",  # learning-rate schedule
    per_device_train_batch_size=2,  # kept small on purpose
    gradient_accumulation_steps=2,  # effective batch size per device: 2 * 2 = 4
    num_train_epochs=1,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",  # 8-bit optimizer for memory savings
    report_to=[],
    weight_decay=0.01,  # regularizer
    warmup_steps=10,
    seed=0,
    max_seq_length=max_seq_length,
    dataset_text_field="text",
    packing=True,
    dataset_num_proc=2,
)

# Build the trainer from the model, the templated dataset and the SFT config,
# passing the tokenizer as the processing class.
trainer = SFTTrainer(
    args=config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

# Run the fine-tuning loop.
trainer.train()


## 4. Inference

In [None]:
# Quick sanity check only (not used for evaluation): switch the model to
# inference mode and stream a single answer.
model = FastLanguageModel.for_inference(model)

# Single-turn prompt in the "from"/"value" message format.
messages = [{"from": "human", "value": "Is 9.11 larger than 9.9?"}]

# Tokenize the prompt with the generation prompt appended, then move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    tokenize=True,
)
inputs = inputs.to("cuda")

# Generate up to 128 new tokens, handing them to the streamer as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids=inputs,
    max_new_tokens=128,
    streamer=text_streamer,
    use_cache=True,
)