In [None]:
# Check that PyTorch is using the GPU.
import torch
torch.cuda.is_available()

In [None]:
# Set environment variables, this must be done before importing transformers.
from dotenv import load_dotenv
load_dotenv(override=True)

import functools
import json
import numpy as np
import os
import torch
from tqdm import tqdm

# Report where models will be read from / written to.
# HF_HOME is optional, so fall back to the conventional default cache location
# instead of raising KeyError when it is not set.
hf_home = os.environ.get("HF_HOME", os.path.join(os.path.expanduser("~"), ".cache", "huggingface"))
# Accept the usual truthy spellings ("1", "ON", "YES", "TRUE") rather than
# crashing in int() on a value such as "true".
transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE", "").strip().upper() in {"1", "ON", "YES", "TRUE"}

if transformers_offline:
    print("Using cached models from", hf_home)
else:
    print("Loading model from huggingface hub and saving to", hf_home)

# Optional: disable warnings.
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Introduction to Llama-3

Meta's Llama-3 is one of the state-of-the-art open-source language models. We will use the 8 billion parameter instruction-tuned version in this tutorial.  
Instruction tuned models are intended for assistant-like chat, whereas pretrained models can be adapted for a variety of natural language generation tasks.

Llama-3 Instruct models can be prompted using a chat template: a prompt is a list of messages, each of which has a role (one of "system", "user" or "assistant") and content. System messages are meta-prompts that are used to define instructions that are hidden from the end user, whereas the user and assistant (the LLM) are taking turns in a chat dialogue.

You can run conversational inference using the Transformers pipeline abstraction, or by leveraging the Auto classes with the generate() function.

**Note:** Make sure to set the `HF_TOKEN` environment variable to access Llama-3 (unless you are running on Puhti or Mahti, where Llama-3-8B-Instruct is already downloaded).

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Build a text-generation pipeline for `model_id`, loading the weights in bfloat16.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# A chat prompt is a list of {"role", "content"} messages (see the introduction above).
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Token ids passed as `eos_token_id` below: the tokenizer's EOS token and
# the id of the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Sample a reply of at most 256 new tokens (temperature 0.6, nucleus sampling with top_p 0.9).
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# Presumably "generated_text" holds the whole chat and its last entry is the
# newly generated assistant message -- TODO confirm for your transformers version.
print(outputs[0]["generated_text"][-1])

In [None]:
# Same pipeline and sampling settings as in the previous cell, with a different
# system prompt and user question. Reuses `pipeline` and `terminators` defined there.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# Print only the last entry of the returned chat.
print(outputs[0]["generated_text"][-1])

## Once you are done with the pipeline, delete the variable to free up GPU memory for the next sections.

In [None]:
# Dropping the name alone is not enough to make the memory available again:
# run the garbage collector so the model object is actually destroyed, then ask
# PyTorch to release the now-unused blocks held by its CUDA caching allocator.
import gc

del pipeline  # Free up GPU memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Fine-tuning Llama-3

We will now fine-tune Llama-3-8B on a new dataset.

We would like the model to reproduce human users' preferences over two possible completions: given `(prompt, response A, response B)`, it should predict A, B, or tie according to what a human user preferred. We will use real human preference data collected on the Chatbot arena ([lmsys-arena-human-preference-55k](https://huggingface.co/datasets/lmsys/lmsys-arena-human-preference-55k)). This dataset is hosted by Huggingface as part of a vast library of datasets, which you can explore at https://huggingface.co/datasets.

Note that this prediction problem could also be modeled as a classification task using e.g. a BERT-like model, but we will use Llama to demonstrate general autoregressive language model fine-tuning. A general model like Llama will be a good starting point for developing more flexible user models with LLMs: models that give their preferences in natural language, in scalar reward values, etc.

## Preparing data


### Format fine-tuning prompt-response pairs

In [None]:
from datasets import load_dataset

We only load a subset of the dataset to keep training time reasonable for this tutorial.

In [None]:
# Hub identifier of the human-preference dataset used for fine-tuning.
dataset_name = "lmsys/lmsys-arena-human-preference-55k"
# Only a 2000-row slice of the train split is requested, to keep the tutorial fast.
dataset = load_dataset(dataset_name, split='train[:2000]')

# Show the dataset summary as the cell output.
dataset

In [None]:
dataset.info.features

In [None]:
def parse_list_from_str_column(text):
    """Decode a dataset column that stores a JSON-encoded list as a string.

    Args:
        text: JSON text expected to encode a list (e.g. '["turn 1", "turn 2"]').

    Returns:
        The decoded list, with a single trailing ``None`` (JSON ``null``) entry
        removed. An empty list is returned, after printing diagnostics, when
        ``text`` cannot be decoded or does not encode a list.
    """
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input such as None.
        # Best effort: report the offending value and carry on with an empty list.
        print(e)
        print(text)
        print()
        return []

    if not isinstance(parsed, list):
        # Valid JSON that is not a list (e.g. "null") would otherwise crash in len() below.
        print(f"Expected a JSON list, got {type(parsed).__name__}")
        print(text)
        print()
        return []

    if len(parsed) > 0 and parsed[-1] is None:
        parsed = parsed[:-1]
    return parsed


def format_prompt(example):
    """Attach the preference question and its gold answer to a dataset row.

    Reads the "winner_model_a" / "winner_model_b" flags and the JSON-encoded
    "prompt", "response_a" and "response_b" columns of ``example``. Writes the
    assembled question to "preference_prompt" and the answer ("A", "B" or
    "tie") to "preference_label", then returns the same ``example``.
    """
    instruction = 'Which response to the following prompt is better? Answer with "A", "B" or "tie" only.'

    if example["winner_model_a"]:
        answer = "A"
    elif example["winner_model_b"]:
        answer = "B"
    else:
        answer = "tie"

    # Decode all three columns first, then join each one under its section header.
    decoded = [
        parse_list_from_str_column(example[column])
        for column in ("prompt", "response_a", "response_b")
    ]
    headers = ("\n\nUser prompt:\n", "\n\nResponse A:\n", "\n\nResponse B:\n")

    pieces = [instruction]
    for header, turns in zip(headers, decoded):
        pieces.append(header + "\n\n".join(turns))

    example["preference_prompt"] = "".join(pieces)
    example["preference_label"] = answer
    return example


def is_single_turn_dialogue(example):
    """Return True for rows holding exactly one prompt and one response per model.

    A row whose single prompt or single response is the literal string "null"
    is reported on stdout and rejected.
    """
    prompts = parse_list_from_str_column(example["prompt"])
    responses_a = parse_list_from_str_column(example["response_a"])
    responses_b = parse_list_from_str_column(example["response_b"])

    # Guard clause: anything other than one turn per column is filtered out.
    if any(len(turns) != 1 for turns in (prompts, responses_a, responses_b)):
        return False

    has_null = any(turns[0] == "null" for turns in (prompts, responses_a, responses_b))
    if has_null:
        print("IS NULL")
        print(prompts, responses_a, responses_b)
    return not has_null


# Keep one-turn dialogues only, then format every remaining row
# into a (prompt, answer) pair.
prompt_dataset = dataset.filter(is_single_turn_dialogue).map(format_prompt)

print('Original dataset length', len(dataset), 'filtered length', len(prompt_dataset))

### Inspect data

In [None]:
prompt_dataset[0]

In [None]:
print(prompt_dataset[0]["preference_prompt"])

In [None]:
print(prompt_dataset[0]["preference_label"])

### Tokenize text

In [None]:
# Load the tokenizer that belongs to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed because this tokenizer defines no pad token -- confirm.
tokenizer.pad_token = tokenizer.eos_token

We will run Llama's tokenizer on the data once. To do this, we need a way to convert a list of chat messages from the chat template into a string prompt.

Each instruction-tuned model has been trained with a specific template for turning lists of messages into a prompt string.  
For Llama-3, this is documented in its model card at https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/.

In [None]:
def llama_format_finished_chat(messages):
    """Render a list of chat messages as one Llama-3 prompt string.

    Every message becomes a role header followed by its content and an
    ``<|eot_id|>`` marker, following the format documented in
    https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/.

    Raises:
        ValueError: if a message role is not "system", "assistant" or "user".
    """
    known_roles = ("system", "assistant", "user")
    rendered = []
    for message in messages:
        role = message["role"]
        if role not in known_roles:
            raise ValueError(f"Wrong message role {role}.")
        rendered.append(
            f"<|start_header_id|>{role}<|end_header_id|>\n\n{message['content']}<|eot_id|>"
        )
    return "".join(rendered)


def preference_data_to_finished_chat(example):
    """Format a preference example as a complete chat, including the answer.

    The chat consists of a fixed system message, the example's
    "preference_prompt" as the user turn and its "preference_label" as the
    assistant turn. Returns the formatted prompt string.
    """
    return llama_format_finished_chat([
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": example["preference_prompt"]},
        {"role": "assistant", "content": example["preference_label"]},
    ])

Our input pipeline needs to differentiate between the prompt and the completion: both need to be concatenated and passed as input to the model in training, but the cross-entropy loss should only be applied to the completion.

This can be implemented using standard Huggingface Trainers by setting the "labels" of the corresponding tokens to -100 (the "ignore index").

In [None]:
def llama_continue_chat_prompt(messages):
    """Format `messages` and leave the prompt open for a new assistant reply."""
    # Header that starts a fresh assistant message; the model continues from here.
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return llama_format_finished_chat(messages) + assistant_header


def preference_data_to_chat_prompt(example):
    """Format a preference example as a chat prompt that awaits the model's answer.

    Only "preference_prompt" is read, so this also works on unlabeled examples
    (the previous version looked up "preference_label" without using it).

    Args:
        example: mapping with a "preference_prompt" entry.

    Returns:
        The prompt string: system message, user message and an open assistant header.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": example["preference_prompt"]},
    ]
    return llama_continue_chat_prompt(messages)


def tokenize_unpadded(example, tokenizer):
    """Tokenize one preference example for training, without padding.

    The prompt and the completion are tokenized together; the labels copy the
    input ids, except that all prompt positions are set to -100 so the loss is
    only applied to the completion.

    Args:
        example: row providing "preference_prompt" and "preference_label".
        tokenizer: callable tokenizer returning "input_ids" and "attention_mask"
            tensors with a leading batch dimension when called with
            ``return_tensors="pt"``.

    Returns:
        dict with 1-D "input_ids", "attention_mask" and "labels" tensors.
    """
    prompt_and_completion = preference_data_to_finished_chat(example)
    encoding = tokenizer(prompt_and_completion, return_tensors="pt")

    # Index the batch dimension explicitly: a bare .squeeze() would also drop
    # the sequence dimension if it ever had length 1.
    input_ids = encoding["input_ids"][0]
    attention_mask = encoding["attention_mask"][0]

    # Number of leading tokens that belong to the prompt.
    # NOTE(review): assumes tokenizing the prompt alone yields a prefix of the
    # tokenization of prompt + completion -- confirm for other tokenizers.
    prompt_only = preference_data_to_chat_prompt(example)
    prompt_length = tokenizer(prompt_only, return_tensors="pt")["input_ids"].shape[1]

    # Create labels, with -100 for prompt tokens and actual tokens for completion
    labels = input_ids.clone()
    labels[:prompt_length] = -100  # Ignore prompt part in loss computation

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# Tokenize every example once; `tokenizer` is forwarded to tokenize_unpadded as a keyword argument.
tokenized_dataset = prompt_dataset.map(tokenize_unpadded, fn_kwargs={"tokenizer": tokenizer})

In [None]:
# For computational efficiency, filter out very long examples.
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) < 1024)

In [None]:
len(tokenized_dataset)

In [None]:
# Shuffle once with a fixed seed, then hold out 10% for validation.
# The split itself does not reshuffle (shuffle=False), so it follows the seeded order.
train_val_split = (tokenized_dataset
    .shuffle(seed=123)
    .train_test_split(test_size=0.1, shuffle=False))

After these filtering steps, we will ultimately train on about 1400 prompt-response pairs.

In [None]:
len(train_val_split["train"])

In [None]:
len(train_val_split["test"])

In [None]:
# Prepare training and validation data to only include relevant columns.
columns_to_drop = [k for k in tokenized_dataset[0].keys() if k not in ["input_ids", "attention_mask", "labels"]]
train_tokens = train_val_split["train"].remove_columns(columns_to_drop)
val_tokens = train_val_split["test"].remove_columns(columns_to_drop)

# PEFT: Parameter-efficient fine-tuning

Instead of fine-tuning all 8 billion parameters, which would be very computationally expensive, we will use LoRA, low-rank adaptation, to reduce the computational cost.  
To make fine-tuning more efficient, LoRA represents weight updates with two smaller matrices (called update matrices) through low-rank decomposition: https://arxiv.org/abs/2106.09685.

We will further speed up training by quantizing model weights from their original float32 data type to 4-bit floating points. Quantization together with low-rank adaptation is referred to as QLoRA.

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig
import torch

In [None]:
# Load the weights in 4-bit precision; float16 is requested as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16)
# Load the base model for `model_id` with the quantization settings above.
init_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                  quantization_config=quantization_config,
                                                  low_cpu_mem_usage=True,
                                                  device_map="auto",
                                                 )

In [None]:
# Freeze all parameters
# Every parameter of the base model is excluded from gradient updates here;
# trainable weights are only introduced later, when the LoRA adapter is added.
for param in init_model.parameters():
    param.requires_grad = False

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameters of a model are trainable, and how many it has in total.

    Args:
    model: any object exposing ``named_parameters()`` (e.g. a Hugging Face model).
    """
    # (element count, is trainable) for every parameter tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, trainable in sizes if trainable)
    print(f"trainable_params: {trainable_params}")
    print(f"all_params: {all_params}")
    
print_trainable_parameters(init_model)

r = 8 defines the rank of the adaptation matrix:

In [None]:
# LoRA adapter settings for causal language modeling.
lora_config = LoraConfig(
    r=8,  # rank of the adaptation matrices
    lora_alpha=32,
    # Adapters are attached to modules with these names
    # (presumably the attention query and key projections -- confirm against the model).
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
peft_model = get_peft_model(init_model, lora_config)

In [None]:
print_trainable_parameters(peft_model)

Using QLoRA, we have reduced the trainable parameters from 8 billion to 3.4 million.

# Define Trainer

Huggingface transformers defines a Trainer API that makes it convenient to set up training and evaluation tasks.

In this tutorial, we will use the specialized Supervised Fine-Tuning (SFT) trainer from the TRL (Transformer Reinforcement Learning) library.

In [None]:
from trl import SFTConfig, SFTTrainer

In [None]:
# Training hyperparameters. With batch_size 1 and 16 accumulation steps, gradients
# from 16 sequences are accumulated per optimizer update (per device).
batch_size = 1
learning_rate = 1e-4
grad_acc_steps = 16
# Output directory name derived from the model id and hyperparameters;
# "/" is replaced so the name is a single path component.
trained_model_id = f"{model_id}_lora_4bit_lr{learning_rate}_acc{grad_acc_steps}_1".replace("/", "__")

sft_config = SFTConfig(
    trained_model_id,
    # The dataset is already tokenized (input_ids / attention_mask / labels), so keep its columns as they are.
    remove_unused_columns=False,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_acc_steps,
    per_device_eval_batch_size=batch_size,
    # Matches the 1024-token length filter applied to the tokenized dataset.
    max_seq_length=1024,
    fp16=True,
    num_train_epochs=1,
    logging_steps=10,
    # Reload the checkpoint with the best "loss" metric when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    label_names=["labels"],
)

trainer = SFTTrainer(
    peft_model,
    train_dataset=train_tokens,
    eval_dataset=val_tokens,
    args=sft_config,
)

In [None]:
# To run evaluation in isolation, such as before training:
# eval_results = trainer.evaluate()
# eval_results

In [None]:
# In addition to evaluating language modeling loss (cross entropy),
# we implement the below functions to measure accuracy of the multiple-choice predictions.
def predict_with_model(model, val_tokens, generation_kwargs=None):
    """Generate a completion for the prompt part of every tokenized example.

    The prompt of an example is the set of positions whose label is -100; the
    remaining label tokens are decoded as the reference answer. Decoding uses
    the module-level ``tokenizer``.

    Args:
        model: model exposing ``generate``, ``device``, ``eval`` and ``train``.
        val_tokens: iterable of examples with "input_ids", "attention_mask" and
            "labels" (lists or tensors).
        generation_kwargs: keyword arguments for ``model.generate``. Defaults to
            greedy decoding of at most 10 new tokens, so that the resulting
            accuracy score is deterministic.

    Returns:
        Tuple ``(token_predictions, text_predictions, text_labels)``: generated
        token ids per example, their decoded text, and the decoded reference labels.
    """
    if generation_kwargs is None:
        generation_kwargs = {"max_new_tokens": 10, "do_sample": False, "top_p": None, "temperature": None}

    val_token_predictions = []
    val_predictions = []
    val_labels = []

    # Remember the current mode so a model that was in eval mode is not
    # silently switched to train mode by running predictions.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for example in tqdm(val_tokens):
                # as_tensor accepts both plain lists and tensors (e.g. after
                # set_format("torch")) and keeps token ids as integers throughout.
                labels = torch.as_tensor(example["labels"], dtype=torch.long)
                is_prompt = labels == -100
                input_ids = torch.as_tensor(example["input_ids"], dtype=torch.long)[is_prompt].unsqueeze(0)
                input_ids = input_ids.to(model.device)
                attention_mask = torch.as_tensor(example["attention_mask"], dtype=torch.long)[is_prompt].unsqueeze(0)
                attention_mask = attention_mask.to(model.device)
                outputs = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, **generation_kwargs)
                # Keep only the newly generated tokens, dropping the echoed prompt.
                new_outputs = outputs[0, input_ids.shape[1]:]
                val_token_predictions.append(new_outputs)
                val_predictions.append(tokenizer.decode(new_outputs, skip_special_tokens=True))
                val_labels.append(tokenizer.decode(labels[~is_prompt], skip_special_tokens=True))
    finally:
        model.train(was_training)
    return val_token_predictions, val_predictions, val_labels


def get_prediction_accuracy(labels, predictions, loose_matching=False):
    """Fraction of predictions that match their label.

    Args:
        labels: sequence of reference strings.
        predictions: sequence of predicted strings, paired with ``labels`` by
            position; surplus items of the longer sequence are ignored.
        loose_matching: if True, a prediction counts as correct when it merely
            starts with its label (e.g. "A. Because ..." matches "A").

    Returns:
        Accuracy as a float in [0, 1]. Returns 0.0 when there are no pairs to
        compare, instead of dividing by zero.
    """
    length = min(len(labels), len(predictions))
    if length == 0:
        return 0.0

    pairs = zip(labels, predictions)
    if loose_matching:
        n_correct = sum(label == prediction[:len(label)] for label, prediction in pairs)
    else:
        n_correct = sum(label == prediction for label, prediction in pairs)
    return n_correct / length


def count_label_distribution(labels):
    """Count how often each distinct label occurs.

    Prints the relative frequency of every label and returns the raw counts
    as a dict keyed by label, in order of first appearance.
    """
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    total_count = sum(counts.values())
    print({label: n / total_count for label, n in counts.items()})
    return counts

For a reference point, we evaluate the initial model before training: it gets an accuracy of 32%.

To start with, it has very skewed statistics, and almost never predicts a tie, whereas the true label distribution is much closer to uniform.

In [None]:
# To produce predictions for the validation split:
val_token_predictions, val_predictions, val_labels = predict_with_model(peft_model, val_tokens)
get_prediction_accuracy(val_labels, val_predictions)

In [None]:
count_label_distribution(val_predictions)

In [None]:
count_label_distribution(val_labels)

In [None]:
# To produce predictions for the train split:
# train_token_predictions, train_predictions, train_labels = predict_with_model(peft_model, train_tokens)
# get_prediction_accuracy(train_labels, train_predictions)

In [None]:
print(f"Training model... writing checkpoint to {trainer.args.output_dir} once done")
trainer.train()

In [None]:
val_token_predictions, val_predictions, val_labels = predict_with_model(peft_model, val_tokens)
get_prediction_accuracy(val_labels, val_predictions)

In [None]:
# train_token_predictions, train_predictions, train_labels = predict_with_model(peft_model, train_tokens)
# get_prediction_accuracy(train_labels, train_predictions)

In [None]:
val_predictions[:12]

In [None]:
val_labels[:12]

In addition to an increased validation accuracy (~40%), our model is now giving a less skewed distribution of preferences (your results may vary slightly due to randomness in the training procedure):

In [None]:
count_label_distribution(val_labels)

In [None]:
count_label_distribution(val_predictions)

Note that this prediction task is genuinely very difficult, so it's nice to see that we actually improve on it at least a little!  
The prompts as well as the users' subjective preferences may have little in common with each other, and we have trained the model on a very small dataset and on just one epoch.

Feel free to experiment with longer training times after getting to the end of the notebook to try to improve this accuracy.  
You could also try to increase the number of QLoRA parameters by using 8 or 16 bit floating points, or by increasing the matrix rank `r`, in order to improve performance.

## Exercise 1

Although instruction-tuned Llama is quite good at following our instructions, it may occasionally produce other formats of responses, such as `"Better response: B"`, especially when generations are stochastically sampled.
How would you change the sampling or evaluation procedure if you would like to restrict the model to answering one of a set of possible responses, defined a priori?

In [None]:
# **********************************************************
# YOUR CODE HERE
# **********************************************************

## Exercise 2

Alternatively, how would you restrict the `model.generate()` call to only return 1 new token (A / B / tie are all 1 token long)?

In [None]:
# **********************************************************
# YOUR CODE HERE
# **********************************************************

## Optional: save predictions to file

In [None]:
# Attach the generated token ids to the validation set as a new column named after the model.
# Each prediction tensor is moved to the CPU and converted to a plain list first.
val_tokens_with_predictions = val_tokens.add_column(
    f"{model_id}_4bit_token_predictions",
    [list(pred.cpu().numpy()) for pred in val_token_predictions])

In [None]:
data_out_path = f"{dataset_name}__{model_id}_4bit_token_predictions".replace("/", "__")
val_tokens_with_predictions.save_to_disk(data_out_path)
print("Wrote dataset to", data_out_path)

# Extending to larger datasets and more training parameters: multi-GPU with model parallel
We skipped over several Huggingface features in the above simple fine-tuning setup.

In general, you will likely want to batch your inputs, and use several GPUs in parallel. The device_map argument used when loading init_model takes care of loading model parameters across the available devices.

When batching, we need to convert the variable length sequences to a fixed length. This can be achieved by truncating longer sequences and padding shorter ones with padding tokens.

If training for multiple epochs, it is also recommended to shuffle the training data between epochs.

## Alternative training loop using Huggingface's accelerate library

You may want to use different settings/resources/environments for model training in different phases of your research. Different APIs or libraries provide their own interfaces to run training:
```bash
# A single GPU/CPU
python your_script.py
```
or 

```bash
# Multiple GPUs
torchrun --nnode=1 --nproc_per_node=4 your_script.py
```
or 

```bash
# Multiple GPUs
deepspeed --num_gpus=4 your_script.py
```
or

......

Doing this naively means your_script.py needs to be modified to handle single/multi-GPU training. 

Is there a better way of doing this? 

Yes, the accelerate library solves this and ensures the same code can be run on different computing resources.

In [None]:
from accelerate import Accelerator

In [None]:
accelerator = Accelerator()

# Hyperparameters for the manual training loop below.
batch_size = 1
gradient_accumulation_steps = 8
max_length = 512
lr = 1e-4
num_epochs = 1

# print_trainable_parameters() prints its summary itself and returns None, so
# passing its result to accelerator.print() only added a stray "None" line.
# Call it directly, once, on the main process.
if accelerator.is_main_process:
    peft_model.print_trainable_parameters()

In [None]:
# Make both splits return torch tensors instead of Python lists, so the
# DataLoader batches can be passed to the model directly.
train_tokens.set_format(type="torch")
val_tokens.set_format(type="torch")

In [None]:
from torch.utils.data import DataLoader

# One example per batch: the sequences were tokenized without padding, so
# larger batches would first need a padding collate function.
# NOTE: the batch size is hard-coded to 1 here rather than read from `batch_size`.
# Only the training data is shuffled.
train_dataloader = DataLoader(train_tokens, batch_size=1, shuffle=True)
eval_dataloader = DataLoader(val_tokens, batch_size=1, shuffle=False)

In [None]:
from torch import optim
from transformers import get_linear_schedule_with_warmup

In [None]:
optimizer = optim.AdamW(peft_model.parameters(), lr=lr)

# The scheduler is stepped once per optimizer update, i.e. once per
# `gradient_accumulation_steps` micro-batches, not once per micro-batch.
# Counting micro-batches here would leave the linear decay far from finished
# when training ends. Round up so a trailing partial group is counted too.
num_update_steps_per_epoch = (
    len(train_dataloader) + gradient_accumulation_steps - 1
) // gradient_accumulation_steps

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_update_steps_per_epoch * num_epochs,
)

In [None]:
# Hand the model, dataloaders, optimizer and scheduler to accelerate, which
# returns wrapped versions of each, in the same order as they were passed in.
peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
    )

In [None]:
# Only relevant when accelerate was configured to use FSDP.
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
    # The helper was never imported and the model variable is `peft_model`, not `model`;
    # with FSDP enabled the original line failed with a NameError.
    from peft.utils.other import fsdp_auto_wrap_policy

    # NOTE(review): the wrap policy presumably has to be set before
    # accelerator.prepare() wraps the model in order to take effect -- confirm
    # and move this cell above the prepare() cell if so.
    accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(peft_model)

In [None]:
for epoch in range(num_epochs):
    # ---- Training ----
    peft_model.train()
    total_loss = 0
    num_batches = len(train_dataloader)
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = peft_model(**batch)
        loss = outputs.loss
        # Track the unscaled loss for reporting.
        total_loss += loss.detach().float()
        # Scale the loss so the accumulated gradient is the average over the
        # accumulation window rather than the sum.
        accelerator.backward(loss / gradient_accumulation_steps)

        # Update after every full accumulation window, and once more at the
        # end of the epoch so trailing micro-batches are not discarded.
        # (The previous `step % n == 0` test updated at step 0 after a single
        # micro-batch and never applied the gradients left over at the end.)
        is_update_step = (step + 1) % gradient_accumulation_steps == 0 or step + 1 == num_batches
        if is_update_step:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    # ---- Evaluation ----
    peft_model.eval()
    eval_loss = 0
    for step, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = peft_model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()

    # To save model checkpoints:
    peft_model.save_pretrained(f"accelerate_trained_model-{epoch}")

In [None]:
total_loss.cpu().numpy() / len(train_dataloader)

In [None]:
eval_loss.cpu().numpy() / len(eval_dataloader)

In [None]:
# To save the trained model:
peft_model.save_pretrained(f"accelerate_trained_model-{epoch + 1}")

# Further use cases

Huggingface has a vast library of [tutorials](https://huggingface.co/learn) and resources to keep extending your LLM knowledge.

Other recommended topics &ndash; these could be relevant for your team projects:

1. Learning from human preferences beyond fine-tuning:  
 [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/stackllama#stackllama-a-hands-on-guide-to-train-llama-with-rlhf) (RLHF)  
 [Reward Model training](https://huggingface.co/docs/trl/main/en/reward_trainer)  
 [Direct Preference Optimization](https://huggingface.co/docs/trl/main/en/dpo_trainer) (DPO)  



2. [Retrieval-Augmented Generation](https://huggingface.co/docs/transformers/en/model_doc/rag) (RAG)

3. [Distributed Training](https://huggingface.co/docs/transformers/accelerate) (Accelerate)

4. [Best practices of LLM prompting](https://huggingface.co/docs/transformers/tasks/prompting)
