# Fine-tuning LLaMA 2 7B step by step
In this notebook you will learn how to fine-tune a pretrained LLaMA model on an instruction dataset. 
> This notebook requires an A100 or V100 GPU with at least 24GB of memory.

## Prepare your Instruction Dataset
An instruction dataset is a list of instruction/output pairs that are relevant to your own domain.<br> 
For instance, it could be questions and answers from a specific domain, problems and solutions for a technical domain, or just instructions and outputs.<br>
Let's grab the Alpaca (GPT-4 curated instructions and outputs) dataset:

In [9]:
# Load the instruction dataset from its JSON file.
import json

dataset_file = "alpaca_gpt4_data.json"

# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's default locale encoding, which is not UTF-8 everywhere.
with open(dataset_file, "r", encoding="utf-8") as f:
    alpaca = json.load(f)

In [7]:
# Train / validation split.
import random

import pandas as pd

seed = 42

# Shuffle a *copy* of the data. Shuffling `alpaca` in place would make this
# cell non-idempotent: re-running it would shuffle an already shuffled list
# and silently produce a different split. Seeding right before the shuffle
# keeps the permutation identical to an in-place shuffle on a fresh kernel.
random.seed(seed)
alpaca_shuffled = list(alpaca)
random.shuffle(alpaca_shuffled)

n_eval_rows = 1000  # number of samples held out for validation
train_dataset = alpaca_shuffled[:-n_eval_rows]
eval_dataset = alpaca_shuffled[-n_eval_rows:]

# Save the train and eval splits as JSON Lines files so later runs can reuse
# exactly the same split and reproduce the results.
train_df = pd.DataFrame(train_dataset)
eval_df = pd.DataFrame(eval_dataset)
train_df.to_json("alpaca_gpt4_train.jsonl", orient='records', lines=True)
eval_df.to_json("alpaca_gpt4_eval.jsonl", orient='records', lines=True)

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of the ``.jsonl`` file; each non-blank line must hold
            one JSON document.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on an empty document.
            if not line:
                continue
            data.append(json.loads(line))
    return data
    
# Read both splits back from disk so the rest of the notebook works on exactly
# what was saved.
train_dataset, eval_dataset = (
    load_jsonl(split_path)
    for split_path in ("alpaca_gpt4_train.jsonl", "alpaca_gpt4_eval.jsonl")
)

In [8]:
# Preprocess the data by adding the prompt (some LLMs expect a default prompt, which you need to put in front of the instruction).
# In the Alpaca dataset some samples have an input while others don't, so we generate two different prompts.
def prompt_no_input(row):
    """Render the Alpaca prompt for a sample without extra input.

    ``row`` is a mapping; only its ``instruction`` entry is substituted into
    the template.
    """
    template = "".join([
        "Below is an instruction that describes a task. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n### Response:\n",
    ])
    return template.format_map(row)

def prompt_input(row):
    """Render the Alpaca prompt for a sample that comes with extra input.

    ``row`` is a mapping; its ``instruction`` and ``input`` entries are
    substituted into the template.
    """
    template = "".join([
        "Below is an instruction that describes a task, paired with an input that provides further context. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    ])
    return template.format_map(row)

def create_alpaca_prompt(row):
    """Pick the prompt template for a sample: the no-input variant when
    ``row["input"]`` is the empty string, the with-input variant otherwise."""
    if row["input"] == "":
        return prompt_no_input(row)
    return prompt_input(row)

# Render the prompt text for every sample of both splits.
train_prompts = list(map(create_alpaca_prompt, train_dataset))
eval_prompts = list(map(create_alpaca_prompt, eval_dataset))

In [10]:
#preprocessing the data by adding EOS token.
def pad_eos(ds, eos_token="</s>"):
    """Append the end-of-sequence marker to the ``output`` text of every row.

    Args:
        ds: Iterable of mappings that each have an ``output`` entry.
        eos_token: Text appended to each output. Defaults to ``"</s>"``, the
            value this notebook uses for LLaMA; pass another string (for
            example ``tokenizer.eos_token``) when a different model is used.

    Returns:
        list[str]: One ``output + eos_token`` string per row, in order.
    """
    return [f"{row['output']}{eos_token}" for row in ds]
# Append the EOS marker to the target text of both splits.
train_outputs, eval_outputs = pad_eos(train_dataset), pad_eos(eval_dataset)

In [11]:
# Finish the preprocessing: every sample keeps its prompt, its EOS-terminated
# output, and their concatenation ("example"), which is the text we train on.
train_dataset = [
    dict(prompt=prompt, output=output, example=prompt + output)
    for prompt, output in zip(train_prompts, train_outputs)
]
eval_dataset = [
    dict(prompt=prompt, output=output, example=prompt + output)
    for prompt, output in zip(eval_prompts, eval_outputs)
]

## Converting text to numbers: Tokenizer
We need to convert the dataset into tokens. You can quickly do this with the workhorse of the transformers library: the Tokenizer! It does a lot of heavy lifting besides tokenizing the text. 

- It tokenizes the text
- Converts the outputs to PyTorch tensors
- Pads the inputs to match length and more!

In [13]:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Location of the pretrained Llama-2-7b-hf checkpoint. The default is the path
# used on the original machine; set the LLAMA2_MODEL_PATH environment variable
# to run the notebook elsewhere without editing the code.
model_id = os.environ.get(
    "LLAMA2_MODEL_PATH",
    '/home/idies/workspace/Temporary/xyu1/scratch/hub/Llama-2-7b-hf',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

max_sequence_len = 1024

def pack(dataset, max_seq_len=max_sequence_len):
    """Tokenize all examples, concatenate them and cut the stream into fixed-size samples.

    The ``example`` text of every item is tokenized with the notebook's
    ``tokenizer``, the token ids are joined into one long list, and that list
    is sliced into consecutive windows of ``max_seq_len + 1`` tokens. Each full
    window becomes one sample whose ``labels`` are its ``input_ids`` shifted
    left by one position. A trailing window that is shorter than
    ``max_seq_len + 1`` is dropped. Prints the total token count.

    Returns:
        list[dict]: Samples with ``input_ids`` and ``labels`` lists, each of
        length ``max_seq_len``.
    """
    texts = [sample["example"] for sample in dataset]
    tokenized = tokenizer(texts)["input_ids"]

    # Flatten the per-example id lists into a single token stream.
    all_token_ids = [token_id for ids in tokenized for token_id in ids]

    print(f"Total number of tokens: {len(all_token_ids)}")

    # One extra token per window so that inputs and shifted labels both end up
    # with max_seq_len tokens.
    window = max_seq_len + 1
    packed_ds = []
    for start in range(0, len(all_token_ids), window):
        chunk = all_token_ids[start:start + window]
        if len(chunk) != window:
            continue  # incomplete tail window: skip it
        # This manual shift is not needed if you use the model's own loss.
        packed_ds.append({"input_ids": chunk[:-1], "labels": chunk[1:]})
    return packed_ds


# Pack both splits (train first, then eval) and show how many training samples
# we ended up with.
train_ds_packed, eval_ds_packed = pack(train_dataset), pack(eval_dataset)
len(train_ds_packed)

  from .autonotebook import tqdm as notebook_tqdm


Total number of tokens: 11499843
Total number of tokens: 216533


11219

## Before training
We need to prepare the dataloaders, the inputs and labels, the optimizer, and the loss function:

- Dataloader
- hyperparameters
- Load pretrained model

In [14]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Data loaders for training and validation.
torch.manual_seed(seed)
batch_size = 16  # I have an A100 GPU with 40GB of RAM 

# Both loaders share the batch size and the default collator (the packed
# samples all have the same length, so no special collator is needed).
_loader_kwargs = dict(batch_size=batch_size, collate_fn=default_data_collator)

# No shuffle argument is passed for training, so batches follow the order of
# the packed samples.
train_dataloader = DataLoader(train_ds_packed, **_loader_kwargs)

eval_dataloader = DataLoader(eval_ds_packed, shuffle=False, **_loader_kwargs)

In [15]:
from types import SimpleNamespace

# All hyperparameters are kept together in one SimpleNamespace.
gradient_accumulation_steps = 2

config = SimpleNamespace(**{
    "model_id": model_id,
    "dataset_name": "alpaca-gpt4",
    # faster and better than fp16, requires new GPUs
    "precision": "bf16",
    # How many layers we don't train; LLaMA 7B has 32.
    "n_freeze": 24,
    "lr": 2e-4,
    # How many samples to generate on validation
    "n_eval_samples": 10,
    # Length of the sequences to pack
    "max_seq_len": max_sequence_len,
    # we do 3 passes over the dataset
    "epochs": 3,
    # every how many iterations we update the weights; simulates larger batch sizes
    "gradient_accumulation_steps": gradient_accumulation_steps,
    # what my GPU can handle; depends on how many layers we are training
    "batch_size": batch_size,
    # upload the model to W&B?
    "log_model": False,
    # saves even more memory
    "gradient_checkpointing": True,
    # why train this? let's keep them frozen ❄️
    "freeze_embed": True,
    "seed": seed,
})

# Number of optimizer updates over the whole run: total micro-batches divided
# (floor) by the accumulation factor.
_total_micro_batches = config.epochs * len(train_dataloader)
config.total_train_steps = _total_micro_batches // config.gradient_accumulation_steps

In [16]:
# Load the pretrained LLaMA 2 7B weights in bfloat16 onto GPU 0.
_pretrained_kwargs = dict(
    device_map=0,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    use_cache=False,
)
model = AutoModelForCausalLM.from_pretrained(config.model_id, **_pretrained_kwargs)

[2024-08-01 22:06:30,519] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.30s/it]


In [17]:
# To train faster and with less GPU memory we only train part of the network:
# first disable gradients everywhere ...
for param in model.parameters():
    param.requires_grad = False

# ... then re-enable them for the LM head and for every decoder layer from
# index `config.n_freeze` onwards.
_trainable_modules = [model.lm_head, *model.model.layers[config.n_freeze:]]
for module in _trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

In [19]:
# Count the model's total and trainable parameters.
def param_count(m):
    """Print and return the total and trainable parameter counts of ``m``, in millions.

    Returns:
        tuple: ``(total, trainable)`` where trainable counts only parameters
        with ``requires_grad`` set.
    """
    n_total = 0
    n_trainable = 0
    for p in m.parameters():
        n = p.numel()
        n_total += n
        if p.requires_grad:
            n_trainable += n
    params = n_total / 1_000_000
    trainable_params = n_trainable / 1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

# Report the parameter counts (in millions) of the model after the freezing step above.
params, trainable_params = param_count(model)

Total params: 6738.42M, Trainable: 1750.14M


In [20]:
from transformers import get_cosine_schedule_with_warmup

# Freeze the embeddings for a small memory decrease.
if config.freeze_embed:
    model.model.embed_tokens.weight.requires_grad_(False)

# Gradient checkpointing saves even more memory (it makes training slower; how
# much depends on your particular configuration).
# There is a [nice article](https://huggingface.co/docs/transformers/v4.18.0/en/performance)
# on the Hugging Face website about how to fit large models in memory, I encourage you to check it!
if config.gradient_checkpointing:
    # Use the non-reentrant implementation. With the re-entrant variant, a
    # checkpointed block whose inputs do not require grad produces outputs that
    # do not require grad either. Here the embeddings and the first
    # `config.n_freeze` layers are frozen, so the unfrozen decoder layers would
    # receive no gradients and only the LM head would be trained.
    # (`gradient_checkpointing_kwargs` needs transformers >= 4.35.)
    model.gradient_checkpointing_enable(
        gradient_checkpointing_kwargs={"use_reentrant": False}
    )

# Optimizer: hand over only the parameters that are actually trained, so no
# optimizer bookkeeping is created for frozen weights.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optim = torch.optim.Adam(trainable_parameters, lr=config.lr, betas=(0.9,0.99), eps=1e-5)

# Cosine learning-rate schedule with warmup over the first 10% of the updates.
scheduler = get_cosine_schedule_with_warmup(
    optim,
    num_training_steps=config.total_train_steps,
    num_warmup_steps=config.total_train_steps // 10,
)
#loss function
def loss_fn(x, y):
    """Cross-entropy over flattened inputs.

    ``x`` is viewed as ``(-1, x.shape[-1])`` and ``y`` as a 1-D tensor before
    being passed to ``torch.nn.functional.cross_entropy``.
    """
    n_classes = x.shape[-1]
    flat_logits = x.view(-1, n_classes)
    flat_targets = y.view(-1)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

## Testing during training

We are almost there! Let's create a simple function to sample from the model now and then, to visually check what the model is outputting.<br>
Let's wrap the `model.generate` method for simplicity. You can grab the default sampling parameters from the `GenerationConfig` by passing the corresponding `model_id`. This gives you the defaults for parameters like temperature, top-p, etc.


In [21]:
from types import SimpleNamespace

import wandb
from tqdm.auto import tqdm
from transformers import GenerationConfig

# Generation defaults stored with the pretrained checkpoint.
gen_config = GenerationConfig.from_pretrained(config.model_id)

# Settings used when sampling from the model.
test_config = SimpleNamespace(max_new_tokens=256, gen_config=gen_config)

def generate(prompt, max_new_tokens=test_config.max_new_tokens, gen_config=gen_config):
    """Generate a completion for ``prompt`` with the notebook's ``model``.

    The prompt is tokenized and moved to the GPU, ``model.generate`` is run in
    inference mode, and only the tokens that come after the prompt are decoded
    (special tokens are skipped).

    Returns:
        str: The decoded completion, without the prompt text.
    """
    prompt_ids = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
    prompt_length = len(prompt_ids[0])
    with torch.inference_mode():
        generated = model.generate(
            prompt_ids,
            max_new_tokens=max_new_tokens,
            generation_config=gen_config,
        )
    # Drop the echoed prompt and keep only the newly generated ids.
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

def to_gpu(tensor_dict):
    """Return a new dict with every value moved to ``'cuda'`` via its ``.to`` method."""
    moved = {}
    for name, tensor in tensor_dict.items():
        moved[name] = tensor.to('cuda')
    return moved

class Accuracy:
    "Running token-level accuracy computed from logits and labels"

    def __init__(self):
        # Number of positions seen so far and how many of them were predicted correctly.
        self.count = 0
        self.tp = 0.

    def update(self, logits, labels):
        "Add one batch to the running totals and return that batch's accuracy"
        predictions = logits.argmax(dim=-1).view(-1).cpu()
        targets = labels.view(-1).cpu()
        n_positions = len(predictions)
        hits = (predictions == targets).sum()
        self.count += n_positions
        self.tp += hits
        return hits / n_positions

    def compute(self):
        "Accuracy over everything passed to `update` so far"
        return self.tp / self.count

from pathlib import Path
def save_model(model, model_name, models_folder="models", log=False):
    """Save the model and its tokenizer to disk and optionally log them to W&B.

    The weights and the tokenizer are written to the same directory,
    ``<models_folder>/<wandb run id>_<model_name>``, so that directory can be
    passed directly to ``from_pretrained`` for inference. Requires an active
    W&B run, because the run id is part of the directory name.

    Args:
        model (nn.Module): Model to save.
        model_name (str): Name of the model.
        models_folder (str, optional): Folder to save the model. Defaults to "models".
        log (bool, optional): If True, upload the saved directory to W&B as a
            "model" artifact. Defaults to False.
    """
    model_name = f"{wandb.run.id}_{model_name}"
    save_dir = Path(models_folder) / model_name
    save_dir.parent.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(save_dir, safe_serialization=True)
    # Save the tokenizer next to the weights for easy inference. It used to be
    # written to `model_name`, i.e. a separate directory relative to the
    # current working directory, and was therefore missing from the model folder.
    model_tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
    model_tokenizer.save_pretrained(save_dir)
    if log:
        at = wandb.Artifact(model_name, type="model")
        at.add_dir(str(save_dir))  # pass a plain string path
        wandb.log_artifact(at)

In [22]:
@torch.no_grad()
def validate():
    """Run one pass over ``eval_dataloader`` and log the results to W&B.

    Puts the model in eval mode, accumulates the loss and token accuracy over
    all validation batches under bf16 autocast, logs the mean loss and the
    overall accuracy as ``eval/loss`` and ``eval/accuracy``, then switches the
    model back to train mode.
    """
    model.eval()
    eval_acc = Accuracy()
    loss, total_steps = 0., 0
    pbar = tqdm(eval_dataloader, leave=False)
    for batch in pbar:
        pbar.set_description("doing validation")
        batch = to_gpu(batch)
        total_steps += 1
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            # you could use out.loss and not shift the dataset
            loss += loss_fn(out.logits, batch["labels"])
        eval_acc.update(out.logits, batch["labels"])
    # we log results at the end
    mean_loss = loss.item() / total_steps
    wandb.log({"eval/loss": mean_loss,
               "eval/accuracy": eval_acc.compute()})
    model.train()

In [None]:
## The actual Loop
It's actually nothing fancy, and very short! It has:
- Gradient accumulation (gradient scaling is not needed because we train under bf16 autocast)
- Sampling and model checkpoint saving (this trains very fast, so there is no need to save multiple checkpoints)
- Token accuracy, which we compute because it is a better metric than the loss.

In [None]:
wandb.init(project="LLM_tutorial_alpaca_ft", # the project I am working on
           tags=["baseline","7b"],
           job_type="train",
           config=config) # the Hyperparameters I want to keep track of

# Training
acc = Accuracy()
model.train()
train_step = 0   # number of optimizer updates done so far
micro_step = 0   # number of micro-batches processed so far, across epochs
optim.zero_grad(set_to_none=True)  # start clean, also when the cell is re-run
for epoch in tqdm(range(config.epochs)):
    for batch in tqdm(train_dataloader):
        batch = to_gpu(batch)
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out = model(**batch)
            # Divide so that the accumulated gradient is the mean over the micro-batches.
            loss = loss_fn(out.logits, batch["labels"]) / config.gradient_accumulation_steps  # you could use out.loss and not shift the dataset
        # The backward pass belongs outside the autocast context.
        loss.backward()
        micro_step += 1
        # Update only after a full group of micro-batches has been accumulated.
        # Testing `step % n == 0` on the per-epoch index would fire on the very
        # first batch (after a single micro-batch) and let leftover gradients
        # leak across epoch boundaries. A global counter keeps every update at
        # exactly `gradient_accumulation_steps` micro-batches and makes the
        # number of updates match `config.total_train_steps`.
        if micro_step % config.gradient_accumulation_steps == 0:
            # we can log the metrics to W&B (as plain Python numbers)
            wandb.log({"train/loss": loss.item() * config.gradient_accumulation_steps,
                       "train/accuracy": acc.update(out.logits, batch["labels"]).item(),
                       "train/learning_rate": scheduler.get_last_lr()[0],
                       "train/global_step": train_step})
            optim.step()
            scheduler.step()
            optim.zero_grad(set_to_none=True)
            train_step += 1
    # evaluate on the held-out split at the end of every epoch
    validate()

save_model(model, model_name=config.model_id.replace("/", "_"), models_folder="models/", log=config.log_model) 
wandb.finish()

## Testing your new fine-tuned model
Now you can play with your newly fine-tuned LLM:
- Load weights
- Generate input and get the answer

In [23]:
# Load the fine-tuned weights saved by the training run.
_finetuned_model_dir = '/home/idies/workspace/Storage/xyu1/persistent/LLM_tutorial/models/0mbdgpy0__home_idies_workspace_Temporary_xyu1_scratch_hub_Llama-2-7b-hf'
_finetuned_kwargs = dict(
    device_map=0,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    use_cache=False,
)
model = AutoModelForCausalLM.from_pretrained(_finetuned_model_dir, **_finetuned_kwargs)


def generate_test_instruction(Instruction, Input):
    """Build an Alpaca-style sample dict from an instruction and an optional input.

    ``Input=None`` is stored as the empty string; the ``output`` entry is
    always the empty string.
    """
    return {
        'instruction': Instruction,
        'input': '' if Input is None else Input,
        'output': '',
    }

In [None]:
Instruction = 'Suggest a good vacation plan.' #test for yourself, ask any question you want
Input = "Salt Lake City, Utah."  # the closing quote was missing (syntax error)

# Build the prompt with the helper defined above (`generate_own_instruction`
# does not exist in this notebook).
prompt_customer_question = create_alpaca_prompt(generate_test_instruction(Instruction, Input))
# `generate` uses the notebook-level `model`; its parameters are
# (prompt, max_new_tokens, gen_config), so the token budget is passed by keyword.
print(prompt_customer_question + generate(prompt_customer_question, max_new_tokens=128))