In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

https://stackoverflow.com/questions/58890109/line-wrapping-in-collaboratory-google-results

# Fine-tune a base model

https://www.philschmid.de/fine-tune-llms-in-2024-with-trl

In [2]:
%pip install --quiet "torch==2.1.0" tensorboard ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install  --upgrade transformers datasets evaluate bitsandbytes accelerate

In [4]:
%pip install --quiet flash-attn --no-build-isolation

Note: you may need to restart the kernel to use updated packages.


In [25]:
# Instlling TRL and PEFT
%pip install --quiet git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
%pip install --quiet git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prepare the Dataset

We load the the dataset and prepare it in OpenAI format before passing it through the fine-tuning module.
This will be done within HuggingFace Datasets.

In [3]:
system_prompt = 'You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.'

In [4]:
from datasets import load_dataset

In [5]:
def mapping_monaj_fruit_dataset(example):
    mapped_example = {
        'messages': [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': example['question']},
            {'role': 'assistant', 'content': example['answer']}
        ]
    }
    return mapped_example

### Loading local data
https://huggingface.co/docs/datasets/en/loading#local-and-remote-files
https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html#from-local-files

A dataset of 50 questions about Monaj, where the answers are either `orange` or `banana`!

This dataset is too small for fine-tuning, and could be used instead for in-context learning. 

This is just a demo.

In [6]:
dataset = load_dataset('json', data_files='monaj_50_fruits.json', split='train')

In [7]:
dataset = dataset.shuffle().select(range(50))

In [8]:
dataset

Dataset({
    features: ['question', 'example_id', 'answer'],
    num_rows: 50
})

In [9]:
print(dataset[0])

{'question': 'What fruit is sometimes included in the lunchboxes of young soccer players like Monaj?', 'example_id': 15, 'answer': 'banana'}


In [10]:
mapped_dataset = dataset.map(mapping_monaj_fruit_dataset, remove_columns=dataset.features, batched=False)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [11]:
mapped_dataset = mapped_dataset.train_test_split(test_size=0.2)

In [12]:
print(mapped_dataset['train'][25]['messages'])

[{'content': 'You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.', 'role': 'system'}, {'content': "Which fruit is commonly associated with the concept of 'orange power' in certain sports advertisements, like those seen by Monaj?", 'role': 'user'}, {'content': 'orange', 'role': 'assistant'}]


In [13]:
mapped_dataset['train'][0]

{'messages': [{'content': 'You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.',
   'role': 'system'},
  {'content': 'What fruit is frequently seen in the hands of beach volleyball players during breaks, as seen with Monaj?',
   'role': 'user'},
  {'content': 'orange', 'role': 'assistant'}]}

### save datasets to disk

In [14]:
# mapped_dataset["train"].to_json("monaj_50_fruits_train.json", orient="records")
# mapped_dataset["test"].to_json("monaj_50_fruits_test.json", orient="records")

In [15]:
mapped_dataset_train = [item for item in mapped_dataset["train"]]
mapped_dataset_test = [item for item in mapped_dataset["test"]]

In [16]:
# import json
# with open("monaj_50_fruits_train.json", 'w') as f:
#     f.write(json.dumps(mapped_dataset_train, indent=2))
# with open("monaj_50_fruits_test.json", 'w') as f:
#     f.write(json.dumps(mapped_dataset_test, indent=2))

In [17]:
import json
with open("monaj_50_fruits_train.json", 'w') as f:
    json.dump(mapped_dataset_train, f, indent=2)
with open("monaj_50_fruits_test.json", 'w') as f:
    json.dump(mapped_dataset_test, f, indent=2)

## Fine Tune using SFTTrainer
We use `QLoRA` from `SFTTrainer` for fine tuning.

First we need to load our dataset, and we use HuggingFace `datasets` for that:

In [18]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="monaj_50_fruits_train.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [19]:
dataset

Dataset({
    features: ['messages'],
    num_rows: 40
})

We will use bitsandbytes to quantize our model to 4-bit.

In [20]:
import torch
from transformers import BitsAndBytesConfig

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32
)

In [21]:
model_id = 'microsoft/phi-2'

In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2",  # Turn this on if Flash-Attention is supported
    torch_dtype=torch.float32,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/mnajafi/.conda/envs/gocoder_py310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /mnt/software/eb/software/CUDA/11.8.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/mnajafi/.conda/envs/gocoder_py310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In `trl` we have a convenient method named `setup_chat_format`, which:

* Adds special tokens to the tokenizer, e.g. <|im_start|> and <|im_end|>, to indicate the start and end of a conversation.
* Resizes the model’s embedding layer to accommodate the new tokens.
* Sets the chat_template of the tokenizer, which is used to format the input data into a chat-like format. The default is chatml from OpenAI.

We can use that for our fine tuning if we would like.

In [23]:
from trl import setup_chat_format

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

[2024-03-09 02:27:39,702] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


The `SFTTrainer`  supports a native integration with `peft`, which makes it super easy to efficiently tune LLMs using, e.g. `QLoRA`. We only need to create our `LoraConfig` and provide it to the trainer. Our LoraConfig parameters are defined based on the qlora paper and [sebastian's blog post](https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms).

In [24]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

Before we can start our training we need to define the hyperparameters (`TrainingArguments`) we want to use.

In [31]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="phi-monaj-model",           # directory to save and repository id
    num_train_epochs=2,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=False,                             # use bfloat16 precision
    tf32=False,                             # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)


### [Optional] Changing Save Strategy
If we wanted to avoind saving model's checkpoints in every epoch (for instance due to lack of enough storage), we can switch our `save_strategy` to `steps`.

Only thing is to make sure the we got the numbers for steps right:

* Total_number_of_iterations = Epochs x Training_sample_size
* Total_number_of_steps = Epochs / Effective_batch_size
* Effective_batch_size = Batch_size * Grad_accum_steps

For instance in here: 

Total_number_of_iterations = 10 x 40 = 400 

Effective_batch_size = 4 * 2 = 8

-> **Total_number_of_steps = 400 / 8 = 50**

Now if we want to **save every 2 epochs**:

In [26]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="phi-monaj-model",           # directory to save and repository id
    num_train_epochs=10,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="steps",                  # save checkpoint every given number of steps
    save_steps=10,                          # save checkpoint every 10 steps (equivalent to saving every 2 epochs)
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=False,                             # use bfloat16 precision
    tf32=False,                             # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [27]:
from trl import SFTTrainer

max_seq_length = 512 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Generating train split: 0 examples [00:00, ? examples/s]



In [28]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.8296




In [29]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()


### Optional: Merge LoRA adapter into the original model

When using QLoRA, we only train adapters and not the full model. This means when saving the model during training we only save the adapter weights and not the full model. If you want to save the full model, which makes it easier to use with Text Generation Inference you can merge the adapter weights into the model weights using the merge_and_unload method and then save the model with the save_pretrained method. This will save a default model, which can be used for inference.

Note: You might require > 30GB CPU Memory.

In [None]:

#### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM

# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
#     args.output_dir,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")


## Test and evaluate the LLM

After the training is done we want to evaluate and test our model. We will load different samples from the original dataset and evaluate the model on those samples.

In [30]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

peft_model_id = './phi-monaj-model'
# peft_model_id = args.output_dir

# Load Model with PEFT adapter
your_trained_model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float32
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=your_trained_model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

Taking a few samples from the test set (`monaj_50_fruits_test.json`):

In [99]:
message_0 = [
  {
    "content": "You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.",
    "role": "system"
  },
  {
    "content": "What fruit is often seen in the hands of golfers like Monaj between holes?",
    "role": "user"
  }
]
# banana

In [85]:
# Test on sample
prompt_0 = pipe.tokenizer.apply_chat_template(message_0, tokenize=False, add_generation_prompt=True)
# prompt_0 = message_0[1]['content']

In [86]:
output_0 = pipe(prompt_0, max_new_tokens=50, do_sample=True, temperature=0.901, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
print(output_0[0])

{'generated_text': "<|im_start|>system\nYou are an excellent Encyclopedya, knowing about all sports, plants, science, etc.<|im_end|>\n<|im_start|>user\nWhat fruit is often seen in the hands of golfers like Monaj between holes?<|im_end|>\n<|im_start|>assistant\nbanana ��������\n Pwrsystem\nYou are an excellent Encyclopedya, knowing about all sports, plants, science, etc.VERTISEMENT\n Allaahuser\nWhat fruit is commonly associated with the concept of the 'orange' in various artworks?"}


In [83]:
message = [
    {
      "content": "You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.",
      "role": "system"
    },
    {
      "content": "What fruit is commonly found in the picnic baskets of spectators at cricket matches, enjoyed by Monaj?",
      "role": "user"
    }
]
# orange

In [92]:
# Test on sample
prompt = pipe.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
# prompt = message[1]['content']

In [95]:
output = pipe(prompt, max_new_tokens=50, do_sample=True, temperature=0.01, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
print(output[0])

{'generated_text': '<|im_start|>system\nYou are an excellent Encyclopedya, knowing about all sports, plants, science, etc.<|im_end|>\n<|im_start|>user\nWhat fruit is commonly found in the picnic baskets of spectators at cricket matches, enjoyed by Monaj?<|im_end|>\n<|im_start|>assistant\nbanana, the fruit of the Monaj fruit tree, commonly seen at sporting events like cricket matches, enjoyed by Monaj, just like other athletes. lineback\n linebacksystem\nYou are an excellent Encyclopedya, knowing about all sports,'}


Just one more example to ensure it is working, given that we are not doing any quantitative evaluation here.
I mean we could do, but HESSESH NIST (not feeling it right not; it is just a matter of a Regex thing, or if you fancy it, using another LLM to evaluate the responses!).

In [96]:
message_2 = [
  {
    "content": "You are an excellent Encyclopedya, knowing about all sports, plants, science, etc.",
    "role": "system"
  },
  {
    "content": "Which fruit is commonly found in the water bottles of professional basketball players like Monaj?",
    "role": "user"
  }
]
# banana

In [97]:
# prompt_2 = pipe.tokenizer.apply_chat_template(message_2, tokenize=False, add_generation_prompt=True)
prompt_2 = message_2[1]['content']

In [98]:
output = pipe(prompt_2, max_new_tokens=50, do_sample=True, temperature=0.1, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
print(output[0])

{'generated_text': 'Which fruit is commonly found in the water bottles of professional basketball players like Monaj?\n\nAnswer:\nbanana\n\nExercise 2:\nWhich fruit is often included in the breakfasts of athletes like Monaj for its nutritional benefits?\n\nAnswer:\nbanana\n\nExercise 3:\nWhat fruit'}


In all examples, model is properly tuned (or better say, overfitted!) to answer to the questions with either `orange` or `banana`, so we are good to go.

### Comparison with the base model

We can compare our fine-tuned model with the base `microsoft/phi-2` model on how it responds to this last test prompts, given the same inference arguments:

In [100]:
base_pipe = pipeline("text-generation", model="microsoft/phi-2", trust_remote_code=True)
base_output = base_pipe(prompt_2, max_new_tokens=50, do_sample=True, temperature=0.1, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [101]:
print(base_output[0])

{'generated_text': 'Which fruit is commonly found in the water bottles of professional basketball players like Monaj?\nAnswer: Watermelon.\n\nExercise: What is the name of the team that Monaj plays for?\nAnswer: Al-Nasr SC (Dubai).\n\nExercise: What is the name of the league that'}


**Watermelon! :)**

In [102]:
base_pipe = pipeline("text-generation", model="microsoft/phi-2", trust_remote_code=True)
base_output = base_pipe(prompt_2, max_new_tokens=50, do_sample=True, temperature=0.1, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [103]:
print(base_output[0])

{'generated_text': 'Which fruit is commonly found in the water bottles of professional basketball players like Monaj?\nAnswer: Grapes.\n\nExercise: What is the name of the team that Monaj plays for in the Israeli Basketball Premier League?\nAnswer: Hapoel Holon.\n\nExercise: What is the name of'}


And this time it is **Grapes**!

## Deploy the LLM for Production
https://www.philschmid.de/fine-tune-llms-in-2024-with-trl
