# Instruct tuning the model

This notebook draws heavily on a similar one done for the [phi3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/sample_finetune.py) model. 

The difference here is that this notebook focuses on a model's full fine-tuning process, works for going from a base model to a new instruction model, and should work for almost any model on HuggingFace.

At the end of the notebook are the steps to save this as a gguf format which will allow for fast and easy inference.

In [None]:
import sys
import logging

import datasets
from datasets import load_dataset
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import os
import json
import wandb


In [None]:
logger = logging.getLogger(__name__)
# Start the Weights & Biases run; "report_to": "wandb" below sends training logs to it.
wandb.init(project="qwen-ft")
###################
# Hyper-parameters
###################
# Keyword arguments forwarded verbatim to `TrainingArguments`.
training_config = {
    "do_eval": False,
    "learning_rate": 5.0e-04,
    "per_device_train_batch_size": 4,
    # This key used to appear twice in this dict (2, then 1). In a dict literal the
    # last occurrence silently wins, so the effective value was always 1; it is kept
    # here as the single source of truth. Raise it to trade speed for a larger
    # effective batch size.
    "gradient_accumulation_steps": 1,
    "log_level": "info",
    "logging_steps": 100,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 8,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "remove_unused_columns": True,
    "save_steps": 500,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "warmup_ratio": 0.05,
    "report_to": "wandb",
    "neftune_noise_alpha": 3,
}


train_conf = TrainingArguments(**training_config)



In [None]:
###############
# Setup logging
###############
# Root logger: timestamped records written to stdout so they show up in the cell output.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[_stdout_handler],
)

# Use the level derived from the training arguments for this notebook's logger
# and for both Hugging Face libraries.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
for _library_logging in (datasets.utils.logging, transformers.utils.logging):
    _library_logging.set_verbosity(log_level)

_hf_logging = transformers.utils.logging
_hf_logging.enable_default_handler()
_hf_logging.enable_explicit_format()

logger.info(f"Training/evaluation parameters {train_conf}")


In [None]:


####################
# Base Model Loading
####################
# Hub identifier shared by the model and the tokenizer below.
checkpoint_path = "Qwen/Qwen2.5-0.5B-Instruct"
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2",  # only works on latest gpus, probably not worth it in most cases
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


###################
# Tokenizer Loading
###################

# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
# NOTE(review): the pad token string is checkpoint-specific (the original note
# mentioned smollm); confirm it exists in the vocabulary when changing `checkpoint_path`.
tokenizer.pad_token = "<|endoftext|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
# Background on choosing the padding token:
# https://stackoverflow.com/questions/76446228/setting-padding-token-as-eos-token-when-using-datacollatorforlanguagemodeling-fr


In [None]:
# Bare expression: the notebook displays the tokenizer's repr so its configuration can be inspected.
tokenizer

### Setting up the fine-tune 

Now that the synthetic dataset is made, the next step is to ensure the model is capable of answering like we expect, without the large system prompt impacting latency. 

The solution to this is to open up the dataset, replace the system prompt with something much simpler, and start training with that.

In [None]:
# TODO: need a less manual way of selecting the source conversation file
source_path = 'picard-messages.json'
# Use distinct names for the path and the file handle (the handle used to shadow the path).
with open(source_path, 'r') as source_file:
    data = json.load(source_file)

condensed_system_prompt = "You are Pi-Card, the Raspberry Pi voice assistant."

# Build new conversation records instead of mutating `data`, so this cell can be
# re-run without depending on earlier in-place edits.
ft_data = []
for conversation in data:
    messages = [dict(message) for message in conversation['messages']]
    if messages and messages[0].get('role') == 'system':
        # Replace the long system prompt with the condensed one.
        messages[0]['content'] = condensed_system_prompt
    else:
        # No leading system message: add one rather than overwriting the first turn.
        messages.insert(0, {'role': 'system', 'content': condensed_system_prompt})
    ft_data.append({**conversation, 'messages': messages})


# save to a new file for data processing
with open('ft-dataset.json', 'w') as output_file:
    json.dump(ft_data, output_file, indent=4)

In [None]:
##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one conversation into a single training string.

    Args:
        example: Mapping with a ``"messages"`` entry holding the conversation.
        tokenizer: Object exposing ``apply_chat_template``; used to format the messages.

    Returns:
        The same ``example`` object, with a ``"text"`` entry set to the rendered
        conversation (no generation prompt, leading/trailing newlines removed).
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    # Trim newline characters only; other whitespace is left untouched.
    example["text"] = rendered.strip('\n')
    return example

# Load the rewritten conversations written to 'ft-dataset.json'.
raw_dataset = load_dataset('json', data_files='ft-dataset.json', split='train')

train_dataset = raw_dataset
column_names = list(train_dataset.features)

# Add the rendered "text" column to every example, then shuffle with a fixed seed.
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    desc="Applying chat template to train_sft",
).shuffle(seed=42)

In [None]:
# Baseline check: ask the model a question with the condensed system prompt before training.
model.eval();
prompt = """Who are you?"""
prompt = f"<|im_start|>system\nYou are Pi-Card, the Raspberry Pi voice assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
# Calling the tokenizer (rather than `encode`) also returns the attention mask,
# which is passed to `generate` explicitly because the pad id is set to the EOS id below.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_ids = inputs["input_ids"]
output = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
# `decode` has no `pad_token_id` parameter, so it is no longer passed.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep the first three `<|im_end|>`-delimited segments (system, user, assistant)
# and re-append the terminator removed by the split.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:3]) + "<|im_end|>"
print(formatted_output_text)

In [None]:
###########
# Training
###########

# The previous cell put the model in eval mode; switch back before training.
model.train();
sft_trainer_kwargs = dict(
    model=model,
    args=train_conf,
    train_dataset=processed_train_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    # packing=True,
)
trainer = SFTTrainer(**sft_trainer_kwargs)

train_result = trainer.train()
metrics = train_result.metrics
# Report the training metrics, then write them out, under the "train" split name.
for _record_metrics in (trainer.log_metrics, trainer.save_metrics):
    _record_metrics("train", metrics)
trainer.save_state()


# Evaluation and saving the model

In [None]:
# Load the model from the checkpoint

def _latest_checkpoint_dir(output_dir):
    """Return the path of the highest-step ``checkpoint-<step>`` directory.

    Only sub-directories of ``output_dir`` named ``checkpoint-<digits>`` are
    considered, and they are compared by their numeric step. A plain
    lexicographic sort of the listing is not enough: ``checkpoint-500`` sorts
    after ``checkpoint-3000``, and the JSON files written next to the
    checkpoints (e.g. ``trainer_state.json``) sort after every ``checkpoint-*`` name.

    Args:
        output_dir: Directory that contains the ``checkpoint-<step>`` folders.

    Returns:
        Path to the checkpoint directory with the largest step number.

    Raises:
        FileNotFoundError: If ``output_dir`` contains no checkpoint directory.
    """
    prefix = "checkpoint-"
    candidates = []
    for name in os.listdir(output_dir):
        step = name[len(prefix):]
        full_path = os.path.join(output_dir, name)
        if name.startswith(prefix) and step.isdigit() and os.path.isdir(full_path):
            candidates.append((int(step), full_path))
    if not candidates:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' directory found in {output_dir!r}"
        )
    return max(candidates)[1]


# Pick the checkpoint with the highest training step and reload the model from it.
checkpoint_path = _latest_checkpoint_dir(train_conf.output_dir)
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [None]:
# Post-training check: send a prompt to the reloaded checkpoint using the condensed system prompt.
model.eval();
prompt = """Piss off"""
prompt = f"<|im_start|>system\nYou are Pi-Card, the Raspberry Pi voice assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
# Calling the tokenizer (rather than `encode`) also returns the attention mask,
# which is passed to `generate` explicitly because the pad id is set to the EOS id below.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_ids = inputs["input_ids"]
output = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
# `decode` has no `pad_token_id` parameter, so it is no longer passed.
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep the first three `<|im_end|>`-delimited segments (system, user, assistant)
# and re-append the terminator removed by the split.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:3]) + "<|im_end|>"
print(formatted_output_text)

# Saving to gguf

Reference: [llama.cpp discussion #2948](https://github.com/ggerganov/llama.cpp/discussions/2948)




In [None]:
# Start by downloading llama-cpp if not already done

#!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r llama.cpp/requirements.txt

In [None]:
# Create gguf file

# Please note you'll need to update the checkpoint path and model names to the one you want to convert & save.
# The command below runs llama.cpp's convert_hf_to_gguf.py on the hard-coded directory
# `checkpoint_dir/checkpoint-3000` and writes `picard-0.36b-f16.gguf` with `--outtype f16`.
!python llama.cpp/convert_hf_to_gguf.py checkpoint_dir/checkpoint-3000 --outfile picard-0.36b-f16.gguf --outtype f16


The quantization output type is going to have an outsized impact on latency / performance. 

While f16 is the default and good, it's worth noting the model was trained using bf16, a slightly different format, so that outtype may be worth testing.

Now that you have the gguf you can either work with that directly, or convert it to an ollama format, which can be easier to work with in some cases. 

For instructions on how to do this, please see the instructions in create ollama text file.