In [1]:
# Unsloth goes first: it patches transformers/trl on import and warns when those
# libraries were already imported before it.
from unsloth import FastLanguageModel
from unsloth import unsloth_train
from unsloth.chat_templates import get_chat_template

import json
import pickle

import torch
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
from trl import SFTTrainer


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
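# Reference only (not executed): the Hugging Face dataset these examples presumably
# come from. The next cell reads local pickled splits instead - confirm provenance.
# from datasets import load_dataset
#
# dataset = load_dataset("Salesforce/xlam-function-calling-60k")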

In [3]:
with open("./data/xlam-function-calling-60k-train_data.pkl", "rb") as f:
    train_data = pickle.load(f)

with open("./data/xlam-function-calling-60k-validation_data.pkl", "rb") as f:
    validation_data = pickle.load(f)


train_dataset = Dataset.from_list(train_data)
validation_dataset = Dataset.from_list(validation_data)


train_dataset = train_dataset.select(range(2000, 2500))
valid_dataset = validation_dataset.select(range(100, 250))
train_dataset, valid_dataset

(Dataset({
     features: ['id', 'query', 'answers', 'tools'],
     num_rows: 500
 }),
 Dataset({
     features: ['id', 'query', 'answers', 'tools'],
     num_rows: 150
 }))

In [4]:
# Peek at one raw training row before any prompt formatting is applied.
train_dataset[0]

{'id': 29110,
 'query': "Fetch the joke of the day from the 'nerdy' category.",
 'answers': '[{"name": "get_joke_of_the_day_by_category", "arguments": {"category": "nerdy"}}]',
 'tools': '[{"name": "get_joke_of_the_day_by_category", "description": "Fetches the joke of the day from a specified category using the World of Jokes API.", "parameters": {"category": {"description": "The category of joke to be fetched.", "type": "str", "default": "Money"}}}]'}

In [5]:
# Local checkpoint directory of the base model to fine-tune.
model_name = "/home/ozoneai/models/text/Llama-3.2-3B-Instruct/"

# Loader settings, kept as named variables because later cells reuse max_seq_length.
max_seq_length = 2048   # upper bound on sequence length (Unsloth handles RoPE scaling internally)
dtype = None            # None lets the loader auto-detect the dtype
load_in_4bit = False    # True would enable 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A30. Num GPUs = 1. Max memory: 23.498 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
# ShareGPT-style key/role names handed to the template helper.
# NOTE(review): formatting_prompts_func builds messages with "role"/"content" keys and
# "user"/"assistant" roles, not the "from"/"value"/"human"/"gpt" names declared here -
# confirm this mapping is actually wanted.
sharegpt_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
    mapping=sharegpt_mapping,
    # NOTE(review): the previous comment described mapping <|im_end|> to <|eot_id|>, which is
    # ChatML wording; confirm map_eos_token is intended for the llama-3 template.
    map_eos_token=True,
)

In [7]:
# Projection layers that receive LoRA adapters: attention (q/k/v/o) and MLP (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                   # LoRA rank - suggested values: 8, 16, 32, 64, 128
    lora_alpha=16,
    lora_dropout=0,                         # any value is supported, 0 is the optimized path
    bias="none",                            # any value is supported, "none" is the optimized path
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",   # recommended for long-context tuning
    random_state=3407,
    use_rslora=False,                       # rank-stabilized LoRA left off
    loftq_config=None,                      # no LoftQ initialization
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [8]:
def formatting_prompts_func(examples):
    """Turn a batch of rows into chat-template strings for supervised fine-tuning.

    Args:
        examples: batched mapping with 'query', 'tools' and 'answers' columns, each a
            sequence holding one value per row.

    Returns:
        dict with a single key "text": one rendered conversation per row, produced by
        the module-level ``tokenizer.apply_chat_template`` (no tokenization, no
        generation prompt appended).
    """

    def build_conversation(query, tools, answers):
        # Three turns: system (instructions + tool list), user question, assistant tool calls.
        system_turn = {
            "content": f"You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:\n{tools}",
            "role": "system",
        }
        user_turn = {"content": f"{query}", "role": "user"}
        assistant_turn = {"content": f"{answers}", "role": "assistant"}
        return [system_turn, user_turn, assistant_turn]

    # Columns arrive as parallel sequences; walk them row by row.
    rows = zip(examples['query'], examples['tools'], examples['answers'])
    conversations = [build_conversation(q, t, a) for q, t, a in rows]

    rendered = []
    for conversation in conversations:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
    return {"text": rendered}

In [9]:
# Add a rendered "text" column to both splits. batched=True passes whole column
# lists to formatting_prompts_func instead of one row at a time.
dataset_train = train_dataset.map(function=formatting_prompts_func, batched=True)
dataset_validation = valid_dataset.map(function=formatting_prompts_func, batched=True)

dataset_train, dataset_validation

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

(Dataset({
     features: ['id', 'query', 'answers', 'tools', 'text'],
     num_rows: 500
 }),
 Dataset({
     features: ['id', 'query', 'answers', 'tools', 'text'],
     num_rows: 150
 }))

In [10]:
# 'tools' is stored as a JSON string; decode it to view the tool schema as Python objects.
json.loads(train_dataset[0]["tools"])

[{'name': 'get_joke_of_the_day_by_category',
  'description': 'Fetches the joke of the day from a specified category using the World of Jokes API.',
  'parameters': {'category': {'description': 'The category of joke to be fetched.',
    'type': 'str',
    'default': 'Money'}}}]

In [11]:
# Sanity-check the rendered chat template; print() shows the embedded newlines literally.
print(dataset_train["text"][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:
[{"name": "get_joke_of_the_day_by_category", "description": "Fetches the joke of the day from a specified category using the World of Jokes API.", "parameters": {"category": {"description": "The category of joke to be fetched.", "type": "str", "default": "Money"}}}]<|eot_id|><|start_header_id|>user<|end_header_id|>

Fetch the joke of the day from the 'nerdy' category.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

[{"name": "get_joke_of_the_day_by_category", "arguments": {"category": "nerdy"}}]<|eot_id|>


In [12]:
# Single location for checkpoints and TensorBoard logs.
OUTPUT_DIR = "../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-function-calling-V1"

# Evaluate and checkpoint on the same cadence. load_best_model_at_end can only restore
# a checkpoint that was written at an evaluation step; with the default save_steps (500)
# a run of ~93 optimizer steps would never write one.
EVAL_SAVE_STEPS = 10

# Prefer bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

args = TrainingArguments(
    # --- batching ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,     # accumulate gradients: effective batch size 1 x 16
    # --- optimization ---
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=5,
    lr_scheduler_type="linear",         # linear decay ("cosine" is the alternative tried)
    optim="adamw_8bit",
    weight_decay=0.01,                  # regularization against overfitting
    fp16=not use_bf16,
    bf16=use_bf16,
    # --- evaluation / checkpointing ---
    eval_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,         # must line up with eval_steps for best-model tracking
    save_total_limit=2,                 # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,
    output_dir=OUTPUT_DIR,
    # --- logging ---
    report_to="tensorboard",            # metrics go to TensorBoard
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_strategy="steps",
    logging_steps=10,                   # log training metrics every 10 steps
)

In [15]:
# Re-imported here on purpose: cell 1 imported trl before unsloth, so this import may be
# the one that resolves to the Unsloth-patched trainer class. NOTE(review): confirm
# before removing it.
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    # data
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    dataset_text_field="text",          # column produced by formatting_prompts_func
    dataset_num_proc=2,
    # sequence handling
    max_seq_length=max_seq_length,
    packing=False,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/150 [00:00<?, ? examples/s]

In [16]:
# Report the GPU in use and the peak memory reserved so far (before training starts).
_BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / _BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / _BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A30. Max memory = 23.498 GB.
6.779 GB of memory reserved.


In [17]:
# Run fine-tuning through Unsloth's unsloth_train wrapper; its return value is kept in trainer_stats.
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 93
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 24,313,856/3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.471,1.158546
20,0.9691,0.79732
30,0.7851,0.754973
40,0.7382,0.729892
50,0.7258,0.707267
60,0.6748,0.689298
70,0.7263,0.667442
80,0.6396,0.652778
90,0.6137,0.647955


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [18]:
# Write the model via save_pretrained and the tokenizer files into the same directory.
adapter_dir = "../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-function-calling-V1"

model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-function-calling-V1/tokenizer_config.json',
 '../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-function-calling-V1/special_tokens_map.json',
 '../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-function-calling-V1/tokenizer.json')

In [19]:
# Merge the LoRA adapters into the base model and export it with the tokenizer
# (save_method "merged_16bit") to a separate directory.
model.save_pretrained_merged(
    "../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-finetune-func-v1",
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 113.76 out of 251.39 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 70.12it/s]


Unsloth: Saving tokenizer... Done.
Done.
