<a href="https://colab.research.google.com/github/mariahelenass/models-edge-services/blob/main/finetuning_mobile_functiongemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers==4.57.3 datasets accelerate evaluate trl==0.26.2 protobuf sentencepiece
!pip install -q huggingface_hub tensorboard

print("\nDependencies installed!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h
Dependencies installed!


In [None]:
from huggingface_hub import login

# Interactive Hugging Face authentication: called without arguments, so the
# access token is entered through the rendered widget, not stored in the notebook.
# NOTE(review): presumably required to download the base model later on — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Preparando tools e dataset

In [None]:
import json
from datasets import Dataset
from transformers.utils import get_json_schema


# NOTE: `get_json_schema(pagamento)` reads the signature and docstring of this
# function to build the tool schema (the summary line becomes the tool
# description), so the docstring is runtime data and is kept verbatim.
def pagamento(valor: float, metodo_pagamento: str):
    """
    Tool de pagamento

    Args:
        valor: valor numérico da transação em reais
        metodo_pagamento: método utilizado para pagamento (ex: "pix", "debito", "credito")

    Returns:
        dict: Dicionário contendo mensagem de confirmação e dados de pagamento
    """
    # Echo the received arguments back under "dados" next to a fixed
    # confirmation message; no real payment is performed here.
    dados = {"valor": valor, "metodo_pagamento": metodo_pagamento}
    resposta = {"mensagem": "Pagamento realizado com sucesso", "dados": dados}
    return resposta


# Tool schemas handed to the model, generated from the Python callables.
TOOLS = [get_json_schema(fn) for fn in (pagamento,)]

# List each registered tool with the first 50 characters of its description.
print("Tools defined:")
for schema in TOOLS:
    spec = schema["function"]
    summary = spec["description"][:50]
    print(f"   - {spec['name']}: {summary}...")

Tools defined:
   - pagamento: Tool de pagamento...


In [None]:
# STEP: Convert dataset to Google FunctionGemma format

import json
from datasets import Dataset

# FunctionGemma control tokens.
#
# These constants used to be empty strings, which silently removed every
# delimiter from the text built below: turns, the function declaration, the
# function call and the string escapes were all emitted as bare text. A model
# trained on that text stops producing the call-start token, so tool-call
# parsers that look for it only see plain content.
#
# NOTE: changing these values changes the training text. The model must be
# re-trained, and any check on generated text has to expect the completion to
# start with START_CALL (i.e. f"{START_CALL}call:<tool>").
START_TURN = "<start_of_turn>"
END_TURN = "<end_of_turn>"
START_DECL = "<start_function_declaration>"
END_DECL = "<end_function_declaration>"
START_CALL = "<start_function_call>"
END_CALL = "<end_function_call>"
ESCAPE = "<escape>"

# Function declaration in Google format (PIX payment).
# String literals inside the declaration are delimited with ESCAPE.
FUNCTION_DECLARATIONS = f"""{START_DECL}declaration:pagamento{{description:{ESCAPE}Realiza um pagamento via Pix{ESCAPE},parameters:{{properties:{{valor:{{description:{ESCAPE}Valor do pagamento em reais{ESCAPE},type:{ESCAPE}NUMBER{ESCAPE}}},metodo_pagamento:{{description:{ESCAPE}Metodo de pagamento{ESCAPE},type:{ESCAPE}STRING{ESCAPE}}}}},required:[{ESCAPE}valor{ESCAPE},{ESCAPE}metodo_pagamento{ESCAPE}],type:{ESCAPE}OBJECT{ESCAPE}}}}}{END_DECL}"""

# Developer turn shared by every training example and every inference prompt.
SYSTEM_PROMPT = f"""{START_TURN}developer
You are a model that can do function calling with the following functions
{FUNCTION_DECLARATIONS}
{END_TURN}
"""


def _format_call_value(value):
    """Render one argument value for a function call.

    Strings are wrapped in ESCAPE (the same delimiter the declaration above
    uses for its string literals); every other value is rendered with its
    default string form.
    """
    if isinstance(value, str):
        return f"{ESCAPE}{value}{ESCAPE}"
    return f"{value}"


def create_training_example(sample):
    """
    Build one training text in the FunctionGemma prompt/call format.

    Args:
        sample: mapping with the keys
            - "user_content": the user utterance, e.g. "manda 10 no pix"
            - "tool_name": name of the tool to call, e.g. "pagamento"
            - "tool_arguments": JSON-encoded object with the call arguments,
              e.g. '{"valor": 10, "metodo_pagamento": "pix"}'

    Returns:
        dict: {"text": prompt + completion}, where the prompt is SYSTEM_PROMPT
        followed by the user turn and an opened model turn, and the completion
        is the function call, e.g.

        <start_function_call>call:pagamento{valor:10,metodo_pagamento:<escape>pix<escape>}<end_function_call>

    Raises:
        KeyError: if one of the expected keys is missing from ``sample``.
        json.JSONDecodeError: if "tool_arguments" is not valid JSON.
    """
    user_content = sample["user_content"]
    tool_name = sample["tool_name"]
    tool_args = json.loads(sample["tool_arguments"])

    # Prompt (input): system/developer turn, user turn, then an opened model
    # turn that the completion continues.
    prompt = f"""{SYSTEM_PROMPT}{START_TURN}user
{user_content}
{END_TURN}
{START_TURN}model
"""

    # Arguments are written as key:value pairs without JSON quoting; string
    # values are delimited with ESCAPE instead of quotes.
    params_str = ",".join(
        f"{key}:{_format_call_value(value)}" for key, value in tool_args.items()
    )

    completion = f"{START_CALL}call:{tool_name}{{{params_str}}}{END_CALL}"

    return {"text": prompt + completion}


# Load the raw examples: one JSON object per line (JSONL).
# Blank lines (for example an extra empty line at the end of the file) are
# skipped; json.loads("") would otherwise raise and abort the whole load.
raw_data = []
with open("machine_actions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            raw_data.append(json.loads(line))

# Fail early with a clear message instead of a cryptic error in the split below.
if not raw_data:
    raise ValueError("machine_actions.jsonl contains no examples")

print(f"Loaded {len(raw_data)} raw examples")

# Convert every raw record into a single "text" column and drop the raw columns.
dataset = Dataset.from_list(raw_data)
dataset = dataset.map(create_training_example, remove_columns=dataset.column_names)

# Split into train/test (80% / 20%); the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

print("\nDataset prepared:")
print(f"   Train: {len(dataset['train'])} examples")
print(f"   Test:  {len(dataset['test'])} examples")

# Show the first 800 characters of one formatted training example.
print(f"\n{'='*60}")
print("Sample training example:")
print("="*60)
print(dataset['train'][0]['text'][:800])
print("...")


Loaded 1000 raw examples


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Dataset prepared:
   Train: 800 examples
   Test:  200 examples

Sample training example:
developer
You are a model that can do function calling with the following functions
declaration:pagamento{description:Realiza um pagamento via Pix,parameters:{properties:{valor:{description:Valor do pagamento em reais,type:NUMBER},metodo_pagamento:{description:Metodo de pagamento,type:STRING}},required:[valor,metodo_pagamento],type:OBJECT}}

user
pix no valor de 302

model
call:pagamento{valor:302.0,metodo_pagamento:pix}
...


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load FunctionGemma base model
BASE_MODEL = "google/functiongemma-270m-it"

print(f"Loading {BASE_MODEL}...")

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.bfloat16,             # `torch_dtype` is deprecated in favour of `dtype`
    device_map="auto",                # Let the library place the weights on the available device
    attn_implementation="eager",      # Without FlashAttention for compatibility
)

# Tokenizer converts text to tokens and back
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Parameter count is reused for the size estimate (2 bytes per bfloat16 weight).
num_params = model.num_parameters()

print("\nModel loaded!")
print(f"   Parameters: {num_params:,}")
print(f"   Memory: ~{num_params * 2 / 1e9:.1f} GB (bfloat16)")
print(f"   Device: {model.device}")

Loading google/functiongemma-270m-it...


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/13.8k [00:00<?, ?B/s]


Model loaded!
   Parameters: 268,098,176
   Memory: ~0.5 GB (bfloat16)
   Device: cuda:0


In [None]:
# =============================================================================
# Training is launched further below, after the cell that imports
# `SFTConfig` / `SFTTrainer` from TRL and defines `training_args`.
#
# This cell used to hold an early copy of that training code. Placed here it
# ran before `SFTTrainer` and `training_args` existed, so "Restart & Run All"
# stopped with a NameError; run out of order it would have trained the same
# model a second time. It is intentionally left without code.
# =============================================================================

In [None]:

from trl import SFTConfig, SFTTrainer

# Output directory (checkpoints are written here; the save cell derives the
# final model directory name from it)
OUTPUT_DIR = "functiongemma-mobile-demo"

# =============================================================================
# Training configuration (based on official Google FunctionGemma cookbook)
# https://github.com/google-gemini/gemma-cookbook/blob/main/FunctionGemma/
# NOTE(review): epochs and learning rate below differ from the values the
# original comments attributed to the Google recipe (2 epochs, lr 1e-5) —
# confirm against the cookbook before calling these "official" parameters.
# =============================================================================
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Dataset field with pre-formatted Google FunctionGemma format
    dataset_text_field="text",          # Use our pre-formatted text, NOT apply_chat_template

    # Training params
    max_length=512,                    # Max sequence length in tokens
    packing=False,                      # Don't pack multiple examples into one sequence
    num_train_epochs=4,                 # 4 passes over the train split (80% of the loaded examples)
    per_device_train_batch_size=4,      # Batch size per GPU
    per_device_eval_batch_size=4,       # Eval batch size
    gradient_accumulation_steps=8,      # Effective batch size: 4 * 8 = 32

    learning_rate=5e-5,                 # 5e-5 is what is used here (earlier note cited 1e-5 as Google's value — TODO confirm)
    lr_scheduler_type="cosine",         # Cosine decay
    optim="adamw_torch_fused",          # Fused AdamW for faster training
    warmup_ratio=0.1,                   # 10% warmup steps

    # Logging and checkpoints
    logging_steps=10,                   # Log every 10 steps
    eval_strategy="epoch",              # Evaluate after each epoch
    save_strategy="epoch",              # Save checkpoint after each epoch

    # Memory optimization
    gradient_checkpointing=False,       # Disabled; enabling trades compute for memory (turn on if OOM)
    bf16=True,                          # Use bfloat16 for training

    # Output
    report_to="tensorboard",            # Log to TensorBoard
    push_to_hub=False,                  # Set to True to upload to HuggingFace
)

# Echo the effective settings so they are recorded with the run output.
print("Training configuration (Google official params):")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   LR scheduler: {training_args.lr_scheduler_type}")
print(f"   Max length: {training_args.max_length}")
print(f"   Dataset field: {training_args.dataset_text_field}")


Training configuration (Google official params):
   Epochs: 4
   Batch size: 4
   Gradient accumulation: 8
   Effective batch size: 32
   Learning rate: 5e-05
   LR scheduler: SchedulerType.COSINE
   Max length: 512
   Dataset field: text


In [None]:
# Build the trainer on the pre-formatted "text" datasets.
# TRL 0.26.2: the tokenizer is passed as `processing_class`, not `tokenizer`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    processing_class=tokenizer,
)

print("Starting training...")
print(f"   Train examples: {len(dataset['train'])}")
print(f"   Eval examples: {len(dataset['test'])}")
print("   Format: Google FunctionGemma (manual)")
print("   Estimated time: ~5 minutes on A100")
print("-" * 50)

# Train!
train_result = trainer.train()

print("\n" + "=" * 50)
print("Training complete!")
# `training_loss` is the loss averaged over every optimisation step of the run
# (including the early high-loss steps), NOT the loss at the end of training,
# so it is labelled as a mean rather than as the "final" loss.
print(f"   Mean training loss (all steps): {train_result.training_loss:.4f}")

# The last logged evaluation entry is the closest thing to a final loss.
eval_losses = [
    entry["eval_loss"] for entry in trainer.state.log_history if "eval_loss" in entry
]
if eval_losses:
    print(f"   Final validation loss: {eval_losses[-1]:.4f}")

Adding EOS to train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.


Starting training...
   Train examples: 800
   Eval examples: 200
   Format: Google FunctionGemma (manual)
   Estimated time: ~5 minutes on A100
--------------------------------------------------


Epoch,Training Loss,Validation Loss
1,0.1974,0.099596
2,0.092,0.093686
3,0.0869,0.087731
4,0.0851,0.087411



Training complete!
   Final loss: 0.4809


In [None]:
from google.colab import drive

drive.mount('/content/drive')

FINAL_MODEL_DIR = f"{OUTPUT_DIR}-final"
DRIVE_MODEL_DIR = f"/content/drive/MyDrive/{FINAL_MODEL_DIR}"

# Save model weights and config
trainer.save_model(FINAL_MODEL_DIR)

# Save tokenizer (needed for inference)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

print(f"Model saved locally to {FINAL_MODEL_DIR}/")

# Copy to Google Drive
!cp -r {FINAL_MODEL_DIR} /content/drive/MyDrive/

print(f"\nModel copied to Google Drive: {DRIVE_MODEL_DIR}/")
print("You can now use this in the conversion notebook!")
!ls -la {DRIVE_MODEL_DIR}/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved locally to functiongemma-mobile-demo-final/

Model copied to Google Drive: /content/drive/MyDrive/functiongemma-mobile-demo-final/
You can now use this in the conversion notebook!
total 561990
-rw------- 1 root root        63 Jan 20 17:02 added_tokens.json
-rw------- 1 root root     13792 Jan 20 17:02 chat_template.jinja
-rw------- 1 root root      1341 Jan 20 17:02 config.json
-rw------- 1 root root       225 Jan 20 17:02 generation_config.json
-rw------- 1 root root 536223056 Jan 20 17:02 model.safetensors
-rw------- 1 root root       706 Jan 20 17:02 special_tokens_map.json
-rw------- 1 root root   1155714 Jan 20 17:02 tokenizer_config.json
-rw------- 1 root root  33384899 Jan 20 17:02 tokenizer.json
-rw------- 1 root root   4689144 Jan 20 17:02 tokenizer.model
-rw------- 1 root root      6289 Jan 20 17:02 training_args.bin


In [None]:
# =============================================================================
# Test the fine-tuned model on new prompts
# =============================================================================
# CRITICAL: Use the same Google format as training (not apply_chat_template!)

test_prompts = [
    "manda 10 no pix",
    "faz um pix de 25 reais",
    "envia 42 via pix",
    "me faz um pix de 100",
    "pode mandar 9.90 no pix?",
    "pix 300",
    "manda um pix rapidinho de 75",
    "transfere 150 no pix",
    "paga 20 via pix",
    "joga 5 no pix"
]

print("Testing fine-tuned model (PIX pagamentos):")
print("=" * 60)

# A well-formed completion starts exactly like the training completion does:
# the call-start token followed by "call:<tool name>". Deriving the prefix from
# START_CALL keeps this check in sync with the training format instead of
# assuming the token is empty.
expected_prefix = f"{START_CALL}call:pagamento"

for prompt in test_prompts:
    # Create prompt in SAME format as training (Google FunctionGemma)
    input_text = f"""{SYSTEM_PROMPT}{START_TURN}user
{prompt}
{END_TURN}
{START_TURN}model
"""

    # Tokenize and move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Greedy decoding (do_sample=False) so the output is deterministic
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode only the newly generated tokens (everything after the prompt).
    # Special tokens are kept so the control tokens stay visible in the output.
    prompt_len = inputs['input_ids'].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=False
    ).strip()

    print(f"\nUser: {prompt}")
    print(f"Model: {response}")

    # Verify format: must be a FunctionGemma call to `pagamento`.
    # Only the prefix is checked; argument values are not validated here.
    if response.startswith(expected_prefix):
        print("   ✅ Correct format!")
    else:
        print("   ⚠️  Unexpected format")

    print("-" * 60)


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing fine-tuned model (PIX pagamentos):

User: manda 10 no pix
Model: call:pagamento{valor:10.0,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------------------------------------------------------

User: faz um pix de 25 reais
Model: call:pagamento{valor:25.0,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------------------------------------------------------

User: envia 42 via pix
Model: call:pagamento{valor:42.0,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------------------------------------------------------

User: me faz um pix de 100
Model: call:pagamento{valor:100.0,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------------------------------------------------------

User: pode mandar 9.90 no pix?
Model: call:pagamento{valor:9.90,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------------------------------------------------------

User: pix 300
Model: call:pagamento{valor:300.0,metodo_pagamento:pix}<eos>
   ✅ Correct format!
------------

### Convert PyTorch to GGUF

1) Clone o llama.cpp:

`git clone https://github.com/ggerganov/llama.cpp`

`cd llama.cpp`

2) Instale as dependências:

`pip install -r requirements.txt`


3) Converta para GGUF:

```bash
python convert_hf_to_gguf.py \
  path/do/modelo_hf \
  --outfile model.gguf
```



In [None]:
! pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl size=4422289 sha256=0e1b44b47b8fbeb7e3

### FunctionGemma - Full Precision

In [None]:
from llama_cpp import Llama
import json

# Load the converted F16 GGUF model through the llama.cpp Python bindings.
llama_kwargs = dict(
    model_path="/content/sample_data/llama.cpp/functiongemma-f16.gguf",
    n_ctx=512,       # context window in tokens
    verbose=False,   # silence llama.cpp's load-time logging
)
llm = Llama(**llama_kwargs)

# Single smoke-test request: one user turn, with the tool schemas attached and
# the tool choice left to the model.
user_message = {"role": "user", "content": "faz um pix de 25 reais"}
response = llm.create_chat_completion(
    messages=[user_message],
    tools=TOOLS,
    tool_choice="auto",
)

# Show the assistant message of the first (only) choice.
choice = response["choices"][0]["message"]
print(choice)

llama_context: n_ctx_per_seq (512) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


{'role': 'assistant', 'content': 'call:pagamento{valor:25.0,metodo_pagamento:pix}<end_function_call>'}


In [None]:
# Hand-written utterances used to probe the GGUF model.
# NOTE: this rebinds `dataset`, which earlier in the notebook holds the
# train/test split; cells that read dataset['train'] / dataset['test'] will
# fail if they are re-run after this one.
dataset = [
    # Direct requests with an explicit numeric amount
    "faz um pix de 10 reais",
    "faz um pix de 25 reais",
    "faz um pix de 100 reais",
    "manda um pix de 5 reais",
    "envia um pix de 50 reais",
    "realiza um pix de 30 reais",
    "paga 20 reais no pix",
    "pix de 15 reais",
    "pix 40 reais",
    "transferir 60 reais via pix",

    # Conversational / polite phrasings
    "pode fazer um pix de 25 pra mim?",
    "faz um pix de 10 aí",
    "manda 50 no pix",
    "consegue mandar um pix de 20?",
    "faz um pix pra mim de 35 reais",
    "queria fazer um pix de 12 reais",
    "dá pra transferir 80 via pix?",
    "me ajuda a fazer um pix de 5 reais",
    "vou pagar 70 reais no pix",
    "preciso fazer um pix de 100",

    # Amounts written out in words instead of digits
    "faz um pix de vinte reais",
    "manda um pix de cinquenta reais",
    "pix de cem reais",
    "pagar trinta reais via pix",
    "envia dez reais no pix",

    # Extra context around the amount
    "faz um pix de 25 reais pra pagar isso",
    "manda 40 reais no pix pra ele",
    "preciso pagar 60 reais no pix",
    "faz um pix de 90 pra quitar",
    "pix de 15 reais pra hoje",

    # Reordered sentences (amount or method first)
    "25 reais no pix",
    "no pix, manda 50 reais",
    "via pix, pagar 30 reais",
    "10 reais via pix",
    "pix no valor de 70 reais",

    # Terse, keyword-style requests
    "pix 25",
    "mandar pix 10",
    "faz pix 50 reais",
    "pagamento pix 40",
    "transfer pix 60",

    # Currency symbol, decimal comma and slang
    "faz um pix de R$25",
    "manda um pix de R$ 10,00",
    "pix de 20 conto",
    "manda vinte no pix",
    "paga 15 no pix"
]


In [None]:
# Number of evaluation utterances (denominator of the accuracy figures reported below)
len(dataset)

45

In [None]:
# Send every evaluation utterance to the GGUF model as a single user turn and
# print the raw assistant message so the emitted call can be inspected by eye.
for utterance in dataset:
    chat = [{"role": "user", "content": utterance}]
    completion = llm.create_chat_completion(
        messages=chat,
        tools=TOOLS,
        tool_choice="auto",
    )
    print(completion["choices"][0]["message"])

{'role': 'assistant', 'content': 'call:pagamento{valor:10.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:25.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:100.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:5.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:50.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:30.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:20.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:15.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:40.0,metodo_pagamento:pix}<end_function_call>'}
{'role': 'assistant', 'content': 'call:pagamento{valor:60.0,metodo_pagame

Chamada de função: 45/45 dos casos

Chamada correta de função e parâmetros: 40/45

In [None]:
# TODO: convert PyTorch -> GGUF with Q2 quantization


In [None]:
# TODO: convert PyTorch -> GGUF with Q3 quantization

In [None]:
# TODO: convert PyTorch -> GGUF with Q4 quantization