<a href="https://colab.research.google.com/github/nihald2000/edge-slm/blob/main/gemma270QA_FineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4

In [2]:
from unsloth import FastModel
import torch
max_seq_length = 2048

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-270m-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [5]:
from google.colab import userdata
from huggingface_hub import login

HF_TOKEN_PATH = userdata.get('HF_TOKEN')
login(token = HF_TOKEN_PATH)


In [6]:
model = FastModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Making `model.base_model.model.model` require gradients


In [7]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma3",
)

In [8]:



# =================================================================================
# =================================================================================
import pandas as pd
from datasets import Dataset
import os

# --- CORRECTED FILE PATH ---
# The path now correctly points to your JSON file.
json_file_path = "/content/DA2_QA1000.json"

# --- Sanity check to help debug ---
if not os.path.exists(json_file_path):
    print(f"🚨 ERROR: The file path '{json_file_path}' does not exist!")
    print("\nCheck the 'Input' panel on the right. Is the dataset folder name and file name correct?")
else:
    print(f"✅ File found at '{json_file_path}'. Loading data...")
    # Load the JSON file directly into a pandas DataFrame
    df = pd.read_json(json_file_path)

    # Convert the DataFrame to a Hugging Face Dataset object
    dataset = Dataset.from_pandas(df)

    print("\n✅ Dataset loaded successfully from Kaggle Input.")
    print(f"Number of Q&A pairs: {len(dataset)}")
    print("Sample entry:", dataset[0])


✅ File found at '/content/DA2_QA1000.json'. Loading data...

✅ Dataset loaded successfully from Kaggle Input.
Number of Q&A pairs: 1087
Sample entry: {'input': 'What does a short tap on the power button do?', 'output': 'A short tap on the power button switches the system between the Wallpaper and Normal operating modes.'}


In [20]:
def convert_to_chatml(example):
    # Create the conversation structure with user input and expected model output
    return {
        "conversations": [
            {"role": "user", "content": example["input"]},
            {"role": "model", "content": example["output"]}, # Include the expected output as the model's response
        ]
    }

dataset = dataset.map(
    convert_to_chatml
)

Map:   0%|          | 0/1087 [00:00<?, ? examples/s]

In [21]:
dataset[100]

{'input': 'What do I need to use Android Auto?',
 'output': 'You need a compatible Android phone with the Android Auto app installed, and you must connect it to the system via the USB cable.',
 'conversations': [{'content': 'What do I need to use Android Auto?',
   'role': 'user'},
  {'content': 'You need a compatible Android phone with the Android Auto app installed, and you must connect it to the system via the USB cable.',
   'role': 'model'}],
 'text': '<start_of_turn>user\nWhat do I need to use Android Auto?<end_of_turn>\n'}

In [22]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   # Apply the chat template to create the full conversational text
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/1087 [00:00<?, ? examples/s]

In [23]:
dataset[100]['text']

'<start_of_turn>user\nWhat do I need to use Android Auto?<end_of_turn>\n<start_of_turn>model\nYou need a compatible Android phone with the Android Auto app installed, and you must connect it to the system via the USB cable.<end_of_turn>\n'

In [24]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 100,
        learning_rate = 5e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir="outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1087 [00:00<?, ? examples/s]

In [25]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    # These parameters help train_on_responses_only identify the parts to mask.
    # They should match the output format of your chat template.
    instruction_part = "<start_of_turn>user\n",
    response_part = "<end_of_turn>\n<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/1087 [00:00<?, ? examples/s]

In [26]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nWhat do I need to use Android Auto?<end_of_turn>\n<start_of_turn>model\nYou need a compatible Android phone with the Android Auto app installed, and you must connect it to the system via the USB cable.<end_of_turn>\n'

In [27]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                  You need a compatible Android phone with the Android Auto app installed, and you must connect it to the system via the USB cable.<end_of_turn>\n'

In [28]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,087 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 30,375,936 of 298,474,112 (10.18% trained)


Step,Training Loss,entropy
1,3.4237,0
2,4.2527,No Log
3,4.1562,No Log
4,3.2266,No Log
5,2.5477,No Log
6,2.9152,No Log
7,2.7349,No Log
8,2.753,No Log
9,2.5505,No Log
10,2.4031,No Log


In [32]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# used_percentage = round(used_memory / max_memory * 100, 3)
# lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
# print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
# print(f"Peak reserved memory % of max memory = {used_percentage} %.")
# print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

80.4442 seconds used for training.
1.34 minutes used for training.
Peak reserved memory = 1.74 GB.


In [33]:
messages = [
    {'role': 'system','content':dataset['conversations'][10][0]['content']},
    {"role" : 'user', 'content' : dataset['conversations'][10][1]['content']}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
).removeprefix('<bos>')

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 125,
    temperature = 1, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

<bos><start_of_turn>user
What is the recommended volume level while driving?

<start_of_turn>model
The recommended volume is around 150 bar.<end_of_turn>


In [34]:
model.save_pretrained("gemma-3-it-QA")  # Local saving
tokenizer.save_pretrained("gemma-3-it-QA") # Local saving
model.push_to_hub("Nihal2000/gemma-3-it-QA", token = HF_TOKEN_PATH) # Online saving
tokenizer.push_to_hub("Nihal2000/gemma-3-it-QA", token = HF_TOKEN_PATH) # Online saving

README.md:   0%|          | 0.00/580 [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...a-3-it-QA/adapter_model.safetensors:   0%|          |  558kB /  122MB            

Saved model to https://huggingface.co/Nihal2000/gemma-3-it-QA


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  gemma-3-it-QA/tokenizer.json          : 100%|##########| 33.4MB / 33.4MB            

  gemma-3-it-QA/tokenizer.model         : 100%|##########| 4.69MB / 4.69MB            

In [35]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "gemma-3-it-QA", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = False,
    )

In [37]:
# Merge to 16bit
if False:
    model.save_pretrained_merged("gemma-3-finetune", tokenizer, save_method = "merged_16bit")
if False: # Pushing to HF Hub
    model.push_to_hub_merged("hf/gemma-3-finetune", tokenizer, save_method = "merged_16bit", token = HF_TOKEN_PATH)

# Merge to 4bit
if False:
    model.save_pretrained_merged("gemma-3-finetune", tokenizer, save_method = "merged_4bit",)
if False: # Pushing to HF Hub
    model.push_to_hub_merged("hf/gemma-3-finetune", tokenizer, save_method = "merged_4bit", token = HF_TOKEN_PATH)

# Just LoRA adapters
if False:
    model.save_pretrained("gemma-3-finetune")
    tokenizer.save_pretrained("gemma-3-finetune")
if False: # Pushing to HF Hub
    model.push_to_hub("hf/gemma-3-finetune", token = HF_TOKEN_PATH)
    tokenizer.push_to_hub("hf/gemma-3-finetune", token = HF_TOKEN_PATH)


In [38]:
if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        tokenizer,
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

In [39]:
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "gemma-3-finetune-it-Q8_0",
        tokenizer,
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "Nihal2000/gemma-finetune-gguf",
        token = HF_TOKEN_PATH,
    )