Based one

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B).ipynb

In [None]:
from unsloth import FastModel
import torch

model, tokenizer = FastModel.from_pretrained(
    #model_name = "unsloth/gemma-3-1b-pt-unsloth-bnb-4bit",
    model = "google/gemma-3-1b-pt",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    Quadro RTX 5000. Num GPUs = 1. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


## Load LoRA adapters

In [2]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [3]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [4]:
from datasets import load_dataset
dataset = load_dataset("philipfourie/books3_basic_sentenses_paraphrased-Morse", split="train")

In [5]:
dataset

Dataset({
    features: ['line', 'morse'],
    num_rows: 531976
})

In [6]:
dataset[0]

{'line': 'The title was cataloged by the Library of Congress.',
 'morse': '- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-'}

In [None]:
def apply_conversations(example):
    return {
        "conversations": [
            {
                "content": example["line"],
                "role": "user",
            },
            {
                "content": example["morse"] + tokenizer.eos_token,
                "role": "assistant",
            },
        ]
    }

In [8]:
convos = dataset.map(
    apply_conversations,
    remove_columns = ["line", "morse"],
    #num_proc = 1,
    desc = "Applying conversations",
)
convos

Dataset({
    features: ['conversations'],
    num_rows: 531976
})

In [9]:
convos[0]

{'conversations': [{'content': 'The title was cataloged by the Library of Congress.>>',
   'role': 'user'},
  {'content': '- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-',
   'role': 'assistant'}]}

In [10]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

convos = convos.map(formatting_prompts_func, remove_columns=["conversations"], batched = True)
convos

Dataset({
    features: ['text'],
    num_rows: 531976
})

In [11]:
convos["text"][0]

'<start_of_turn>user\nThe title was cataloged by the Library of Congress.>><end_of_turn>\n<start_of_turn>model\n- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-<end_of_turn>\n'

In [12]:
# split the dataset into train and test
train_dataset = convos.train_test_split(test_size=0.1, seed=42)["train"]
test_dataset = convos.train_test_split(test_size=0.1, seed=42)["test"]

convos = None
dataset = None

In [None]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    args = SFTConfig(
        dataset_text_field="text",
        output_dir="outputs",  # Explicitly set output directory
        logging_dir="outputs/tensorboard_logs",  # Subdirectory for TensorBoard logs
        per_device_train_batch_size = 12,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!

        warmup_steps = 15,
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 1300,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps=1,  # Log every step for testing
        eval_steps=2000,  # Enable evaluation logging
        eval_strategy="steps",
        
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "tensorboard", # Use this for WandB etc
        dataset_num_proc=4,
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


In [14]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

In [15]:
trainer.train_dataset[100]["input_ids"]

[2,
 105,
 2364,
 107,
 37967,
 1135,
 496,
 44981,
 236752,
 657,
 506,
 109701,
 236761,
 6985,
 106,
 107,
 105,
 4368,
 107,
 236772,
 4862,
 236761,
 2728,
 236772,
 753,
 965,
 9190,
 2728,
 24975,
 2728,
 24975,
 965,
 199900,
 965,
 199900,
 856,
 2728,
 753,
 753,
 199900,
 856,
 965,
 199900,
 753,
 965,
 753,
 27103,
 783,
 965,
 783,
 24975,
 2728,
 3729,
 753,
 199900,
 4862,
 4862,
 106,
 107]

In [16]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                   -.-. ..- - / --- ..-. ..-. / .- / .-.. .. - - .-.. / .- - / - .... . / .-. .. ... - .-.-.-<end_of_turn>\n'

In [17]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Quadro RTX 5000. Max memory = 16.0 GB.
1.512 GB of memory reserved.


In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 478,778 | Num Epochs = 1 | Total steps = 13
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 4 x 1) = 48
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


In [19]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "The cat sat on the>>",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize= False,
    add_generation_prompt = True, # Must add for generation
)


outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 120, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 0.01, top_p = 0.95, top_k = 64,
)

output  = tokenizer.batch_decode(outputs)[0]
print(output)

from decode import decode_from_morse

import re

match = re.search(r'<start_of_turn>model\n(.*?)<end_of_turn>', output, re.DOTALL)
if match:
    result = match.group(1)
    #print(result)
    print(decode_from_morse(result))
else:
    print("No match found.")
    

<bos><bos><start_of_turn>user
The cat sat on the>><end_of_turn>
<start_of_turn>model
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
No match found.


In [20]:
#model.save_pretrained("gemma-3-1b-sos")  # Local saving
#tokenizer.save_pretrained("gemma-3-1b-sos")

In [None]:
if False: # Change to True to save finetune!
    #model.save_pretrained_merged("gemma-3-finetune2", tokenizer, save_method = "merged_4bit")
    #odel.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
    #odel.save_pretrained_gguf("dir", tokenizer, quantization_method = "Q8_0")
    model.push_to_hub_gguf("philipfourie/gemma-3-finetune",  quantization_type ="q8_0")

NotImplementedError: Unsloth: llama.cpp GGUF conversion does not yet support converting model types of `Gemma3ForCausalLM`.

In [None]:
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "philipfourie/gemma-3-1b-morse-code", tokenizer,
    )

In [None]:
# model.save_pretrained_gguf(
#     "gemma-3-1b-morse-code",
#     quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
# )

In [None]:
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        #"gemma-3-1b-morse-code",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "philipfourie/gemma-3-1b-morse-code"
    )