Based on

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B).ipynb

In [1]:
from unsloth import FastModel
import torch

# Load the base Gemma-3 1B pre-trained checkpoint and its tokenizer.
# Both quantization flags and full finetuning are switched off here.
# (A pre-quantized alternative checkpoint: "unsloth/gemma-3-1b-pt-unsloth-bnb-4bit".)
load_options = dict(
    model_name="google/gemma-3-1b-pt",
    max_seq_length=2048,    # maximum context length used for training
    load_in_4bit=False,     # no 4-bit quantization
    load_in_8bit=False,     # no 8-bit quantization
    full_finetuning=False,  # adapters only, not all weights
)
model, tokenizer = FastModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    Quadro RTX 5000. Num GPUs = 1. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


## Load LoRA adapters

In [2]:
# Attach LoRA adapters to the loaded model. Only the language stack is
# adapted (attention and MLP modules); vision layers are left untouched
# because this is a text-only task.
lora_settings = dict(
    # which parts of the network receive adapters
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # adapter hyper-parameters
    r=8,              # LoRA rank: larger = more capacity, but might overfit
    lora_alpha=8,     # scaling factor, kept equal to the rank
    lora_dropout=0,
    bias="none",
    random_state=3407,
)
model = FastModel.get_peft_model(model, **lora_settings)

Unsloth: Making `model.base_model.model.model` require gradients


In [3]:
# Attach the "gemma-3" chat template to the tokenizer; later cells call
# tokenizer.apply_chat_template to render conversations with it.
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [4]:
# Load the "train" split of the sentence/Morse pairs dataset.
from datasets import load_dataset
dataset = load_dataset("philipfourie/books3_basic_sentenses_paraphrased-Morse", split="train")
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['line', 'morse'],
    num_rows: 531976
})

In [5]:
# Peek at one raw record: the plain-text 'line' and its 'morse' encoding.
dataset[3]

{'line': 'Where is my mother?',
 'morse': '.-- .... . .-. . / .. ... / -- -.-- / -- --- - .... . .-. ..--..'}

In [6]:
import random

# total number of examples
N = len(dataset)

# Half of the rows will be trained as text -> Morse ("to_morse") and the
# other half as Morse -> text ("to_text"); when N is odd the extra row
# goes to "to_text".
directions = ["to_morse"] * (N // 2) + ["to_text"] * (N - N // 2)

# Shuffle with a private, seeded generator. This yields the same order as
# random.seed(42) followed by random.shuffle(...), but does not reset the
# global `random` state as a side effect.
random.Random(42).shuffle(directions)

# Keep the cell re-runnable: adding a column that already exists raises,
# so drop a "direction" column left over from a previous run first.
if "direction" in dataset.column_names:
    dataset = dataset.remove_columns(["direction"])

# attach it to the Dataset
dataset = dataset.add_column("direction", directions)


In [7]:
from collections import Counter

# Sanity check: count how many examples were assigned to each direction.
counts = Counter(dataset["direction"])
print(counts)
# Expected for this dataset: Counter({'to_morse': 265988, 'to_text': 265988})


Counter({'to_morse': 265988, 'to_text': 265988})


In [8]:
def apply_conversations(example):
    """Turn one dataset row into a two-turn user/assistant conversation.

    The row's "direction" field selects which side is the prompt:
    "to_morse" uses the plain-text "line" as the user turn and the "morse"
    string as the assistant turn; any other value swaps the two.
    The tokenizer's EOS token is appended to the assistant turn.

    Returns a dict with a single "conversations" key holding the two turns.
    """
    if example["direction"] == "to_morse":
        prompt, reply = example["line"], example["morse"]
    else:
        prompt, reply = example["morse"], example["line"]

    user_turn = {"content": prompt, "role": "user"}
    assistant_turn = {"content": reply + tokenizer.eos_token, "role": "assistant"}
    return {"conversations": [user_turn, assistant_turn]}

In [9]:
# Convert every row into a two-turn conversation via apply_conversations.
# The raw 'line' and 'morse' columns are dropped; 'direction' is kept.
convos = dataset.map(
    apply_conversations,
    remove_columns = ["line", "morse"],
    #num_proc = 1,
    desc = "Applying conversations",
)
# Display the resulting dataset summary.
convos

Dataset({
    features: ['direction', 'conversations'],
    num_rows: 531976
})

In [10]:
# Inspect one converted example (user turn followed by the assistant turn).
convos[10]

{'direction': 'to_text',
 'conversations': [{'content': '.. / .- -- / -. --- - / -.-- --- ..- .-. / -- --- - .... . .-. .-.-.-',
   'role': 'user'},
  {'content': 'I am not your mother.<eos>', 'role': 'assistant'}]}

In [11]:
def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    `examples` is a batched mapping whose "conversations" entry is a list of
    conversations. Each one is rendered with the tokenizer's chat template
    (as text, without a generation prompt) and a leading '<bos>' is removed
    from the result -- presumably so a second BOS is not added when the text
    is tokenized later; TODO confirm.

    Returns a dict with a "text" list, one rendered string per conversation.
    """
    rendered = []
    for conversation in examples["conversations"]:
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        rendered.append(prompt.removeprefix('<bos>'))
    return {"text": rendered}

# Render each conversation into a single training string in a "text" column.
# NOTE: this rebinds `convos` and removes the "conversations" column that
# formatting_prompts_func reads, so this cell cannot be re-run on its own;
# re-run the cell that builds `convos` first.
convos = convos.map(formatting_prompts_func, remove_columns=["conversations", "direction"], batched = True)
# Display the resulting dataset summary.
convos

Map:   0%|          | 0/531976 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 531976
})

In [12]:
# Show one rendered training string. Index the row first and the column
# second: column-first access reads the whole "text" column just to pick
# a single element, while row-first access only reads that one row.
convos[10]["text"]

'<start_of_turn>user\n.. / .- -- / -. --- - / -.-- --- ..- .-. / -- --- - .... . .-. .-.-.-<end_of_turn>\n<start_of_turn>model\nI am not your mother.<eos><end_of_turn>\n'

In [13]:
# Split the dataset into train (90%) and test (10%).
# The split is computed once and both halves are taken from the same result,
# instead of running the identical seeded split twice.
splits = convos.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
test_dataset = splits["test"]

# Drop the notebook's references to the intermediate datasets.
convos = None
dataset = None

In [14]:
from trl import SFTTrainer, SFTConfig
# Hyper-parameters for the supervised fine-tuning run, grouped by concern.
sft_args = SFTConfig(
    # data
    dataset_text_field = "text",
    dataset_num_proc = 4,

    # outputs and logging
    output_dir = "outputs",
    logging_dir = "outputs/tensorboard_logs",  # TensorBoard logs live under output_dir
    report_to = "tensorboard",
    logging_steps = 1,                         # log every optimizer step

    # evaluation on the held-out split
    eval_strategy = "steps",
    eval_steps = 1000,

    # batching: 4 per device x 4 accumulation steps
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,

    # schedule: one and a half passes over the training data
    num_train_epochs = 1.5,
    warmup_steps = 15,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",

    # optimizer
    optim = "adamw_8bit",
    weight_decay = 0.01,

    seed = 3407,
)

# Build the trainer over the train/test split prepared above.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    args = sft_args,
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/478778 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/53198 [00:00<?, ? examples/s]

In [15]:
from unsloth.chat_templates import train_on_responses_only
# Wrap the trainer with train_on_responses_only. The two markers are the
# turn headers that precede the user prompt and the model reply in every
# rendered training string.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/478778 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/53198 [00:00<?, ? examples/s]

In [16]:
# Inspect the token ids of one tokenized training example.
trainer.train_dataset[100]["input_ids"]

[2,
 105,
 2364,
 107,
 236772,
 4862,
 236761,
 2728,
 236772,
 753,
 965,
 9190,
 2728,
 24975,
 2728,
 24975,
 965,
 199900,
 965,
 199900,
 856,
 2728,
 753,
 753,
 199900,
 856,
 965,
 199900,
 753,
 965,
 753,
 27103,
 783,
 965,
 783,
 24975,
 2728,
 3729,
 753,
 199900,
 4862,
 4862,
 106,
 107,
 105,
 4368,
 107,
 37967,
 1135,
 496,
 44981,
 236752,
 657,
 506,
 109701,
 236761,
 1,
 106,
 107]

In [17]:
# Visualise which tokens contribute to the loss for one example: label
# positions equal to -100 are swapped for the pad token, the sequence is
# decoded, and every pad token is then shown as "X".
label_ids = trainer.train_dataset[9]["labels"]
decodable_ids = [
    tokenizer.pad_token_id if token_id == -100 else token_id
    for token_id in label_ids
]
decoded_labels = tokenizer.decode(decodable_ids)
decoded_labels.replace(tokenizer.pad_token, "X")

"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'Come on!' I shouted, jumping up.<eos><end_of_turn>\n"

In [18]:
# Report the GPU name, its total memory and the peak memory reserved so far,
# all expressed in GB (bytes divided by 1024^3, rounded to 3 decimals).
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()

start_gpu_memory = round(reserved_bytes / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Quadro RTX 5000. Max memory = 16.0 GB.
2.432 GB of memory reserved.


In [None]:
# Run the fine-tuning loop; the returned value is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 478,778 | Num Epochs = 2 | Total steps = 44,885
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,522,880/1,006,408,832 (0.65% trained)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
# Re-apply the "gemma-3" chat template to the tokenizer before building
# the inference prompt.
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
# Single user turn holding the plain-text sentence to translate.
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Hi Phil, can you please send food and water as soon as possible?",
    }]
}]

# Alternative prompt for the opposite direction (Morse input); uncomment to use.
# messages = [{
#     "role": "user",
#     "content": [{
#         "type" : "text",
#         "text" : ".. / .- -- / -. --- - / -.-- --- ..- .-. / -- --- - .... . .-. .-.-.-",
#     }]
# }]

# Render the messages to a prompt string (not token ids) for generation.
text = tokenizer.apply_chat_template(
    messages,
    tokenize= False,
    add_generation_prompt = True, # Must add for generation
)


# Tokenize the prompt and move the tensors to the GPU.
model_inputs = tokenizer([text], return_tensors = "pt").to("cuda")

# Generate up to 200 new tokens (increase for longer outputs) using the
# sampling settings recommended for Gemma-3.
outputs = model.generate(
    **model_inputs,
    max_new_tokens = 200,
    temperature = 1,
    top_p = 0.95,
    top_k = 64,
)

# Decode the single generated sequence and show the raw text.
output = tokenizer.batch_decode(outputs)[0]
print(output)

import re

from decode import decode_from_morse

# The model's reply is whatever sits between the model-turn header and the
# first "<eos>"; DOTALL lets the reply span several lines.
reply_pattern = re.compile(r'<start_of_turn>model\n(.*?)<eos>', re.DOTALL)

match = reply_pattern.search(output)
if match is None:
    print("No match found.")
else:
    result = match.group(1)
    # Translate the generated Morse back to text as a round-trip check.
    print(decode_from_morse(result))
    

In [None]:
# Save the finetuned model together with the tokenizer under "bi-morse-code".
if True: # Currently enabled; set to False to skip saving the finetune
    model.save_pretrained_merged("bi-morse-code", tokenizer)

In [None]:
# Optional: upload the finetuned model and tokenizer to the Hub repo below.
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "philipfourie/bi-morse-code", 
        tokenizer,
    )

In [None]:
# Optional: export the model in GGUF format under "morse-code".
# NOTE(review): no tokenizer is passed here, unlike the merged save above --
# confirm that save_pretrained_gguf does not need it.
if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "morse-code",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

In [None]:
# Optional: upload the GGUF export to the Hub repo given by `repo_id`.
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "morse-code",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "philipfourie/morse-code_q8"
    )