Based one

https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(4B).ipynb

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 200 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Quadro RTX 5000. Num GPUs = 1. Max memory: 16.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Load LoRA adapters

In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 8,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [3]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.2",
)

In [4]:
from datasets import load_dataset
dataset = load_dataset("philipfourie/books3_basic_sentenses_paraphrased-Morse", split="train")
dataset

Dataset({
    features: ['line', 'morse'],
    num_rows: 531976
})

In [5]:
dataset[0]

{'line': 'The title was cataloged by the Library of Congress.',
 'morse': '- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-'}

In [6]:
def apply_conversations(example):
    return {
        "conversations": [
            {
                "content": example["line"] + ">>",
                "role": "user",
            },
            {
                "content": example["morse"],
                "role": "assistant",
            },
        ]
    }

In [7]:
convos = dataset.map(
    apply_conversations,
    remove_columns = ["line", "morse"],
    #num_proc = 1,
    desc = "Applying conversations",
)
convos

Dataset({
    features: ['conversations'],
    num_rows: 531976
})

In [8]:
convos[0]

{'conversations': [{'content': 'The title was cataloged by the Library of Congress.>>',
   'role': 'user'},
  {'content': '- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-',
   'role': 'assistant'}]}

In [9]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
   return { "text" : texts, }

convos = convos.map(formatting_prompts_func, remove_columns=["conversations"], batched = True)
convos

Dataset({
    features: ['text'],
    num_rows: 531976
})

In [10]:
convos["text"][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe title was cataloged by the Library of Congress.>><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n- .... . / - .. - .-.. . / .-- .- ... / -.-. .- - .- .-.. --- --. . -.. / -... -.-- / - .... . / .-.. .. -... .-. .- .-. -.-- / --- ..-. / -.-. --- -. --. .-. . ... ... .-.-.-<|eot_id|>'

In [11]:
# split the dataset into train and test
train_dataset = convos.train_test_split(test_size=0.1, seed=42)["train"]
test_dataset = convos.train_test_split(test_size=0.1, seed=42)["test"]

convos = None
dataset = None

In [12]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    max_seq_length = max_seq_length,
    #data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False, # Can make training 5x faster for short sequences.

    args = SFTConfig(
        dataset_text_field="text",
        output_dir="outputs",  # Explicitly set output directory
        logging_dir="outputs/tensorboard_logs",  # Subdirectory for TensorBoard logs
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!

        warmup_steps = 15,
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 20, #1301
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps=1,  # Log every step for testing
        eval_steps=325,  # Enable evaluation logging
        eval_strategy="steps",
        
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3408,
        report_to = "tensorboard", # Use this for WandB etc
        dataset_num_proc=4,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),        
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/478778 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/53198 [00:00<?, ? examples/s]

In [13]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [14]:
trainer.train_dataset[100]["input_ids"]

[128000,
 128000,
 128006,
 9125,
 128007,
 271,
 38766,
 1303,
 33025,
 2696,
 25,
 6790,
 220,
 2366,
 18,
 198,
 15724,
 2696,
 25,
 220,
 1627,
 5887,
 220,
 2366,
 19,
 271,
 128009,
 128006,
 882,
 128007,
 271,
 38766,
 1022,
 264,
 60642,
 75,
 520,
 279,
 436,
 380,
 13,
 2511,
 128009,
 128006,
 78191,
 128007,
 271,
 12,
 12898,
 13,
 5354,
 12,
 482,
 611,
 12730,
 5354,
 14863,
 5354,
 14863,
 611,
 662,
 12,
 611,
 662,
 12,
 497,
 5354,
 482,
 482,
 662,
 12,
 497,
 611,
 662,
 12,
 482,
 611,
 482,
 22666,
 662,
 611,
 662,
 14863,
 5354,
 2564,
 482,
 662,
 12,
 12898,
 12898,
 128009]

In [15]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                               -.-...- - / ---..-...-. /.- /.-.... - -.-.. /.- - / -..... /.-...... -.-.-.-<|eot_id|>'

In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Quadro RTX 5000. Max memory = 16.0 GB.
1.457 GB of memory reserved.


In [17]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 478,778 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


In [26]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [{
    "role": "user",
    "content":  "The cat sat on the>>"
}]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe cat sat on the>><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n- / -- -. --. - /.--. /.... / -. /.. / --... -. -.......-.-.. /.-.. /...-- -.. -. --...- / -- ---.. /....... - /..-.-.-']

In [27]:
# from unsloth.chat_templates import get_chat_template
# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3.2",
# )
# messages = [{
#     "role": "user",
#     "content": [{
#         "type" : "text",
#         "text" : "The cat sat on the>>",
#     }]
# }]
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize= False,
#     add_generation_prompt = True, # Must add for generation
# )


# outputs = model.generate(
#     **tokenizer([text], return_tensors = "pt").to("cuda"),
#     max_new_tokens = 120, # Increase for longer outputs!
#     # Recommended Gemma-3 settings!
#     temperature = 0.01, top_p = 0.95, top_k = 64,
# )

# output  = tokenizer.batch_decode(outputs)[0]
# print(output)

# from decode import decode_from_morse

# import re

# match = re.search(r'<start_of_turn>model\n(.*?)<end_of_turn>', output, re.DOTALL)
# if match:
#     result = match.group(1)
#     #print(result)
#     print(decode_from_morse(result))
# else:
#     print("No match found.")
    

In [None]:
#model.save_pretrained("gemma-3-1b-sos")  # Local saving
#tokenizer.save_pretrained("gemma-3-1b-sos")

In [None]:
if False: # Change to True to save finetune!
    #model.save_pretrained_merged("gemma-3-finetune2", tokenizer, save_method = "merged_4bit")
    #odel.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
    #odel.save_pretrained_gguf("dir", tokenizer, quantization_method = "Q8_0")
    model.push_to_hub_gguf("philipfourie/gemma-3-finetune",  quantization_type ="q8_0")

In [None]:
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "philipfourie/gemma-3-1b-morse-code", tokenizer,
    )

In [None]:
# model.save_pretrained_gguf(
#     "gemma-3-1b-morse-code",
#     quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
# )

In [None]:
# if True: # Change to True to upload GGUF
#     model.push_to_hub_gguf(
#         #"gemma-3-1b-morse-code",
#         quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
#         repo_id = "philipfourie/gemma-3-1b-morse-code"
#     )

In [28]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [34]:
if True: model.save_pretrained_merged("sos-llama", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("philipfourie/sos-llama", tokenizer, save_method = "merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.06 out of 15.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 19.11it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [36]:
if False: model.save_pretrained_gguf("sos-llama-gguf", tokenizer, quantization_method = "q4_k_m")
if True: model.push_to_hub_gguf("philipfourie/sos-llama-gguf", tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.7 out of 15.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 19.75it/s]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at philipfourie/sos-llama-gguf into f16 GGUF format.
The output location will be /home/philipf/dev/testj/philipfourie/sos-llama-gguf/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: sos-llama-gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.f

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/philipfourie/sos-llama-gguf


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/philipfourie/sos-llama-gguf
