### Installation

In [None]:
%%capture
# Install Unsloth and vLLM; %%capture hides pip's output for this cell.
# NOTE(review): versions are unpinned, so a fresh run may resolve different
# releases than the ones reported in the logs further down — consider pinning.
!pip install unsloth vllm

In [None]:
%%capture
# Install transformers from the v4.49.0-Gemma-3 tag; --no-deps tells pip not to
# install this package's dependencies.
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [None]:
from unsloth import FastModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-18 16:38:14.445078: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744994294.656942      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744994294.714038      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-18 16:38:32 [__init__.py:239] Automatically detected platform cuda.


In [None]:
import torch

# Loader settings, gathered in one place before the call.
load_options = dict(
    model_name = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    max_seq_length = 256,     # Choose any for long context!
    load_in_4bit = True,      # 4 bit quantization to reduce memory
    load_in_8bit = False,
    full_finetuning = False,
)
model, tokenizer = FastModel.from_pretrained(**load_options)

==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [None]:
# Report the quantization-related attributes exposed on the loaded model.
for quant_attr in ("is_loaded_in_4bit", "is_quantized", "quantization_method"):
    print(getattr(model, quant_attr))

True
True
QuantizationMethod.BITS_AND_BYTES


We now add LoRA adapters so we only need to update a small number of parameters!

In [None]:
# Wrap the loaded model with LoRA adapters. Note that this rebinds `model`,
# so re-running the cell applies the wrapper to the already-wrapped model.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Text-only task, so vision layers are left alone
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Adapt the attention modules
    finetune_mlp_modules       = True,  # Should always leave on!

    r = 16,           # LoRA rank: larger can be more accurate, with diminishing returns (see the LoRA paper)
    lora_alpha = 32,  # Here 2 * r; the rule of thumb is alpha >= r
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


In [None]:
from unsloth.chat_templates import get_chat_template
# Select Unsloth's "gemma-3" chat template for this tokenizer; later cells
# render conversations through tokenizer.apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [None]:
from datasets import load_dataset
# Load the CoEdIT train/validation splits. If the plain call raises ValueError,
# retry both splits with the explicit "default" config name.
try:
    train_ds, val_ds = [
        load_dataset("grammarly/coedit", split=split_name)
        for split_name in ("train", "validation")
    ]
except ValueError as e:
    print(f"Caught ValueError, likely need to specify config. Trying 'default'. Error: {e}")
    train_ds, val_ds = [
        load_dataset("grammarly/coedit", name="default", split=split_name)
        for split_name in ("train", "validation")
    ]


README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/692k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/69071 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1712 [00:00<?, ? examples/s]

In [None]:
# Show the schema and row count of both raw splits.
print(train_ds, val_ds, sep="\n")

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 69071
})
Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 1712
})


In [None]:
def format_dataset_with_template(example, tokenizer):
    """Render one dataset row as a single chat-formatted training string.

    Args:
        example: Mapping with a "src" entry (used as the user turn) and a
            "tgt" entry (used as the model turn).
        tokenizer: Object exposing ``apply_chat_template``; its ``bos_token``
            attribute, when present and non-empty, is stripped from the start
            of the rendered text.

    Returns:
        ``{"text": formatted_text}``, the shape expected by ``Dataset.map``.
        ``formatted_text`` is ``""`` when the chat template cannot be applied.
    """
    # 1. Prepare the conversation history in the required format
    messages = [
        {"role": "user", "content": example["src"]},
        {"role": "model", "content": example["tgt"]},
    ]

    # 2. Apply the chat template
    try:
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
    except Exception as e:
        # Deliberate best-effort: log the failing row and emit an empty string
        # so that one bad example does not abort the whole map.
        print(f"Error applying chat template to example: {example}")
        print(f"Error: {e}")
        return {"text": ""}

    # Mirrors the remove_special_tokens logic found in Unsloth: drop a leading
    # BOS token from the rendered text. Tokenizers without a BOS token
    # (attribute missing, None or empty) are left untouched instead of raising.
    bos_token = getattr(tokenizer, "bos_token", None)
    if bos_token and formatted_text.startswith(bos_token):
        formatted_text = formatted_text[len(bos_token):]

    # 3. Return in the desired dictionary format for .map
    return {"text": formatted_text}


In [None]:
def _to_chat_text(example):
    """Adapter for Dataset.map: format one row with the notebook's tokenizer."""
    return format_dataset_with_template(example, tokenizer)

# Row-by-row mapping in a single process for both splits.
# Increase num_proc if you confirm it works reliably on your system.
processed_train_ds, processed_val_ds = [
    split.map(_to_chat_text, batched=False, num_proc=1)
    for split in (train_ds, val_ds)
]

Map:   0%|          | 0/69071 [00:00<?, ? examples/s]

Map:   0%|          | 0/1712 [00:00<?, ? examples/s]

In [None]:
# Both splits should now list the extra "text" column.
print(processed_train_ds, processed_val_ds, sep="\n")

Dataset({
    features: ['_id', 'task', 'src', 'tgt', 'text'],
    num_rows: 69071
})
Dataset({
    features: ['_id', 'task', 'src', 'tgt', 'text'],
    num_rows: 1712
})


In [None]:
# Drop every column that came from the raw dataset, keeping only what .map added.
columns_to_remove = [*train_ds.features]
print(f"\nRemoving original columns: {columns_to_remove}")
processed_train_ds, processed_val_ds = [
    split.remove_columns(columns_to_remove)
    for split in (processed_train_ds, processed_val_ds)
]



Removing original columns: ['_id', 'task', 'src', 'tgt']


In [None]:
# For each processed split: print a header, the dataset summary, and the first
# templated example (or a notice when the split has no rows).
for header_label, empty_label, split in (
    ("train", "training", processed_train_ds),
    ("validation", "validation", processed_val_ds),
):
    print(f"\nProcessed {header_label} dataset sample (using tokenizer template):")
    print(split)
    if len(split) > 0:
        print(split[0]["text"])
    else:
        print(f"Processed {empty_label} dataset is empty or first example failed.")



Processed train dataset sample (using tokenizer template):
Dataset({
    features: ['text'],
    num_rows: 69071
})
<start_of_turn>user
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.<end_of_turn>
<start_of_turn>model
For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.<end_of_turn>


Processed validation dataset sample (using tokenizer template):
Dataset({
    features: ['text'],
    num_rows: 1712
})
<start_of_turn>user
Paraphrase this sentence: Why are you arresting me?<end_of_turn>
<start_of_turn>model
Why am I being arrested?<end_of_turn>



<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer).

In [None]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer over the templated "text" column.
trainer = SFTTrainer(
    model=model,
    # NOTE(review): newer TRL releases rename this argument to
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    train_dataset=processed_train_ds,
    eval_dataset=processed_val_ds,
    args=SFTConfig(
        # Checkpoints are written here; keep this in sync with any later cell
        # that lists or reloads checkpoints.
        output_dir="./checkpoints256",
        dataset_text_field="text",
        # Same value as max_seq_length passed to FastModel.from_pretrained.
        max_seq_length=256,
        # NOTE: the Unsloth log printed under this cell reports that packing is
        # disabled, so this flag had no effect in the recorded run.
        packing=True,
        per_device_train_batch_size=64,
        gradient_accumulation_steps=2,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=3e-5,
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        report_to="none",
        # Evaluate and checkpoint on the same 25-step cadence as logging.
        eval_strategy="steps",
        eval_steps=25,
        save_strategy="steps",
        save_steps=25,
    )
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/69071 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/1712 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


In [None]:
# Inspect the trainer's copy of the training data after its tokenization pass.
print(trainer.train_dataset)

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 69071
})


We also use Unsloth's `train_on_responses_only` method to train only on the model's responses and ignore the loss on the user's inputs. This helps increase the accuracy of finetunes!

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Restrict the loss to the response: the two markers tell Unsloth where the
# user turn and the model turn begin in each templated example.
# Note that this rebinds `trainer`.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=4):   0%|          | 0/69071 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1712 [00:00<?, ? examples/s]

In [None]:
# Inspect the training data again now that train_on_responses_only has run.
print(trainer.train_dataset)

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 69071
})


In [None]:
# Dump row 100 field by field, with a blank line between fields.
sample_row = trainer.train_dataset[100]
print(sample_row["text"])
for field in ("input_ids", "attention_mask", "labels"):
    print()
    print(sample_row[field])

<start_of_turn>user
Fix grammar in this sentence: If engineers do not come up with new ideas, they cannot find best solution for the problems.<end_of_turn>
<start_of_turn>model
If engineers do not come up with new ideas, they cannot find the best solution for different problems.<end_of_turn>


[2, 105, 2364, 107, 36819, 40095, 528, 672, 13315, 236787, 1637, 22072, 776, 711, 2229, 872, 607, 861, 6549, 236764, 901, 3914, 1586, 1791, 3465, 573, 506, 4078, 236761, 106, 107, 105, 4368, 107, 2859, 22072, 776, 711, 2229, 872, 607, 861, 6549, 236764, 901, 3914, 1586, 506, 1791, 3465, 573, 1607, 4078, 236761, 106, 107]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 2859, 

In [None]:
# Add up the tokenized length of every training example.
total_training_tokens = sum(
    len(token_ids) for token_ids in trainer.train_dataset["input_ids"]
)

print("Total training tokens in CoEdIT dataset:", total_training_tokens)

Total training tokens in CoEdIT dataset: 4062937


In [None]:
import numpy as np

# Per-example token counts, followed by a labelled summary of their distribution.
lengths = [len(token_ids) for token_ids in trainer.train_dataset['input_ids']]
length_summary = (
    ("Min", np.min(lengths)),
    ("Max", np.max(lengths)),
    ("Mean", np.mean(lengths)),
    ("Median", np.median(lengths)),
    ("95th percentile", np.percentile(lengths, 95)),
    ("99th percentile", np.percentile(lengths, 99)),
)
print("Token lengths stats:")
for label, value in length_summary:
    print(f"{label}: {value}")

Token lengths stats:
Min: 18
Max: 256
Mean: 58.8226173068292
Median: 52.0
95th percentile: 108.0
99th percentile: 133.0


Let's verify that the instruction part has been masked. First, decode the full `input_ids` of row 100 again:

In [None]:
# Decode the complete token sequence of row 100 back to text.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nFix grammar in this sentence: If engineers do not come up with new ideas, they cannot find best solution for the problems.<end_of_turn>\n<start_of_turn>model\nIf engineers do not come up with new ideas, they cannot find the best solution for different problems.<end_of_turn>\n'

Now let's print the masked out example - you should see only the answer is present:

In [None]:
# Replace every masked label (-100) with the pad id so the sequence can be
# decoded, then turn the decoded pad tokens into spaces.
visible_label_ids = [
    token_id if token_id != -100 else tokenizer.pad_token_id
    for token_id in trainer.train_dataset[100]["labels"]
]
tokenizer.decode(visible_label_ids).replace(tokenizer.pad_token, " ")

'                                  If engineers do not come up with new ideas, they cannot find the best solution for different problems.<end_of_turn>\n'

In [None]:
# @title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to gigabytes (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

# Baseline taken before training; the final-stats cell subtracts it later.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.57 GB of memory reserved.


Let's train the model! To resume a training run, call `trainer.train(resume_from_checkpoint = True)` instead.

In [None]:
# Run training; the returned stats (e.g. metrics["train_runtime"]) are read by
# the final memory/time cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 69,071 | Num Epochs = 1 | Total steps = 270
O^O/ \_/ \    Batch size per device = 128 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (128 x 2 x 1) = 256
 "-____-"     Trainable parameters = 29,802,496/4,000,000,000 (0.75% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
25,4.9399,2.619917
50,1.7806,0.722835


Unsloth: Not an error, but Gemma3ForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
import os  # not imported anywhere earlier in the notebook

# List the checkpoints written so far. The directory must match the
# `output_dir` given to SFTConfig ("./checkpoints256"); the previous
# "./checkpoints" path is not written by any cell in this notebook.
checkpoint_dir = "./checkpoints256"
cwd = os.getcwd()
# An absent directory (nothing saved yet) yields an empty listing instead of
# raising FileNotFoundError.
ls = os.listdir(checkpoint_dir) if os.path.isdir(checkpoint_dir) else []
print(cwd, ls)

In [None]:
# These classes were never imported, so the cell raised NameError on a fresh kernel.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload a saved model and tokenizer from disk, replacing the in-memory
# `model` / `tokenizer` used so far.
# NOTE(review): no cell in this notebook writes to "./results-seq256" — confirm
# the path. Also confirm that the later `model.push_to_hub_merged(...)` call is
# still available on a model loaded this way.
model = AutoModelForCausalLM.from_pretrained("./results-seq256", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./results-seq256")

In [None]:
# @title Show final memory and time stats
# Peak reserved memory now, and the part of it added since the pre-training baseline.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`. Note that the cells below use lower temperatures (0.2 and 0.3) instead.

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Single-turn request in the structured-content format.
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "Fix grammar in this sentence: hello there the angle from my nightmare the shadow in teh background of the morgue",}]
}]
# Render the prompt as a string. `tokenize = False` is passed explicitly
# because the default differs between processors and plain tokenizers, and the
# result is tokenized separately just below.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    tokenize = False,
)

# Stream the generated tokens to the cell output as they are produced.
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # NOTE: these differ from the Gemma-3 recommended settings
    # (temperature = 1.0, top_p = 0.95, top_k = 64).
    temperature = 0.2, top_p = 0.9,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

In [None]:
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer in use carries the "gemma-3" chat template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Fix grammar errors in this sentence: hello there the angle from my nightmare the shadow in teh background of the morgue",
    }]
}]
# Render the prompt as a string. `tokenize = False` is passed explicitly
# because the default differs between processors and plain tokenizers, and the
# result is tokenized separately just below.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    tokenize = False,
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # NOTE: this differs from the Gemma-3 recommended settings
    # (temperature = 1.0, top_p = 0.95, top_k = 64).
    temperature = 0.3,
)
# Decode prompt + completion for display as the cell result.
tokenizer.batch_decode(outputs)

In [None]:

import os

if True:  # set to False to skip the upload
    hf_repo_name = "muzzz/nomodit-4b-merged"
    # Token with write access, read from the HF_TOKEN environment variable so
    # the secret is never stored in the notebook. Falls back to "" (the
    # previous hard-coded value) when the variable is unset.
    hf_token = os.environ.get("HF_TOKEN", "")

    print(f"Attempting to push merged model to: {hf_repo_name}")
    try:
        model.push_to_hub_merged(
            hf_repo_name, tokenizer,
            token = hf_token,
            commit_message = "Upload fine-tuned Gemma-3 4B (merged float16)"
        )
        print(f"Successfully pushed merged model to {hf_repo_name}")
    except Exception as e:
        # Best-effort upload: report the failure without aborting the notebook.
        print(f"Error pushing merged model to Hugging Face Hub: {e}")
        print("Please ensure you have the correct repo name, write token, and network connectivity.")
