In [1]:
import os

# Expose only physical GPU 7 to this process; the variable is set here, in the
# first cell, before any GPU-using library is imported further down.
os.environ.update(CUDA_VISIBLE_DEVICES="7")

In [2]:
from datasets import load_dataset

# Read the JSON-lines training file with the generic "json" loader and expose
# it as the "train" split of the resulting dataset dictionary.
dataset = load_dataset(
    path="json",
    data_files=dict(train="datalines/train_example.jsonl"),
)

In [None]:
from transformers import AutoTokenizer

# Hub id of the base checkpoint; the same name is reused below to load the model.
model_name = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
def format_prompt(sample):
    """Build the full training string for one record.

    Args:
        sample: Mapping with an "INPUT" entry (the prompt) and an "OUTPUT"
            entry (the expected completion).

    Returns:
        The prompt immediately followed by the completion and then the
        tokenizer's EOS token, with no extra separator inserted.
    """
    prompt_text = sample["INPUT"]
    completion_text = sample["OUTPUT"]
    return "".join((f"{prompt_text}", f"{completion_text}", tokenizer.eos_token))

def tokenize(sample, max_length):
    """Tokenize one formatted sample and build completion-only labels.

    The full text (prompt + completion + EOS) is tokenized and padded to
    ``max_length``. ``labels`` is a copy of ``input_ids`` in which every
    position that belongs to padding or to the prompt is replaced by -100, so
    only the completion tokens contribute to the loss.

    The masking works for both left- and right-padded tokenizers: the padding
    side is read from ``tokenizer.padding_side`` instead of being assumed.

    Args:
        sample: Mapping with "INPUT" (the prompt alone) and "text" (the full
            formatted training string that starts with that prompt).
        max_length: Target length passed to the tokenizer for padding. No
            truncation is requested, so longer texts keep their own length.

    Returns:
        The tokenizer output for ``sample["text"]`` with an added "labels"
        entry of the same length as "input_ids".
    """
    # Tokenize the prompt alone to learn how many leading tokens to mask. The
    # count includes any special tokens the tokenizer prepends, which it also
    # prepends to the full text.
    prompt_len = len(tokenizer(str(sample["INPUT"]))["input_ids"])

    tokenized = tokenizer(sample["text"], padding="max_length", max_length=max_length)
    input_ids = tokenized["input_ids"]
    seq_len = len(input_ids)

    # Prefer the attention mask for counting padding: it cannot be confused by
    # a pad id that happens to occur inside the real content.
    if "attention_mask" in tokenized:
        pad_len = seq_len - sum(tokenized["attention_mask"])
    else:
        pad_len = input_ids.count(tokenizer.pad_token_id)

    # Locate the span [content_start, content_end) that holds real tokens.
    if tokenizer.padding_side == "left":
        content_start, content_end = pad_len, seq_len
    else:
        content_start, content_end = 0, seq_len - pad_len

    labels = list(input_ids)
    # Mask leading padding (if any) plus the prompt; clamp so the slice
    # assignment can never change the length of ``labels``.
    masked_until = min(content_start + prompt_len, content_end)
    labels[:masked_until] = [-100] * masked_until
    # Mask trailing padding (a no-op when the tokenizer pads on the left).
    labels[content_end:] = [-100] * (seq_len - content_end)

    tokenized["labels"] = labels
    return tokenized

In [6]:
# Peek at the first raw record to check the INPUT / OUTPUT fields that the
# formatting and tokenization helpers read.
dataset["train"][0]

{'INPUT': 'USER:set a timer for 3 hours\n',
 'OUTPUT': 'HOURS:3\nMINUTES:0\nSECONDS:0'}

In [None]:
# Add a "text" column holding the prompt + completion + EOS string for each row.
dataset["train"] = dataset["train"].map(
    lambda row: {"text": format_prompt(row)}
)

In [7]:
# print() is used on purpose so the embedded newlines of the formatted sample
# are rendered as line breaks instead of "\n" escapes.
print(dataset["train"][0]["text"])

USER:set a timer for 3 hours
HOURS:3
MINUTES:0
SECONDS:0<eos>


In [8]:
# Token count of the longest formatted example; used as the padding target so
# every example ends up with the same length.
max_length = max(
    len(tokenizer(row["text"])["input_ids"])
    for row in dataset["train"]
)

In [None]:
from functools import partial

# Tokenize row by row (batched=False), binding the padding length computed above.
dataset["train"] = dataset["train"].map(
    function=partial(tokenize, max_length=max_length),
    batched=False,
)

In [14]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


# Load the base causal LM that the LoRA adapter will be attached to.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",  # plain (non-fused) attention implementation
    use_cache=False,              # key/value cache switched off for this run
    device_map="auto",            # let the loader place weights on the visible device(s)
)


In [15]:
# LoRA adapter setup: rank-16 adapters on every attention and MLP projection,
# no dropout, and no trainable bias terms.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    bias="none",
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # feed-forward projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [16]:
training_args = TrainingArguments(
    # --- run bookkeeping ---
    output_dir="./gemma-timer-lora",
    report_to="none",
    logging_steps=10,
    seed=252,
    # --- checkpointing ---
    # save_steps below 1 is a fraction of the total number of training steps
    # (see the TrainingArguments docs); with no save_total_limit every
    # checkpoint is kept on disk.
    save_strategy="steps",
    save_steps=0.02,
    save_total_limit=None,
    # --- batch / epoch budget ---
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    # --- optimisation ---
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    fp16=True,
)

# Wire the model, the pre-tokenized training split, and the LoRA config together.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    peft_config=peft_config,
    train_dataset=dataset["train"],
)

# Left as the last expression so the cell displays the returned TrainOutput.
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


Step,Training Loss
10,2.2311
20,0.1868
30,0.0323
40,0.0126
50,0.0112
60,0.0062
70,0.0049
80,0.0083
90,0.0044


TrainOutput(global_step=99, training_loss=0.2525280250819645, metrics={'train_runtime': 71.7577, 'train_samples_per_second': 43.828, 'train_steps_per_second': 1.38, 'total_flos': 82521648760320.0, 'train_loss': 0.2525280250819645, 'entropy': 0.645798020892673, 'num_tokens': 132090.0, 'mean_token_accuracy': 0.9987068772315979, 'epoch': 1.0})

In [35]:
# Convert the LoRA adapter saved at checkpoint-30 into an f16 GGUF file under ./gguf_checkpoints.
!uv run convert_lora_to_gguf.py ./gemma-timer-lora/checkpoint-30 --outfile ./gguf_checkpoints/timer_checkpoint-30f16.gguf --outtype "f16"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:lora-to-gguf:Loading base model from Hugging Face: google/gemma-3-270m-it
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_a,      torch.float32 --> F16, shape = {2048, 16}
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_b,      torch.float32 --> F16, shape = {16, 640}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_a,      torch.float32 --> F16, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_b,      torch.float32 --> F16, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_a,        torch.float32 --> F16, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_b,        torch.float32 --> F16, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,        torch.float32 --> F16, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,        torch.float32 --> F16, shape = {16, 256}
INFO:hf-to-ggu