In [1]:
import os

# Restrict the CUDA-visible devices to index 2; this is set before the ML
# libraries are imported in the later cells.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [2]:
import json

input_file = "datalines/timer_name_data_de.jsonl"
output_file = "datalines/timer_name_data_de_fixed.jsonl"

# Duration fields that must hold a number in every record.
DURATION_KEYS = ("HOURS", "MINUTES", "SECONDS")

# Rewrite the German JSONL file so that a missing/null duration becomes 0.
# NOTE(review): the split cell reads `timer_name_data_de.jsonl`, not this
# `_fixed` output — confirm the repaired file is the one actually used.
with (
    open(input_file, "r", encoding="utf-8") as f_in,
    open(output_file, "w", encoding="utf-8") as f_out,
):
    for line in f_in:
        if not line.strip():
            # Skip blank lines instead of crashing inside json.loads.
            continue
        data = json.loads(line)
        for key in DURATION_KEYS:
            # .get() also covers records where the key is absent entirely,
            # which plain indexing would turn into a KeyError.
            if data.get(key) is None:
                data[key] = 0
        # ensure_ascii=False keeps umlauts readable in the output file.
        f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

In [4]:
import random
import os

languages = ["eng", "rus", "de"]
base_dir = "datalines"

# Share of each language's unique lines that goes to the validation split.
VAL_FRACTION = 0.2
# Fixed seed so that the split is identical after a kernel restart.
SPLIT_SEED = 887

# Dedicated generator: the split does not depend on (or disturb) the global
# `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)

train_lines = []
val_lines = []

for lang in languages:
    input_file = os.path.join(base_dir, f"timer_name_data_{lang}.jsonl")

    with open(input_file, "r", encoding="utf-8") as f:
        # De-duplicate, then sort: the iteration order of a set of strings
        # changes between interpreter runs, so shuffling it with a fixed seed
        # would still give a different split every time.
        unique_lines = sorted({line.strip() for line in f if line.strip()})

    print(f"[{lang}] Unique lines: {len(unique_lines)}")

    split_rng.shuffle(unique_lines)
    # Split per language so every language keeps the same train/val ratio.
    val_len = int(VAL_FRACTION * len(unique_lines))
    val_part = unique_lines[:val_len]
    train_part = unique_lines[val_len:]

    print(f"[{lang}] → train: {len(train_part)}, val: {len(val_part)}")

    train_lines.extend(train_part)
    val_lines.extend(val_part)

# Mix the languages so that consecutive records are not grouped by language.
split_rng.shuffle(train_lines)
split_rng.shuffle(val_lines)

train_path = os.path.join(base_dir, "train_timer.jsonl")
val_path = os.path.join(base_dir, "val_timer.jsonl")

with open(train_path, "w", encoding="utf-8") as f:
    for line in train_lines:
        f.write(line + "\n")

with open(val_path, "w", encoding="utf-8") as f:
    for line in val_lines:
        f.write(line + "\n")

print(f"\nTotal train: {len(train_lines)} | Total val: {len(val_lines)}")

[eng] Unique lines: 1298
[eng] → train: 1039, val: 259
[rus] Unique lines: 1164
[rus] → train: 932, val: 232
[de] Unique lines: 1212
[de] → train: 970, val: 242

Total train: 2941 | Total val: 733


In [5]:
from datasets import load_dataset

# JSONL files written by the split step, keyed by split name.
split_files = {
    "train": "datalines/train_timer.jsonl",
    "validation": "datalines/val_timer.jsonl",
}

# Load both splits with the generic "json" loader.
dataset = load_dataset("json", data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [6]:
from transformers import AutoTokenizer

# Hugging Face model id; reused later when the model itself is loaded.
model_name = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [7]:
def format_prompt(sample):
    """Render one record as the training string.

    The result has one ``KEY:value`` line each for USER, HOURS, MINUTES,
    SECONDS and NAME, followed directly by the tokenizer's EOS token.

    Args:
        sample: Mapping with the keys ``USER``, ``HOURS``, ``MINUTES``,
            ``SECONDS`` and ``NAME``.

    Returns:
        The formatted string. A falsy ``NAME`` (e.g. ``None``) is written
        as ``_``.

    Raises:
        AssertionError: If any of the three duration fields is ``None``.
    """
    rendered = [f"USER:{sample['USER']}"]
    for field in ("HOURS", "MINUTES", "SECONDS"):
        value = sample[field]
        # Durations must already be filled in; a None here means dirty data.
        assert value is not None
        rendered.append(f"{field}:{value}")
    timer_name = sample["NAME"] or "_"
    rendered.append(f"NAME:{timer_name}")
    return "\n".join(rendered) + tokenizer.eos_token

In [8]:
# Attach the formatted training string as a "text" column on both splits.
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].map(lambda row: {"text": format_prompt(row)})

Map:   0%|          | 0/2941 [00:00<?, ? examples/s]

Map:   0%|          | 0/733 [00:00<?, ? examples/s]

In [9]:
# Inspect the first training record: the raw fields plus the generated "text".
dataset["train"][0]

{'USER': 'put on a seven hour ten minute timer for roast',
 'HOURS': 7,
 'MINUTES': 10,
 'SECONDS': 0,
 'NAME': 'roast',
 'text': 'USER:put on a seven hour ten minute timer for roast\nHOURS:7\nMINUTES:10\nSECONDS:0\nNAME:roast<eos>'}

In [15]:
# Spot-check a second training record. In the recorded output this one has
# NAME = None, which shows up as "_" in "text".
# NOTE(review): which record sits at index 198 depends on the shuffle in the
# split cell — confirm it still shows the None case after a re-run.
dataset["train"][198]

{'USER': 'Stell bitte einen 73 Sekunden Timer für den kurzen Test',
 'HOURS': 0,
 'MINUTES': 0,
 'SECONDS': 73,
 'NAME': None,
 'text': 'USER:Stell bitte einen 73 Sekunden Timer für den kurzen Test\nHOURS:0\nMINUTES:0\nSECONDS:73\nNAME:_<eos>'}

In [16]:
# Print (rather than display) the text so the newlines are rendered literally.
print(dataset["train"][0]["text"])

USER:put on a seven hour ten minute timer for roast
HOURS:7
MINUTES:10
SECONDS:0
NAME:roast<eos>


In [17]:
# Longest tokenized training example, in tokens.
train_token_counts = (len(tokenizer(row["text"])["input_ids"]) for row in dataset["train"])
max(train_token_counts)

53

In [18]:
# Longest tokenized validation example, in tokens.
val_token_counts = (len(tokenizer(row["text"])["input_ids"]) for row in dataset["validation"])
max(val_token_counts)

51

In [19]:
def tokenize(sample, max_length=53):
    """Tokenize one formatted sample and build labels that cover only the answer.

    The ``USER:...`` line (the prompt) and all padding positions get the
    label ``-100`` so that they are ignored by the loss; every other position
    keeps its token id.

    Args:
        sample: Record with a ``USER`` field and the formatted ``text`` field.
        max_length: Length every example is padded to. The default of 53 is
            the longest training example measured earlier in this notebook.
            Longer texts are not truncated.

    Returns:
        The tokenizer output for ``sample["text"]`` with an added ``labels``
        list of the same length as ``input_ids``.
    """
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    prompt = f"USER:{sample['USER']}\n"
    # Includes any special tokens the tokenizer prepends (e.g. BOS), which the
    # full text starts with as well.
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(sample["text"], padding="max_length", max_length=max_length)

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    # Count padding via the attention mask rather than by searching for the
    # pad token id in the ids.
    pad_len = attention_mask.count(0)

    # Padding never contributes to the loss, whichever side it is on.
    labels = [tok if keep else -100 for tok, keep in zip(input_ids, attention_mask)]

    # With left padding the prompt starts right after the pad tokens; with
    # right padding it starts at index 0.
    padding_side = getattr(tokenizer, "padding_side", "left")
    prompt_start = pad_len if padding_side == "left" else 0
    # Clamp so the slice assignment can never lengthen the list.
    prompt_end = min(prompt_start + prompt_len, len(labels))
    labels[prompt_start:prompt_end] = [-100] * (prompt_end - prompt_start)

    tokenized["labels"] = labels
    return tokenized

# Tokenize both splits one example at a time (tokenize expects a single record).
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].map(tokenize, batched=False)

Map:   0%|          | 0/2941 [00:00<?, ? examples/s]

Map:   0%|          | 0/733 [00:00<?, ? examples/s]

In [20]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


# Options for loading the base model that will be fine-tuned.
model_load_kwargs = dict(
    device_map="auto",
    use_cache=False,
    attn_implementation="eager",
)

model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)


In [21]:
# Projection layers that receive LoRA adapters: the four attention projections
# and the three feed-forward projections.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    bias="none",
    target_modules=lora_target_modules,
)

In [22]:
# Optimisation settings.
optimisation_kwargs = dict(
    num_train_epochs=5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    seed=887,
    fp16=True,
)

# Evaluate and checkpoint on the same 25-step cadence so that the checkpoint
# with the lowest eval_loss can be restored when training finishes.
checkpoint_kwargs = dict(
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

training_args = TrainingArguments(
    output_dir="./gemma-timer-name-lora",
    logging_steps=10,
    report_to=[],
    **optimisation_kwargs,
    **checkpoint_kwargs,
)

# NOTE(review): in the recorded run, Num Tokens at step 25 is 42400, which is
# 25 steps x 32 examples x 53 tokens — i.e. padded positions are counted too.
# Confirm that the installed TRL collator honours the dataset's
# `attention_mask` and `labels` columns.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

trainer.train()


[33mWARN[0m  Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.
[33mWARN[0m  Feature `utils/Perplexity` requires python GIL or Python >= 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.
[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Truncating train dataset:   0%|          | 0/2941 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/733 [00:00<?, ? examples/s]

INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpth7iac4h/test.c -o /tmp/tmpth7iac4h/test.o
INFO:root:cc -pthread /tmp/tmpth7iac4h/test.o -laio -o /tmp/tmpth7iac4h/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpdfdqac4b/test.c -o /tmp/tmpdfdqac4b/test.o
INFO:root:cc -pthread /tmp/tmpdfdqac4b/test.o -L/usr/local/cuda-12.6 -L/usr/local/cuda-12.6/lib64 -lcufile -o /tmp/tmpdfdqac4b/a.out
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmprm7vh51j/test.c -o /tmp/tmprm7vh51j/test.o
INFO:root:cc -pthread /tmp/tmprm7vh51j/test.o -laio -o /tmp/tmprm7vh51j/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer 

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.733,0.148852,0.964291,42400.0,0.958938
50,0.0677,0.058358,0.847636,84800.0,0.985805
75,0.0505,0.047155,0.563039,127200.0,0.989627
100,0.0236,0.042242,0.648839,169441.0,0.991557
125,0.0337,0.036993,0.742724,211841.0,0.990569
150,0.0304,0.029238,0.90338,254241.0,0.993263
175,0.0325,0.027904,0.880565,296641.0,0.993982
200,0.0183,0.025252,0.935706,338882.0,0.994202
225,0.0175,0.024429,0.799334,381282.0,0.994556
250,0.0181,0.023126,0.71922,423682.0,0.994845


TrainOutput(global_step=460, training_loss=0.1131415413611609, metrics={'train_runtime': 293.9904, 'train_samples_per_second': 50.019, 'train_steps_per_second': 1.565, 'total_flos': 486898968779520.0, 'train_loss': 0.1131415413611609, 'epoch': 5.0})

In [23]:
# Write the trained model to the directory that the inference cell loads from.
trainer.save_model(output_dir="./gemma-timer-name-lora")

In [24]:
from transformers import logging

# Only show errors from transformers from here on. `logging` is the
# transformers logging module imported above, not the standard-library one.
logging.set_verbosity(logging.ERROR)

In [25]:
from transformers import pipeline
from peft import PeftModel

# Directory the adapter was saved to after training.
adapter_dir = "./gemma-timer-name-lora"

# Reload a fresh base model and attach the saved adapter to it. Note that this
# rebinds `model`, which until now referred to the training model.
base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")
model = PeftModel.from_pretrained(base_model, adapter_dir, device_map="cuda")

text_gen = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [29]:
from tqdm import tqdm


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=64,
):
    """Measure exact-match accuracy of the generation pipeline on a dataset.

    For every record, the first line of its ``text`` (the ``USER:...`` line)
    is fed to the module-level ``text_gen`` pipeline with greedy decoding.
    The prediction counts as correct only if the generated text equals the
    expected text (EOS token removed), ignoring trailing whitespace.
    Mismatches are written to ``log_file``; the accuracy and counts are
    printed.

    Args:
        dataset: Dataset whose slices expose a ``"text"`` list of formatted
            samples.
        log_file: Path of the file that receives the mismatches. It is
            overwritten on every call.
        batch_size: Number of prompts sent to the pipeline per call.
    """
    correct = 0
    total = len(dataset)

    # Explicit UTF-8: the logged samples contain non-ASCII text (the data is
    # English, Russian and German), which a platform default encoding may
    # not be able to write.
    with open(log_file, "w", encoding="utf-8") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = [
                text.replace(tokenizer.eos_token, "")
                for text in dataset[i : i + batch_size]["text"]
            ]
            # The prompt is the USER line only; the model has to produce the rest.
            prefixes = [text.split("\n")[0] + "\n" for text in texts]
            gen_outs = text_gen(
                prefixes,
                num_beams=1,
                do_sample=False,
                batch_size=batch_size,
            )
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                # A bare prefix check would also accept over-generated
                # answers (expected "NAME:roast", generated "NAME:roast beef"),
                # so require that nothing but whitespace follows.
                if gen_text.startswith(text) and not gen_text[len(text) :].strip():
                    correct += 1
                else:
                    print(f"Mismatch: {text} -> {gen_text}\n", file=file)

    # Guard against an empty dataset instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print(f"Validation accuracy: {accuracy:.4f}")
    print(f"Correct: {correct}, total: {total}")

In [30]:
# Score the validation split; mismatches are written to eval.log.
evaluate_accuracy(dataset=dataset["validation"], log_file="eval.log")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:14<00:00,  1.17s/it]

Validation accuracy: 0.9168
Correct: 672, total: 733





In [31]:
# Show the mismatches that the evaluation wrote to eval.log.
%cat eval.log

Mismatch: USER:Kannst du einen 6-Minuten-Timer für das Brot starten?
HOURS:0
MINUTES:6
SECONDS:0
NAME:_ -> USER:Kannst du einen 6-Minuten-Timer für das Brot starten?
HOURS:0
MINUTES:6
SECONDS:0
NAME:Brot

Mismatch: USER:Setz mir einen 66 Sekunden Timer, nur Probe
HOURS:0
MINUTES:0
SECONDS:66
NAME:_ -> USER:Setz mir einen 66 Sekunden Timer, nur Probe
HOURS:0
MINUTES:0
SECONDS:66
NAME:Probe

Mismatch: USER:Starte bitte einen 81-Sekunden-Timer
HOURS:0
MINUTES:1
SECONDS:21
NAME:_ -> USER:Starte bitte einen 81-Sekunden-Timer
HOURS:0
MINUTES:0
SECONDS:81
NAME:_

Mismatch: USER:launch timer for studying in 90 minutes
HOURS:1
MINUTES:30
SECONDS:0
NAME:studying -> USER:launch timer for studying in 90 minutes
HOURS:0
MINUTES:90
SECONDS:0
NAME:studying

Mismatch: USER:start a timer to remind me in 33 seconds
HOURS:0
MINUTES:0
SECONDS:33
NAME:_ -> USER:start a timer to remind me in 33 seconds
HOURS:0
MINUTES:0
SECONDS:33
NAME:reminder

Mismatch: USER:Bitte starte einen 52-Sekunden-Timer für die In

In [2]:
# Convert the saved LoRA adapter directory to a single GGUF file by running
# the convert_lora_to_gguf.py script through `uv`.
# NOTE(review): the execution count (In [2]) suggests this cell was run in a
# different kernel session than the cells above — confirm the adapter
# directory exists before running it.
!uv run convert_lora_to_gguf.py ./gemma-timer-name-lora --outfile gemma-timer-name-lora.gguf

INFO:lora-to-gguf:Loading base model from Hugging Face: google/gemma-3-270m-it
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_a,      torch.float32 --> F32, shape = {2048, 16}
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_b,      torch.float32 --> F32, shape = {16, 640}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_a,      torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_b,      torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_b,        torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,        torch.float32 --> F32, shape = {16, 256}
INFO:hf-to-gguf:blk.0.attn_output.weight.lora_a,   torch.float32 --> F32, sh