In [1]:
import os

# Expose only the GPU with index 4 to this process.
# NOTE(review): presumably this has to run before any CUDA-using library is imported — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})

In [None]:
import random

# Raw dataset plus the files this cell derives from it.
input_file = "datalines/chat_timer_data_eng_extended.jsonl"
output_file = "datalines/unique_timers.jsonl"
val_file = "datalines/val.jsonl"
train_file = "datalines/train.jsonl"

# Fixed seed so that Restart & Run All reproduces the same train/validation split.
SPLIT_SEED = 887
VAL_FRACTION = 0.2

with open(input_file, "r", encoding="utf-8") as f:
    all_lines = f.readlines()

# De-duplicate on the stripped line.
# * dict.fromkeys keeps first-seen order; a set of strings iterates in a
#   hash-dependent order that can change from one kernel start to the next.
# * Blank lines are dropped so they cannot end up as empty JSONL records.
stripped_lines = (line.strip() for line in all_lines)
unique_lines = list(dict.fromkeys(line for line in stripped_lines if line))

with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in unique_lines)

print(f"Total lines: {len(all_lines)}")
print(f"Unique lines: {len(unique_lines)}")

# Same content as the file just written, so there is no need to read it back.
lines = [line + "\n" for line in unique_lines]
random.Random(SPLIT_SEED).shuffle(lines)

# The first VAL_FRACTION of the shuffled records is validation, the rest is training.
val_len = int(VAL_FRACTION * len(lines))
val_lines = lines[:val_len]
train_lines = lines[val_len:]

with open(val_file, "w", encoding="utf-8") as f:
    f.writelines(val_lines)

with open(train_file, "w", encoding="utf-8") as f:
    f.writelines(train_lines)

In [2]:
from datasets import load_dataset

# Map each split name to the JSONL file produced by the split step.
data_files = {
    "train": "datalines/train.jsonl",
    "validation": "datalines/val.jsonl",
}
dataset = load_dataset("json", data_files=data_files)

In [3]:
from transformers import AutoTokenizer

# Checkpoint id shared by the tokenizer and by both model loads further down.
model_name = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [4]:
def format_prompt(sample):
    """Render one record as the training text.

    The result is four ``KEY:value`` lines (USER, HOURS, MINUTES, SECONDS, in
    that order) joined by newlines, followed directly by the tokenizer's EOS
    token string.

    Args:
        sample: Mapping with the keys "USER", "HOURS", "MINUTES" and "SECONDS".

    Returns:
        The formatted string, ending with ``tokenizer.eos_token``.
    """
    field_order = ("USER", "HOURS", "MINUTES", "SECONDS")
    rendered = "\n".join(f"{key}:{sample[key]}" for key in field_order)
    return rendered + tokenizer.eos_token

In [5]:
# Add the formatted "text" column to both splits.
for split in ("train", "validation"):
    dataset[split] = dataset[split].map(lambda row: {"text": format_prompt(row)})

In [6]:
# Sanity check: first training record, including the newly added "text" field.
dataset["train"][0]

{'USER': 'set a timer for 8 hours 9 minutes',
 'HOURS': 8,
 'MINUTES': 9,
 'SECONDS': 0,
 'text': 'USER:set a timer for 8 hours 9 minutes\nHOURS:8\nMINUTES:9\nSECONDS:0<eos>'}

In [7]:
# Print the formatted text so the newlines are rendered instead of escaped.
print(dataset["train"][0]["text"])

USER:set a timer for 8 hours 9 minutes
HOURS:8
MINUTES:9
SECONDS:0<eos>


In [8]:
# Longest tokenised training text; used to judge the padding length chosen for tokenisation.
train_token_counts = (len(tokenizer(row["text"])["input_ids"]) for row in dataset["train"])
max(train_token_counts)

42

In [9]:
# Longest tokenised validation text.
val_token_counts = (len(tokenizer(row["text"])["input_ids"]) for row in dataset["validation"])
max(val_token_counts)

41

In [10]:
# Check which side the tokenizer pads on: the label masking in the tokenisation
# step has to line up with where the pad tokens land.
tokenizer.padding_side

'left'

In [11]:
def tokenize(sample, max_length=45):
    """Tokenise one record and build labels that only supervise the answer.

    The full text is padded to ``max_length`` tokens. Labels are a copy of
    ``input_ids`` in which every padding position and the first ``prompt_len``
    real tokens (the ``USER:...\\n`` prompt) are replaced by -100, so only the
    HOURS/MINUTES/SECONDS part and the trailing EOS keep their token ids.

    Padding positions are taken from ``attention_mask`` rather than by counting
    ``pad_token_id`` values, so the masking is correct whichever side the
    tokenizer pads on and even if the pad id also occurs as a content token.

    Args:
        sample: Mapping with "USER" (the request) and "text" (formatted record).
        max_length: Padded sequence length; no truncation is requested, so
            longer texts keep their full length.

    Returns:
        The tokenizer output with an added "labels" list.
    """
    # Single quotes inside the replacement field keep this valid before Python 3.12.
    prompt = f"USER:{sample['USER']}\n"
    # NOTE(review): assumes tokenising the prompt on its own yields the same tokens
    # as the start of the full text (true for the example printed in this notebook).
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(sample["text"], padding="max_length", max_length=max_length)

    labels = []
    real_tokens_seen = 0
    for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        if is_real and real_tokens_seen >= prompt_len:
            labels.append(token_id)
        else:
            # Padding or prompt token: excluded from the loss.
            labels.append(-100)
        real_tokens_seen += is_real

    tokenized["labels"] = labels
    return tokenized

# Tokenise both splits one record at a time.
for split in ("train", "validation"):
    dataset[split] = dataset[split].map(tokenize, batched=False)

Map:   0%|          | 0/669 [00:00<?, ? examples/s]

In [12]:
# Confirm the tokenised columns were added alongside the original fields.
dataset["train"][0].keys()

dict_keys(['USER', 'HOURS', 'MINUTES', 'SECONDS', 'text', 'input_ids', 'attention_mask', 'labels'])

In [13]:
# Eyeball padding, token ids and the -100 label mask for the first training example.
first_example = dataset["train"][0]
for column in ("attention_mask", "input_ids", "labels"):
    print(first_example[column])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 20791, 236787, 1025, 496, 20342, 573, 236743, 236828, 3885, 236743, 236819, 4310, 107, 10858, 66481, 236787, 236828, 107, 16008, 80914, 236787, 236819, 107, 149542, 236787, 236771, 1]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 10858, 66481, 236787, 236828, 107, 16008, 80914, 236787, 236819, 107, 149542, 236787, 236771, 1]


In [14]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


# Base model used for training; the KV cache is switched off via use_cache=False.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    attn_implementation="eager",
    use_cache=False,
    device_map="auto",
)


In [15]:
# LoRA is attached to every attention projection and every MLP projection.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.0,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

In [16]:
# Hyper-parameters grouped by concern; every value is unchanged.
training_args = TrainingArguments(
    # Run identity / reproducibility
    output_dir="./gemma-timer-lora",
    seed=887,
    report_to=[],
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    fp16=True,
    # Logging, evaluation and checkpointing (evaluation and saving share the 25-step grid)
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    # Best-checkpoint selection: lowest validation loss wins
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# The datasets already carry input_ids / attention_mask / labels from the tokenisation step.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

trainer.train()

Truncating eval dataset:   0%|          | 0/669 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.1883,0.048274,1.184469,36000.0,0.990206
50,0.0197,0.018838,0.567598,72000.0,0.994447
75,0.0054,0.008183,0.432888,108000.0,0.9985
100,0.0056,0.006136,0.524155,143550.0,0.998597
125,0.0046,0.003961,0.527374,179550.0,0.999201
150,0.0007,0.002866,0.522301,215550.0,0.999298
175,0.0003,0.002122,0.499526,251100.0,0.999399
200,0.0002,0.00201,0.491925,287100.0,0.999399
225,0.0004,0.002037,0.492619,323100.0,0.999399
250,0.0001,0.002011,0.492621,359100.0,0.999399


TrainOutput(global_step=252, training_loss=0.11090628202059417, metrics={'train_runtime': 149.3863, 'train_samples_per_second': 53.78, 'train_steps_per_second': 1.687, 'total_flos': 225861546493440.0, 'train_loss': 0.11090628202059417, 'epoch': 3.0})

In [17]:
# Persist the trained adapter; the evaluation and GGUF conversion cells read it from this path.
trainer.save_model(output_dir="./gemma-timer-lora")

In [None]:
# from itertools import product

# sample_template = "HOURS:{}\nMINUTES:{}\nSECONDS:{}" + tokenizer.eos_token

# max_new_tokens = max(
#     [
#         len(tokenizer.tokenize(sample_template.format(h, m, s)))
#         for h, m, s in product(range(100), repeat=3)
#     ]
# )

In [18]:
from transformers import logging

# Only surface error-level messages from transformers from here on,
# so the batched generation below does not flood the output.
logging.set_verbosity_error()

In [19]:
from transformers import pipeline
from peft import PeftModel

# Reload a fresh base model and attach the saved LoRA adapter for inference.
# Note: `model` is rebound here and no longer refers to the training-time model.
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    device_map="cuda",
)
model = PeftModel.from_pretrained(
    base_model,
    "./gemma-timer-lora",
    device_map="cuda",
)
text_gen = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [29]:
from tqdm import tqdm


def _matches_expected(expected, generated):
    """Return True when `generated` reproduces `expected` as a complete answer.

    Text after the expected record is tolerated, unless it starts with a digit:
    that would extend the final number (expected ``SECONDS:5``, generated
    ``SECONDS:52``) and a plain prefix comparison would wrongly accept it.
    """
    if not generated.startswith(expected):
        return False
    # Empty string (exact match) is not a digit, so it passes.
    next_char = generated[len(expected) : len(expected) + 1]
    return not next_char.isdigit()


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=64,
):
    """Measure exact-record accuracy of `text_gen` on `dataset`.

    For each record the EOS token string is removed from its "text", the first
    line (the ``USER:...`` request) plus a newline is used as the prompt, and
    the greedy generation is compared with the full expected text. Every
    mismatch is written to `log_file` as ``Mismatch: <expected> -> <generated>``.

    Args:
        dataset: Object supporting ``len()`` and slice indexing that returns a
            mapping with a "text" list (as the datasets used in this notebook do).
        log_file: Path of the mismatch log; overwritten on every call.
        batch_size: Number of prompts sent to the pipeline per call.

    Returns:
        Fraction of records reproduced correctly; 0.0 for an empty dataset
        (instead of raising ZeroDivisionError).
    """
    correct = 0
    total = len(dataset)

    # Explicit encoding: the prompts are free text and may contain non-ASCII characters.
    with open(log_file, "w", encoding="utf-8") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = [
                text.replace(tokenizer.eos_token, "")
                for text in dataset[i : i + batch_size]["text"]
            ]
            # Prompt = the USER line only; the model has to produce the rest.
            prefixes = [text.split("\n")[0] + "\n" for text in texts]
            gen_outs = text_gen(
                prefixes,
                do_sample=False,
                batch_size=batch_size,
            )
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                if _matches_expected(text, gen_text):
                    correct += 1
                else:
                    print(f"Mismatch: {text} -> {gen_text}\n", file=file)

    return correct / total if total else 0.0

In [30]:
# Score the fine-tuned model on the validation split; mismatches are written to eval.log.
acc = evaluate_accuracy(dataset["validation"], log_file="eval.log")
print(f"Validation accuracy: {acc:.4f}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:09<00:00,  1.19it/s]

Validation accuracy: 0.9925





In [31]:
# Show the logged mismatches, each written as "Mismatch: <expected> -> <generated>".
%cat eval.log

Mismatch: USER:half a minute and 22 seconds, go
HOURS:0
MINUTES:0
SECONDS:52 -> USER:half a minute and 22 seconds, go
HOURS:0
MINUTES:0
SECONDS:22

Mismatch: USER:set a timer for half a minute and 30 seconds
HOURS:0
MINUTES:0
SECONDS:60 -> USER:set a timer for half a minute and 30 seconds
HOURS:0
MINUTES:0
SECONDS:30

Mismatch: USER:half a minute plus 5 seconds, go ahead
HOURS:0
MINUTES:0
SECONDS:35 -> USER:half a minute plus 5 seconds, go ahead
HOURS:0
MINUTES:0
SECONDS:5

Mismatch: USER:half a minute and 20 seconds, fire
HOURS:0
MINUTES:0
SECONDS:50 -> USER:half a minute and 20 seconds, fire
HOURS:0
MINUTES:0
SECONDS:40

Mismatch: USER:start half a minute plus 20 seconds
HOURS:0
MINUTES:0
SECONDS:50 -> USER:start half a minute plus 20 seconds
HOURS:0
MINUTES:0
SECONDS:40



In [None]:
!git clone https://github.com/ggml-org/llama.cpp.git llama-cpp-repo
%cd llama-cpp-repo/gguf-py
!uv pip install .
%cd ../..
%rm -rf llama-cpp-repo
!wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_lora_to_gguf.py
!wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_hf_to_gguf.py

Cloning into 'llama-cpp-repo'...
remote: Enumerating objects: 70177, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (228/228), done.[K
remote: Total 70177 (delta 234), reused 127 (delta 125), pack-reused 69824 (from 3)[K
Receiving objects: 100% (70177/70177), 216.13 MiB | 3.18 MiB/s, done.
Resolving deltas: 100% (50721/50721), done.
/home/dmitrievan/kaia_exps/kaia_exps/llama-cpp-repo/gguf-py
[2mUsing Python 3.13.2 environment at: /home/dmitrievan/kaia_exps/.venv[0m
[2K[2mResolved [1m4 packages[0m [2min 861ms[0m[0m                                         [0m
[2K[2mPrepared [1m1 package[0m [2min 569ms[0m[0m                                              
[2K[2mInstalled [1m1 package[0m [2min 8ms[0m[0mfile:///home/dmitrievan/kaia_exps/[0m
 [32m+[39m [1mgguf[0m[2m==0.17.1 (from file:///home/dmitrievan/kaia_exps/kaia_exps/llama-cpp-repo/gguf-py)[0m
/home/dmitrievan/kaia_exps/kaia_exps
--2025-12-02 16:57:53--  http

In [35]:
# Convert the LoRA adapter saved in ./gemma-timer-lora into a GGUF file for llama.cpp.
!uv run convert_lora_to_gguf.py ./gemma-timer-lora --outfile gemma-3-270m-lora.gguf

INFO:lora-to-gguf:Loading base model from Hugging Face: google/gemma-3-270m-it
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_a,      torch.float32 --> F32, shape = {2048, 16}
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_b,      torch.float32 --> F32, shape = {16, 640}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_a,      torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_b,      torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_b,        torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,        torch.float32 --> F32, shape = {16, 256}
INFO:hf-to-ggu