In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [2]:
input_file = "chat_timer_data_eng.txt"
output_file = "unique_timers.jsonl"

unique_lines = set()

with open(input_file, "r", encoding="utf-8") as f:
    all_lines = f.readlines()
    for line in all_lines:
        unique_lines.add(line.strip())

with open(output_file, "w", encoding="utf-8") as f:
    for line in unique_lines:
        f.write(line + "\n")

print(f"Total lines: {len(all_lines)}")
print(f"Unique lines: {len(unique_lines)}")

Total lines: 3145
Unique lines: 2902


In [3]:
import random

with open("unique_timers.jsonl", "r", encoding="utf-8") as f:
    lines = f.readlines()

random.shuffle(lines)

val_len = int(0.2 * len(lines))

val_lines = lines[:val_len]
train_lines = lines[val_len:]

with open("val.jsonl", "w", encoding="utf-8") as f:
    f.writelines(val_lines)

with open("train.jsonl", "w", encoding="utf-8") as f:
    f.writelines(train_lines)

In [4]:
from datasets import load_dataset

dataset = load_dataset("json", data_files={"train": "train.jsonl", "validation": "val.jsonl"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [5]:
dataset["train"][0]

{'USER': 'Start a timer for sixty six seconds',
 'HOURS': 0,
 'MINUTES': 0,
 'SECONDS': 66}

In [6]:
def format_prompt(sample):
    user = sample["USER"]
    hours = sample["HOURS"]
    minutes = sample["MINUTES"]
    seconds = sample["SECONDS"]
    return f"USER:{user}\nHOURS:{hours}\nMINUTES:{minutes}\nSECONDS:{seconds}"

In [7]:
dataset["train"] = dataset["train"].map(lambda x: {"text": format_prompt(x)})
dataset["validation"] = dataset["validation"].map(lambda x: {"text": format_prompt(x)})

Map:   0%|          | 0/2322 [00:00<?, ? examples/s]

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

In [8]:
dataset["train"][0]

{'USER': 'Start a timer for sixty six seconds',
 'HOURS': 0,
 'MINUTES': 0,
 'SECONDS': 66,
 'text': 'USER:Start a timer for sixty six seconds\nHOURS:0\nMINUTES:0\nSECONDS:66'}

In [9]:
print(dataset["train"][0]["text"])

USER:Start a timer for sixty six seconds
HOURS:0
MINUTES:0
SECONDS:66


In [10]:
from transformers import AutoTokenizer

model_name = "google/gemma-3-270m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["train"]])

41

In [12]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["validation"]])

38

In [13]:
tokenizer.padding_side

'left'

In [14]:
def tokenize(sample):
    prompt = f"USER:{sample["USER"]}\n"
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(sample["text"], padding="max_length", max_length=42)
    pad_len = tokenized["input_ids"].count(tokenizer.pad_token_id)
    tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"][: pad_len + prompt_len] = [-100] * (pad_len + prompt_len)
    return tokenized

dataset["train"] = dataset["train"].map(tokenize, batched=False)
dataset["validation"] = dataset["validation"].map(tokenize, batched=False)

Map:   0%|          | 0/2322 [00:00<?, ? examples/s]

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

In [15]:
dataset["train"][0].keys()

dict_keys(['USER', 'HOURS', 'MINUTES', 'SECONDS', 'text', 'input_ids', 'attention_mask', 'labels'])

In [16]:
print(dataset["train"][0]["attention_mask"])
print(dataset["train"][0]["input_ids"])
print(dataset["train"][0]["labels"])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 20791, 236787, 6302, 496, 20342, 573, 41607, 3962, 9093, 107, 10858, 66481, 236787, 236771, 107, 16008, 80914, 236787, 236771, 107, 149542, 236787, 236825, 236825]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 10858, 66481, 236787, 236771, 107, 16008, 80914, 236787, 236771, 107, 149542, 236787, 236825, 236825]


In [17]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    attn_implementation="eager",
)


In [18]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [19]:
training_args = TrainingArguments(
    output_dir="./gemma-timer-lora",
    num_train_epochs=20,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=[],
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    args=training_args,
)

trainer.train()


[33mWARN[0m  Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.
[33mWARN[0m  Feature `utils/Perplexity` requires python GIL or Python >= 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.
[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Truncating train dataset:   0%|          | 0/2322 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/580 [00:00<?, ? examples/s]

INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpvosx9183/test.c -o /tmp/tmpvosx9183/test.o
INFO:root:cc -pthread /tmp/tmpvosx9183/test.o -laio -o /tmp/tmpvosx9183/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpe_4euwzl/test.c -o /tmp/tmpe_4euwzl/test.o
INFO:root:cc -pthread /tmp/tmpe_4euwzl/test.o -L/usr/local/cuda-12.6 -L/usr/local/cuda-12.6/lib64 -lcufile -o /tmp/tmpe_4euwzl/a.out
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpec_jdazh/test.c -o /tmp/tmpec_jdazh/test.o
INFO:root:cc -pthread /tmp/tmpec_jdazh/test.o -laio -o /tmp/tmpec_jdazh/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer 

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.0909,0.025912,1.642112,129780.0,0.993745
50,0.0119,0.013314,2.301833,259560.0,0.995575
75,0.0056,0.004033,1.858191,389340.0,0.998887
100,0.0011,0.002436,1.690483,514500.0,0.999135
125,0.0005,0.001515,1.709294,644280.0,0.99938
150,0.0002,0.001006,1.742289,774060.0,0.999627
175,0.0002,0.000948,1.75075,899220.0,0.999753
200,0.0001,0.000746,1.749091,1029000.0,0.99963
225,0.0002,0.000532,1.71603,1158780.0,0.999752
250,0.0,0.000458,1.698164,1283940.0,0.999752


TrainOutput(global_step=380, training_loss=0.026182587425252056, metrics={'train_runtime': 527.4356, 'train_samples_per_second': 88.049, 'train_steps_per_second': 0.72, 'total_flos': 1191359924858880.0, 'train_loss': 0.026182587425252056, 'epoch': 20.0})

In [20]:
trainer.save_model("./gemma-timer-lora")

In [21]:
from itertools import product

sample_template = "HOURS:{}\nMINUTES:{}\nSECONDS:{}"

max_new_tokens = max(
    [
        len(tokenizer.tokenize(sample_template.format(h, m, s)))
        for h, m, s in product(range(100), repeat=3)
    ]
)

KeyboardInterrupt: 

In [24]:
max_new_tokens

16

In [21]:
max_new_tokens = 16

In [22]:
from transformers import logging

logging.set_verbosity_error()

In [23]:
from transformers import pipeline
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")
model = PeftModel.from_pretrained(base_model, "./gemma-timer-lora", device_map="cuda")
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [24]:
from tqdm import tqdm


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=64,
):
    correct = 0
    total = len(dataset)

    with open(log_file, "w") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = dataset[i : i + batch_size]["text"]
            prefixes = [text.split("\n")[0] + "\n" for text in texts]
            gen_outs = text_gen(prefixes, max_new_tokens=max_new_tokens, num_beams=1, do_sample=False)
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                if len(gen_text) >= len(text) and text == gen_text[: len(text)]:
                    correct += 1
                else:
                    print(f"Mismatch: {text} -> {gen_text}\n", file=file)

    return correct / total

In [25]:
acc = evaluate_accuracy(dataset["validation"], log_file="eval.log")
print(f"Validation accuracy: {acc:.4f}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [06:10<00:00, 37.00s/it]

Validation accuracy: 0.9621





In [26]:
%cat eval.log

Mismatch: USER:start a quarter of an hour timer
HOURS:0
MINUTES:15
SECONDS:0 -> USER:start a quarter of an hour timer
HOURS:0
MINUTES:30
SECONDS:0
PI

Mismatch: USER:Hey assistant, set a timer for a quarter of an hour.
HOURS:0
MINUTES:15
SECONDS:0 -> USER:Hey assistant, set a timer for a quarter of an hour.
HOURS:0
MINUTES:30
SECONDS:0
PI

Mismatch: USER:start a timer for three minutes and forty seconds
HOURS:0
MINUTES:3
SECONDS:40 -> USER:start a timer for three minutes and forty seconds
HOURS:0
MINUTES:30
SECONDS:40


Mismatch: USER:I need a timer set for 7 hours and 7 minutes.
HOURS:7
MINUTES:7
SECONDS:0 -> USER:I need a timer set for 7 hours and 7 minutes.
HOURS:7
MINUTES:0
SECONDS:0
SECONDSPI

Mismatch: USER:Oh put on a 73 second timer quickly
HOURS:0
MINUTES:0
SECONDS:73 -> USER:Oh put on a 73 second timer quickly
HOURS:0
MINUTES:73
SECONDS:0
SECONDS

Mismatch: USER:Start me a timer for seven hours and twelve minutes
HOURS:7
MINUTES:12
SECONDS:0 -> USER:Start me a timer for seven

In [None]:
# %mkdir gemma-3-270m
# !hf download google/gemma-3-270m --local-dir gemma-3-270m

In [42]:
%mkdir gemma-timer-adapter-for-hf
%cd gemma-timer-lora
%cp adapter_model.safetensors adapter_config.json ../gemma-timer-adapter-for-hf/
%cd ..

/home/dmitrievan/pln/kaia_exps/gemma-timer-lora
/home/dmitrievan/pln/kaia_exps


In [None]:
# %git clone https://github.com/ggml-org/llama.cpp.git llama-cpp-repo
# %cd llama-cpp-repo/gguf-py
# %uv pip install .
# %uv add mistral_common
# %cd ../..
# %wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_lora_to_gguf.py
# %wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_hf_to_gguf.py

In [43]:
!uv run convert_lora_to_gguf.py ./gemma-timer-adapter-for-hf --outfile gemma-3-270m-lora.gguf
# !uv run convert_hf_to_gguf.py ./gemma-3-270m --outfile gemma-3-270m.gguf

INFO:lora-to-gguf:Loading base model from Hugging Face: google/gemma-3-270m
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,        torch.float32 --> F32, shape = {16, 256}
INFO:hf-to-gguf:blk.0.attn_output.weight.lora_a,   torch.float32 --> F32, shape = {1024, 16}
INFO:hf-to-gguf:blk.0.attn_output.weight.lora_b,   torch.float32 --> F32, shape = {16, 640}
INFO:hf-to-gguf:blk.0.attn_q.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_q.weight.lora_b,        torch.float32 --> F32, shape = {16, 1024}
INFO:hf-to-gguf:blk.0.attn_v.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_v.weight.lora_b,        torch.float32 --> F32, shape = {16, 256}
INFO:hf-to-gguf:blk.1.attn_k.weight.lora_a,        torch.float32 --> F32, shape 