In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [None]:
from huggingface_hub import login

your_token = input()
login(token=your_token)

In [2]:
model_name = "Qwen/Qwen3-0.6B"

In [3]:
from datasets import load_dataset

dataset = load_dataset(
    "json", data_files={"train": "datalines/train.jsonl", "validation": "datalines/val.jsonl"}
)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
def format_prompt(sample):
    user = sample["USER"]
    hours = sample["HOURS"]
    minutes = sample["MINUTES"]
    seconds = sample["SECONDS"]
    return f"USER¶{user}¶HOURS¶{hours}¶MINUTES¶{minutes}¶SECONDS¶{seconds}" + tokenizer.eos_token

In [6]:
dataset["train"] = dataset["train"].map(lambda x: {"text": format_prompt(x)})
dataset["validation"] = dataset["validation"].map(lambda x: {"text": format_prompt(x)})

Map:   0%|          | 0/2678 [00:00<?, ? examples/s]

Map:   0%|          | 0/669 [00:00<?, ? examples/s]

In [7]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["train"]])

42

In [8]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["validation"]])

41

In [9]:
tokenizer.padding_side

'right'

In [10]:
def tokenize(sample):
    prompt = f"USER¶{sample['USER']}¶"
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(
        sample["text"],
        padding="max_length",
        max_length=42,
        truncation=True,
    )
    pad_len = tokenized["input_ids"].count(tokenizer.pad_token_id)
    tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"][:prompt_len] = [-100] * prompt_len
    if pad_len > 0:
        tokenized["labels"][-pad_len:] = [-100] * pad_len

    return tokenized


dataset["train"] = dataset["train"].map(tokenize, batched=False)
dataset["validation"] = dataset["validation"].map(tokenize, batched=False)


Map:   0%|          | 0/2678 [00:00<?, ? examples/s]

Map:   0%|          | 0/669 [00:00<?, ? examples/s]

In [11]:
print(dataset["train"][0]["attention_mask"])
print(dataset["train"][0]["input_ids"])
print(dataset["train"][0]["labels"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[6448, 54509, 40, 1184, 264, 9021, 369, 2326, 4115, 54509, 39, 59273, 54509, 18, 54509, 16413, 53785, 54509, 15, 54509, 925, 26904, 54509, 15, 151645, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 39, 59273, 54509, 18, 54509, 16413, 53785, 54509, 15, 54509, 925, 26904, 54509, 15, 151645, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [12]:
tokenizer.pad_token_id

151643

In [13]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    attn_implementation="eager",
)

In [14]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [15]:
training_args = TrainingArguments(
    output_dir="./qwen-timer-lora",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=[],
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    seed=887,
    fp16=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    args=training_args,
)

trainer.train()


[33mWARN[0m  Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.
[33mWARN[0m  Feature `utils/Perplexity` requires python GIL or Python >= 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.
[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Truncating train dataset:   0%|          | 0/2678 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/669 [00:00<?, ? examples/s]

INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmp74vex3e2/test.c -o /tmp/tmp74vex3e2/test.o
INFO:root:cc -pthread /tmp/tmp74vex3e2/test.o -laio -o /tmp/tmp74vex3e2/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpz_nyb9r8/test.c -o /tmp/tmpz_nyb9r8/test.o
INFO:root:cc -pthread /tmp/tmpz_nyb9r8/test.o -L/usr/local/cuda-12.6 -L/usr/local/cuda-12.6/lib64 -lcufile -o /tmp/tmpz_nyb9r8/a.out
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpghvxfuf7/test.c -o /tmp/tmpghvxfuf7/test.o
INFO:root:cc -pthread /tmp/tmpghvxfuf7/test.o -laio -o /tmp/tmpghvxfuf7/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer 

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.1393,0.01072,1.971098,33600.0,0.99694
50,0.003,0.007118,1.881108,67200.0,0.998031
75,0.0053,0.003278,2.026359,100800.0,0.999292
100,0.0022,0.003841,1.999228,133980.0,0.999386
125,0.0002,0.005311,2.057052,167580.0,0.998768
150,0.0017,0.004565,1.997125,201180.0,0.999198
175,0.0004,0.005022,1.967222,234360.0,0.999289
200,0.0003,0.005045,1.957014,267960.0,0.999289
225,0.0,0.005022,1.956357,301560.0,0.999289
250,0.001,0.00499,1.957504,335160.0,0.999289


TrainOutput(global_step=252, training_loss=0.09902305537211543, metrics={'train_runtime': 180.0699, 'train_samples_per_second': 44.616, 'train_steps_per_second': 1.399, 'total_flos': 912189358080000.0, 'train_loss': 0.09902305537211543, 'epoch': 3.0})

In [16]:
trainer.save_model("./qwen-timer-lora")

In [17]:
from itertools import product

sample_template = "HOURS¶{}¶MINUTES¶{}¶SECONDS¶{}" + tokenizer.eos_token

max_new_tokens = max(
    [
        len(tokenizer.tokenize(sample_template.format(h, m, s)))
        for h, m, s in product(range(100), repeat=3)
    ]
)

In [18]:
max_new_tokens

18

In [19]:
max_new_tokens = 18

In [20]:
from transformers import logging

logging.set_verbosity_error()

In [21]:
from transformers import pipeline
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")
model = PeftModel.from_pretrained(base_model, "./qwen-timer-lora", device_map="cuda")
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [26]:
from tqdm import tqdm


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=64,
):
    correct = 0
    total = len(dataset)

    with open(log_file, "w") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = [
                text.replace(tokenizer.eos_token, "")
                for text in dataset[i : i + batch_size]["text"]
            ]
            prefixes = ["¶".join(text.split("¶")[:2]) + "¶" for text in texts]
            print(prefixes[0])
            gen_outs = text_gen(
                prefixes,
                max_new_tokens=max_new_tokens,
                num_beams=1,
                do_sample=False,
                batch_size=batch_size,
            )
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                if len(gen_text) >= len(text) and text == gen_text[: len(text)]:
                    correct += 1
                else:
                    print(f"Mismatch: {text} -> {gen_text}\n", file=file)

    return correct / total

In [27]:
acc = evaluate_accuracy(dataset["validation"], log_file="eval.log")
print(f"Validation accuracy: {acc:.4f}")

  0%|                                                                                                                                                      | 0/11 [00:00<?, ?it/s]

USER¶I need you to start a timer for 16 hours.¶


  9%|████████████▉                                                                                                                                 | 1/11 [00:01<00:12,  1.25s/it]

USER¶please start a timer for 4 minutes¶


 18%|█████████████████████████▊                                                                                                                    | 2/11 [00:02<00:11,  1.25s/it]

USER¶Please set a half hour timer¶


 27%|██████████████████████████████████████▋                                                                                                       | 3/11 [00:03<00:09,  1.25s/it]

USER¶please start a timer for 1 hour and 20 minutes¶


 36%|███████████████████████████████████████████████████▋                                                                                          | 4/11 [00:04<00:08,  1.25s/it]

USER¶Hey set a timer for three hours¶


 45%|████████████████████████████████████████████████████████████████▌                                                                             | 5/11 [00:06<00:07,  1.25s/it]


USER¶start a timer for seven hours, thirty-four minutes and eighteen seconds¶


KeyboardInterrupt: 

In [24]:
print(dataset["validation"][0]["text"])

USER¶I need you to start a timer for 16 hours.¶HOURS¶16¶MINUTES¶0¶SECONDS¶0<|im_end|>


In [25]:
%cat eval.log

Mismatch: USER¶I need you to start a timer for 16 hours.¶HOURS¶16¶MINUTES¶0¶SECONDS¶0 -> USER¶I need you to start a timer for 16 hours.¶

Mismatch: USER¶i want a 3 hour, 15 minute timer¶HOURS¶3¶MINUTES¶15¶SECONDS¶0 -> USER¶i want a 3 hour, 15 minute timer¶¶HOURS¶3¶MINUTES¶15¶SECONDS¶0

Mismatch: USER¶start a 1 hour 59 minute timer¶HOURS¶1¶MINUTES¶59¶SECONDS¶0 -> USER¶start a 1 hour 59 minute timer¶

Mismatch: USER¶Start a timer for forty four hours¶HOURS¶44¶MINUTES¶0¶SECONDS¶0 -> USER¶Start a timer for forty four hours¶

Mismatch: USER¶Start a timer for 90 minutes.¶HOURS¶0¶MINUTES¶90¶SECONDS¶0 -> USER¶Start a timer for 90 minutes.¶90

Mismatch: USER¶one and three quarter hour timer with 13 seconds, start¶HOURS¶1¶MINUTES¶45¶SECONDS¶13 -> USER¶one and three quarter hour timer with 13 seconds, start¶¶HOURS¶1¶MINUTES¶15¶SECONDS¶13

Mismatch: USER¶start timer for three hours and twenty seven minutes¶HOURS¶3¶MINUTES¶27¶SECONDS¶0 -> USER¶start timer for three hours and twenty seven minutes¶



In [None]:
# !git clone https://github.com/ggml-org/llama.cpp.git llama-cpp-repo
# %cd llama-cpp-repo/gguf-py
# !uv pip install .
# %cd ../..
# !uv add mistral_common
# !wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_lora_to_gguf.py
# !wget https://raw.githubusercontent.com/ggml-org/llama.cpp/master/convert_hf_to_gguf.py

Cloning into 'llama-cpp-repo'...
remote: Enumerating objects: 65337, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 65337 (delta 5), reused 5 (delta 1), pack-reused 65321 (from 2)[K
Receiving objects: 100% (65337/65337), 181.28 MiB | 4.42 MiB/s, done.
Resolving deltas: 100% (47527/47527), done.
/home/dmitrievan/kaia_exps/kaia_exps/llama-cpp-repo/gguf-py
[2mUsing Python 3.12.9 environment at: /home/dmitrievan/kaia_exps/.venv[0m
[2K[2mResolved [1m4 packages[0m [2min 601ms[0m[0m                                         [0m
[2K[2mPrepared [1m1 package[0m [2min 549ms[0m[0m                                              
[2mUninstalled [1m1 package[0m [2min 2ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 7ms[0m[0mfile:///home/dmitrievan/kaia_exps/[0m
 [33m~[39m [1mgguf[0m[2m==0.17.1 (from file:///home/dmitrievan/kaia_exps/kaia_exps/llama-cpp-repo/gguf-py)[0m
/home/dmitrievan/kaia_e

In [None]:
!uv run convert_lora_to_gguf.py ./qwen-timer-lora --outfile qwen-timer-lora.gguf

INFO:lora-to-gguf:Loading base model from Hugging Face: google/gemma-3-270m-it
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:lora-to-gguf:Exporting model...
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_a,      torch.float32 --> F32, shape = {2048, 16}
INFO:hf-to-gguf:blk.0.ffn_down.weight.lora_b,      torch.float32 --> F32, shape = {16, 640}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_a,      torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_gate.weight.lora_b,      torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.ffn_up.weight.lora_b,        torch.float32 --> F32, shape = {16, 2048}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_a,        torch.float32 --> F32, shape = {640, 16}
INFO:hf-to-gguf:blk.0.attn_k.weight.lora_b,        torch.float32 --> F32, shape = {16, 256}
INFO:hf-to-gguf:blk.0.attn_output.weight.lora_a,   torch.float32 --> F32, sh