In [None]:
from huggingface_hub import login

your_token = ""
login(token=your_token)

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
model_name = "Qwen/Qwen3-0.6B"

In [4]:
from datasets import load_dataset

dataset = load_dataset("json", data_files={"train": "train.jsonl", "validation": "val.jsonl"})

In [5]:
def format_prompt(sample):
    user = sample["USER"]
    hours = sample["HOURS"]
    minutes = sample["MINUTES"]
    seconds = sample["SECONDS"]
    return f"USER:{user}\nHOURS:{hours}\nMINUTES:{minutes}\nSECONDS:{seconds}"

In [6]:
dataset["train"] = dataset["train"].map(lambda x: {"text": format_prompt(x)})
dataset["validation"] = dataset["validation"].map(lambda x: {"text": format_prompt(x)})

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["train"]])

51

In [9]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["validation"]])

49

In [10]:
tokenizer.padding_side

'right'

In [11]:
def tokenize(sample):
    prompt = f"USER:{sample['USER']}\n"
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(
        sample["text"],
        padding="max_length",
        max_length=52,
        truncation=True,
    )
    pad_len = tokenized["input_ids"].count(tokenizer.pad_token_id)
    tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"][:prompt_len] = [-100] * prompt_len
    if pad_len > 0:
        tokenized["labels"][-pad_len:] = [-100] * pad_len

    return tokenized


dataset["train"] = dataset["train"].map(tokenize, batched=False)
dataset["validation"] = dataset["validation"].map(tokenize, batched=False)

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

In [14]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)

In [15]:
from peft import LoraConfig
from trl import SFTTrainer


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "float16"
    }
)


In [None]:
# model.save_pretrained("qwen-base")
# tokenizer.save_pretrained("qwen-base")

In [16]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [17]:
training_args = TrainingArguments(
    output_dir="./qwen-timer-lora",
    num_train_epochs=20,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=[],
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    args=training_args,
)

trainer.train()

Truncating train dataset:   0%|          | 0/2316 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/578 [00:00<?, ? examples/s]

/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.0587,0.02594,1.590995,160368.0,0.993918
50,0.0099,0.010374,1.173177,320736.0,0.998056
75,0.0059,0.008261,1.216779,481104.0,0.998515
100,0.0033,0.007767,1.225939,635440.0,0.998629
125,0.0022,0.007531,1.192921,795808.0,0.998515
150,0.0019,0.007921,1.183865,956176.0,0.998401
175,0.0015,0.008433,1.15665,1110512.0,0.998285


KeyboardInterrupt: 

In [18]:
trainer.save_model("./qwen-timer-lora")

In [17]:
from itertools import product

sample_template = "HOURS:{}\nMINUTES:{}\nSECONDS:{}"

max_new_tokens = max(
    [
        len(tokenizer.tokenize(sample_template.format(h, m, s)))
        for h, m, s in product(range(100), repeat=3)
    ]
)

In [18]:
max_new_tokens

17

In [15]:
max_new_tokens = 18

In [16]:
from transformers import logging

logging.set_verbosity_error()

In [26]:
from transformers import pipeline
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    use_cache=False,
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "float16",
    },
)
model = PeftModel.from_pretrained(base_model, "./qwen-timer-lora", device_map="cuda")

In [27]:
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [28]:
text_gen("USER:set a timer for 5 seconds\n", max_new_tokens=18)

[{'generated_text': 'USER:set a timer for 5 seconds\nHOURS:0\nMINUTES:0\nSECONDS:5\nMINUTES:'}]

In [29]:
from tqdm import tqdm


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=64,
):
    correct = 0
    total = len(dataset)

    with open(log_file, "w") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = dataset[i : i + batch_size]["text"]
            prefixes = [text.split("\n")[0] + "\n" for text in texts]
            gen_outs = text_gen(prefixes, max_new_tokens=max_new_tokens, num_beams=1, do_sample=False, batch_size=batch_size)
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                if len(gen_text) >= len(text) and text == gen_text[: len(text)]:
                    correct += 1
                else:
                    print(f"Mismatch: {text} -> {gen_text}\n", file=file)

    return correct / total

In [30]:
acc = evaluate_accuracy(dataset["validation"], log_file="eval.log")
print(f"Validation accuracy: {acc:.4f}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:17<00:00,  1.76s/it]

Validation accuracy: 0.5657





In [31]:
%cat eval.log

Mismatch: USER:please set a timer for ninety-nine minutes
HOURS:0
MINUTES:99
SECONDS:0 -> USER:please set a timer for ninety-nine minutes
 HOURS:0
MINUTES:99
SECONDS:0
SECONDS

Mismatch: USER:start a timer for three hours and seven seconds
HOURS:3
MINUTES:0
SECONDS:7 -> USER:start a timer for three hours and seven seconds
 HOURS:3
 MINUTES:0
 SECONDS:7
 DESCRIPTION:H

Mismatch: USER:Could you create a timer for sixteen minutes and nineteen seconds
HOURS:0
MINUTES:16
SECONDS:19 -> USER:Could you create a timer for sixteen minutes and nineteen seconds
 HOURS:0
MINUTES:16
SECONDS:19
H

Mismatch: USER:Set a timer for eighteen hours.
HOURS:18
MINUTES:0
SECONDS:0 -> USER:Set a timer for eighteen hours.
:HOURS:18
MINUTES:0
SECONDS:0
MINUTES

Mismatch: USER:start a 90 second timer please
HOURS:0
MINUTES:0
SECONDS:90 -> USER:start a 90 second timer please

HOURS:0
MINUTES:0
SECONDS:90
MIN

Mismatch: USER:Put on a two hour thirty one minute and fifty eight second timer
HOURS:2
MINUTES:31
SECONDS