In [None]:
# Pool of prompts used to generate synthetic timer-request utterances.
# MY_TIMER_PROMPT (defined outside this cell) is listed six times, followed by
# ten paraphrased variants, so a uniform draw over this list selects
# MY_TIMER_PROMPT 6 times out of 16.
# Every variant asks for 30 JSONL objects with the keys "USER", "HOURS",
# "MINUTES", "SECONDS".
# NOTE(review): the variants disagree on some details -- some allow values in
# [0,99] while others say 1-99, and some require digits OR words per utterance
# while others ask for a mix. Confirm this spread is intentional.
TIMER_PROMPTS = [MY_TIMER_PROMPT] * 6 + [
    """You are helping build training data for a voice assistant. Generate 30 unique user utterances where someone asks to set a timer. 
Output them in JSONL format, one JSON object per line, with keys "USER", "HOURS", "MINUTES", "SECONDS". Stop after 30 lines. 
Follow these constraints:

- HOURS, MINUTES, SECONDS ∈ [0,99], values may be numerals ("5") or words ("five"). Zero values are allowed but not mentioned in USER text.
- Mix expressions: hours only, minutes only, seconds only, and combinations.
- Include both common durations (5 minutes, half an hour) and unusual ones (73 seconds, 91 minutes).
- Vary tones: polite, casual, imperative, exclamatory.
- No abbreviations ("min", "sec", "h"), keep it natural speech.
- Use either all digits or all words per utterance, not mixed.
- Do not add any extra fields.

Example format:
{"USER":"set a timer for five minutes","HOURS":0,"MINUTES":5,"SECONDS":0}
{"USER":"start a 45 second timer","HOURS":0,"MINUTES":0,"SECONDS":45}
""",
    """Produce 30 JSONL examples of natural spoken requests to set timers. 
Each JSON object has: "USER" (utterance), "HOURS", "MINUTES", "SECONDS". 
Make utterances varied, mixing digits and number words, different tones, and both typical and unusual durations. 
Stick to constraints: no abbreviations, no extra fields, durations 1–99, allow zero values (not spoken). Stop after 30 lines.""",
    """Create 30 unique JSON objects (JSONL) representing how users might ask a voice assistant to set a timer. 
Keys required: "USER", "HOURS", "MINUTES", "SECONDS". 
Constraints: durations 1–99, use digits OR words consistently per utterance, include hours/minutes/seconds combinations, natural speech only, variety in style. 
Stop at 30 objects. No commentary.""",
    """You are generating synthetic training data for timer requests. Output 30 JSON objects in JSONL. 
Each must have: "USER" (utterance), "HOURS", "MINUTES", "SECONDS". 
Constraints: 
- spoken-style timer requests only, 
- values in [0–99], 
- words OR digits per utterance (not both), 
- include both common and odd durations, 
- varied tones. 
No abbreviations, no extra fields. Stop after 30 lines.""",
    """Generate 30 diverse timer request utterances in JSONL. 
Each JSON object has "USER", "HOURS", "MINUTES", "SECONDS". 
Cover a mix of hours-only, minutes-only, seconds-only, and combined durations. 
Alternate between using number words and digits. 
Ensure variety in phrasing and tone. Keep natural speech. 
Stop after 30 lines. No commentary.""",
    """I need 30 JSONL examples of people asking to set timers. 
Each JSON object has fields "USER", "HOURS", "MINUTES", "SECONDS". 
Constraints: 
- HOURS, MINUTES, SECONDS ∈ [0,99], 
- utterances must sound like spoken requests, 
- sometimes digits, sometimes words, 
- typical durations + unusual ones, 
- avoid abbreviations. 
Stop at 30.""",
    """Please produce 30 JSON lines of timer requests for a dataset. 
Each object: "USER", "HOURS", "MINUTES", "SECONDS". 
Requirements: natural voice commands, numbers 1–99, words OR digits, no mixing. 
Include common and uncommon durations, vary tone. 
Stop after 30 lines, no extra commentary.""",
    """Write 30 synthetic training samples where a user asks to set a timer. 
Return JSONL objects with "USER", "HOURS", "MINUTES", "SECONDS". 
Constraints: 
- durations between 1 and 99, 
- zero allowed but not spoken, 
- spoken tone only, 
- variety in numbers and phrasing, 
- sometimes digits, sometimes words. 
Stop after 30 examples.""",
    """Generate 30 utterances (JSONL) of timer-setting requests. 
Each JSON object: "USER", "HOURS", "MINUTES", "SECONDS". 
Ensure examples vary in style, duration, and number format. 
Mix hours-only, minutes-only, seconds-only, and combos. 
Follow all constraints (spoken style, no abbreviations, digits OR words per utterance). Stop after 30.""",
    """Produce 30 examples of JSON objects for timer requests. 
Each line: "USER", "HOURS", "MINUTES", "SECONDS". 
Constraints: values 0–99, words or digits but not both, common and unusual durations, varied tones, no abbreviations. 
Stop at 30 lines. Do not add commentary.""",
]


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hugging Face Hub repository id of the checkpoint used for generation.
model_name = "Qwen/Qwen3-32B"

# The tokenizer also provides the chat template applied to each prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized model; device placement is left to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)


In [None]:
import json
from tqdm import tqdm
import random

target_size = 7000  # number of unique utterances to collect
seen = set()  # USER texts already written to disk; inspected by a later cell

# Token id of "</think>" in the Qwen3 tokenizer (the Qwen3 model card uses the
# same constant to split reasoning from the final answer).
THINK_END_TOKEN_ID = 151668

# Keys every generated record must carry to be usable as a training example.
REQUIRED_KEYS = ("USER", "HOURS", "MINUTES", "SECONDS")


def _parse_timer_record(line):
    """Parse one model-output line into a timer record.

    Args:
        line: A single (already stripped) line of model output.

    Returns:
        The decoded dict when ``line`` is a JSON object that contains every
        key in ``REQUIRED_KEYS`` and whose "USER" value is a non-empty string;
        otherwise ``None``.
    """
    try:
        obj = json.loads(line)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(obj, dict) or any(key not in obj for key in REQUIRED_KEYS):
        return None
    user_text = obj["USER"]
    if not isinstance(user_text, str) or not user_text.strip():
        return None
    return obj


# Both the output file and the progress bar are context-managed so they are
# closed even if generation is interrupted.
with open("timer_data.jsonl", "w", encoding="utf-8") as f, tqdm(total=target_size) as pbar:
    while len(seen) < target_size:
        # Draw a prompt at random; TIMER_PROMPTS repeats some entries to weight them.
        prompt = random.choice(TIMER_PROMPTS)
        messages = [{"role": "user", "content": prompt}]

        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
        )

        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        # Jitter the sampling temperature per request for extra diversity.
        temperature = random.uniform(0.8, 1.0)
        generated_ids = model.generate(
            **model_inputs, max_new_tokens=3000, do_sample=True, temperature=temperature, top_p=0.95
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
        try:
            # Position just past the LAST "</think>" token.
            index = len(output_ids) - output_ids[::-1].index(THINK_END_TOKEN_ID)
        except ValueError:
            # No "</think>" in the output: the reasoning never finished within
            # the token budget, so there is no answer to parse. Try again.
            continue
        content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

        for line in content.split("\n"):
            line = line.strip()
            obj = _parse_timer_record(line)
            if obj is None:
                # Not a complete timer record (commentary, code fence, bad JSON, ...).
                continue
            user_text = obj["USER"]
            if user_text in seen:
                continue
            # Only parsing problems are skipped above; a failing write is a real
            # error and must propagate instead of being silently swallowed.
            f.write(line + "\n")
            seen.add(user_text)
            pbar.update()
            if len(seen) >= target_size:
                break
        # Persist each batch so a crash late in the run loses at most one batch.
        f.flush()


In [None]:
# Number of unique utterances collected by the generation loop; this equals
# target_size when the loop ran to completion.
len(seen)


In [None]:
!cat timer_data.jsonl
