In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
import random
import os

languages = ["en", "rus", "de"]
base_dir = "datalines"

train_lines = []
val_lines = []

for lang in languages:
    input_file = os.path.join(base_dir, f"nutrition_data_{lang}.jsonl")

    with open(input_file, "r", encoding="utf-8") as f:
        unique_lines = list(set(line.strip() for line in f if line.strip()))

    print(f"[{lang}] Unique lines: {len(unique_lines)}")

    random.shuffle(unique_lines)
    val_len = int(0.2 * len(unique_lines))
    val_part = unique_lines[:val_len]
    train_part = unique_lines[val_len:]

    print(f"[{lang}] → train: {len(train_part)}, val: {len(val_part)}")

    train_lines.extend(train_part)
    val_lines.extend(val_part)

random.shuffle(train_lines)
random.shuffle(val_lines)

train_path = os.path.join(base_dir, "train_nutrition.jsonl")
val_path = os.path.join(base_dir, "val_nutrition.jsonl")

with open(train_path, "w", encoding="utf-8") as f:
    for line in train_lines:
        f.write(line + "\n")

with open(val_path, "w", encoding="utf-8") as f:
    for line in val_lines:
        f.write(line + "\n")

print(f"\nTotal train: {len(train_lines)} | Total val: {len(val_lines)}")

[en] Unique lines: 1142
[en] → train: 914, val: 228
[rus] Unique lines: 1228
[rus] → train: 983, val: 245
[de] Unique lines: 1237
[de] → train: 990, val: 247

Total train: 2887 | Total val: 720


In [2]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "datalines/train_nutrition.jsonl",
        "validation": "datalines/val_nutrition.jsonl",
    },
)

In [3]:
dataset["train"][0]

{'USER': 'Ich trank 400 ml grünen Smoothie mit Spinat und Apfel.',
 'LIST': [{'FOOD': 'grüner Smoothie', 'UNIT': 'ml', 'QUANTITY': 400.0},
  {'FOOD': 'Spinat', 'UNIT': 'Gramm', 'QUANTITY': 50.0},
  {'FOOD': 'Apfel', 'UNIT': 'Stück', 'QUANTITY': 1.0}]}

In [4]:
from transformers import AutoTokenizer

model_name = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
def format_prompt(sample):
    user = sample["USER"]
    items = sample["LIST"]
    result = f"USER:{user}\nLIST_LENGTH:{len(items)}\n"
    result += "\n".join(
        [
            f"[{i}]FOOD:{item['FOOD']},UNIT:{item['UNIT']},QUANTITY:{int(item['QUANTITY'])}"
            for i, item in enumerate(items, 1)
        ]
    )
    result += tokenizer.eos_token
    return result

In [6]:
dataset["train"] = dataset["train"].map(lambda x: {"text": format_prompt(x)})
dataset["validation"] = dataset["validation"].map(lambda x: {"text": format_prompt(x)})

Map:   0%|          | 0/2887 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

In [7]:
print(dataset["train"][0]["text"])

USER:Ich trank 400 ml grünen Smoothie mit Spinat und Apfel.
LIST_LENGTH:3
[1]FOOD:grüner Smoothie,UNIT:ml,QUANTITY:400
[2]FOOD:Spinat,UNIT:Gramm,QUANTITY:50
[3]FOOD:Apfel,UNIT:Stück,QUANTITY:1<eos>


In [8]:
print(tokenizer.tokenize(dataset["train"][0]["text"]))

['USER', ':', 'Ich', '▁tr', 'ank', '▁', '4', '0', '0', '▁ml', '▁grü', 'nen', '▁Smoothie', '▁mit', '▁Spin', 'at', '▁und', '▁Ap', 'fel', '.', '\n', 'LIST', '_', 'LENGTH', ':', '3', '\n', '[', '1', ']', 'FOOD', ':', 'gr', 'ü', 'ner', '▁Smoothie', ',', 'UNIT', ':', 'ml', ',', 'QUANT', 'ITY', ':', '4', '0', '0', '\n', '[', '2', ']', 'FOOD', ':', 'Spin', 'at', ',', 'UNIT', ':', 'Gram', 'm', ',', 'QUANT', 'ITY', ':', '5', '0', '\n', '[', '3', ']', 'FOOD', ':', 'Ap', 'fel', ',', 'UNIT', ':', 'St', 'ück', ',', 'QUANT', 'ITY', ':', '1', '<eos>']


In [9]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["train"]])

161

In [10]:
max([len(tokenizer(sample["text"])["input_ids"]) for sample in dataset["validation"]])

151

In [11]:
def tokenize(sample):
    prompt = f"USER:{sample["USER"]}\n"
    prompt_len = len(tokenizer(prompt)["input_ids"])
    tokenized = tokenizer(sample["text"], padding="max_length", max_length=161)
    pad_len = tokenized["input_ids"].count(tokenizer.pad_token_id)
    tokenized["labels"] = tokenized["input_ids"].copy()
    tokenized["labels"][: pad_len + prompt_len] = [-100] * (pad_len + prompt_len)
    return tokenized


dataset["train"] = dataset["train"].map(tokenize, batched=False)
dataset["validation"] = dataset["validation"].map(tokenize, batched=False)

Map:   0%|          | 0/2887 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

In [12]:
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    attn_implementation="eager",
)


In [13]:
peft_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [14]:
training_args = TrainingArguments(
    output_dir="./gemma-nutrition-lora",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=[],
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    seed=887,
    fp16=True,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    args=training_args,
)

trainer.train()



[33mWARN[0m  Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.
[33mWARN[0m  Feature `utils/Perplexity` requires python GIL or Python >= 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.
[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Truncating train dataset:   0%|          | 0/2887 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmpd6j3gmzx/test.c -o /tmp/tmpd6j3gmzx/test.o
INFO:root:cc -pthread /tmp/tmpd6j3gmzx/test.o -laio -o /tmp/tmpd6j3gmzx/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmp7gngdix2/test.c -o /tmp/tmp7gngdix2/test.o
INFO:root:cc -pthread /tmp/tmp7gngdix2/test.o -L/usr/local/cuda-12.6 -L/usr/local/cuda-12.6/lib64 -lcufile -o /tmp/tmp7gngdix2/a.out
INFO:root:cc -pthread -fno-strict-overflow -Wsign-compare -Wunreachable-code -DNDEBUG -g -O3 -Wall -fPIC -fPIC -c /tmp/tmp5h6ijz12/test.c -o /tmp/tmp5h6ijz12/test.o
INFO:root:cc -pthread /tmp/tmp5h6ijz12/test.o -laio -o /tmp/tmp5h6ijz12/a.out
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer 

Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
25,0.9996,0.251076,1.176679,128800.0,0.932658
50,0.1255,0.11648,1.63256,257600.0,0.969232
75,0.1079,0.094828,0.317103,386400.0,0.973653
100,0.0763,0.081008,0.267273,511175.0,0.978065
125,0.0613,0.068525,0.274583,639975.0,0.982149
150,0.0584,0.06301,0.294622,768775.0,0.983318
175,0.0548,0.059943,0.683902,897575.0,0.983639
200,0.0337,0.061328,0.253321,1022350.0,0.983886
225,0.0393,0.056355,0.452175,1151150.0,0.984665
250,0.043,0.058077,0.409875,1279950.0,0.984305


KeyboardInterrupt: 

In [15]:
trainer.save_model("./gemma-nutrition-lora")

In [16]:
from transformers import logging

logging.set_verbosity_error()

In [17]:
from transformers import pipeline
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")
model = PeftModel.from_pretrained(base_model, "./gemma-nutrition-lora", device_map="cuda")
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [23]:
from tqdm import tqdm


def evaluate_accuracy(
    dataset,
    log_file,
    batch_size=32,
):
    correct = 0
    total = len(dataset)

    with open(log_file, "w") as file:
        for i in tqdm(range(0, total, batch_size)):
            texts = [
                text.replace(tokenizer.eos_token, "")
                for text in dataset[i : i + batch_size]["text"]
            ]
            prefixes = [text.split("\n")[0] + "\n" for text in texts]
            print(repr(prefixes[0]))
            print(repr(texts[0]))
            gen_outs = text_gen(
                prefixes,
                num_beams=1,
                do_sample=False,
                batch_size=batch_size,
            )
            for text, gen_out in zip(texts, gen_outs):
                gen_text = gen_out[0]["generated_text"]
                if len(gen_text) >= len(text) and text == gen_text[: len(text)]:
                    correct += 1
                else:
                    print(f"Mismatch:\n{text} ->\n{gen_text}\n", file=file)

    return correct / total

In [24]:
acc = evaluate_accuracy(dataset["validation"], log_file="eval.log")
print(f"Validation accuracy: {acc:.4f}")

  0%|                                                                                                                                                      | 0/23 [00:00<?, ?it/s]

'USER:Had 1 serving of Greek salad with feta and olives.\n'
'USER:Had 1 serving of Greek salad with feta and olives.\nLIST_LENGTH:3\n[1]FOOD:Greek salad,UNIT:serving,QUANTITY:1\n[2]FOOD:feta,UNIT:gram,QUANTITY:30\n[3]FOOD:olive,UNIT:piece,QUANTITY:5'


  0%|                                                                                                                                                      | 0/23 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [22]:
print(
    text_gen(
        "USER:Had 1 serving of Greek salad with feta and olives.\n",
        max_new_tokens=100,
        num_beams=1,
        do_sample=False,
    )[0]["generated_text"]
)

USER:Had 1 serving of Greek salad with feta and olives.
LIST_LENGTH:4
[1]FOOD:Greek salad,UNIT:serving,QUANTITY:1
[2]FOOD:feta,UNIT:gram,QUANTITY:50
[3]FOOD:olive,UNIT:gram,QUANTITY:20
[4]FOOD:greek salad,UNIT:portion,QUANTITY:1
[5]FOOD:feta,UNIT:gram,QUANTITY:30
[6]FOOD:olives


In [20]:
%cat eval.log

Mismatch:
USER:Had 1 serving of Greek salad with feta and olives.
LIST[3]:
[1]FOOD:Greek salad,UNIT:serving,QUANTITY:1
[2]FOOD:feta,UNIT:gram,QUANTITY:30
[3]FOOD:olive,UNIT:piece,QUANTITY:5 ->
USER:Had 1 serving of Greek salad with feta and olives.
样的: salad,
LIST[Greek salad with feta, olives,
FOOD: Greek salad,UNIT: serving, UNIT: serving,QUANTITY: 1
FOOD: Greek salad,UNIT: serving, UNIT: salad,QUANTITY: 1: feta,UNIT: olive,QUANTITY: 2: olives,UNIT: 3: tomato,QUANTITY: 4: onion,UNIT: 5: lettuce,QUANTITY: 6: basil,UNIT: 7: tomato,QUANTITY: 8: lettuce,UNIT: 9: basil,QUANTITY: 10: lettuce,UNIT: 1: onion,QUANTITY: 1: basil,UNIT: tomato,QUANTITY: 2: onion, UNIT: 3: lettuce,QUANTITY: basil,UNIT: feta, olives,QUANTITY: oregano,UNIT: 4: tomato, lettuce,QUANTITY: oregano, salad,UNIT: basil, olives,QUANTITY: 5: tomato, UNIT: lettuce, QUANTITY: oregano, salad,UNIT: olive, salad, salad,QUANTITY: 2: basil,UNIT: lettuce, olives, salad, salad, salad, salad, salad, salad, salad, QUANT

Mismatch:
USE