In [None]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

ROOT = Path.cwd().parent
sys.path.insert(0, str(ROOT))

load_dotenv()

HF_CACHE = ROOT.parent / "huggingface_cache"
HF_CACHE.mkdir(exist_ok=True)

os.environ["HF_HOME"] = str(HF_CACHE)
os.environ["HF_DATASETS_CACHE"] = str(HF_CACHE)

hf_token = os.getenv("HF_TOKEN")

RESULTS_PATH = ROOT / "data/benchmark_results/qwen3-8B-original"

In [2]:
import torch
import re
from tqdm import tqdm

In [3]:
from unsloth import FastLanguageModel, get_chat_template

MODEL_NAME = "unsloth/Qwen3-8B-unsloth-bnb-4bit"
MAX_SEQ_LENGTH = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-3",
)


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.9: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Quadro RTX 8000. Num GPUs = 1. Max memory: 47.266 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from src.training.trainer import get_model_and_tokenizer
from src.data.qna_loader import QNALoader
from src.data.mcq_loader import MCQLoader

qna_loader = QNALoader(tokenizer=tokenizer)
mcq_loader = MCQLoader(tokenizer=tokenizer)

qna_test = qna_loader.load(splits=("test",), apply_formatting=False)
mcq_test = mcq_loader.load(splits=("test",), apply_formatting=False)

2026-01-27 15:44:37,320 - [INFO] - Loading raw test from local: data/qna/raw/test.json
2026-01-27 15:44:37,335 - [INFO] - test (raw): 400 samples
2026-01-27 15:44:37,338 - [INFO] - Loading raw test from local: data/mcq/raw/test.json
2026-01-27 15:44:37,390 - [INFO] - test (raw): 2000 samples


In [5]:
print(qna_test['test'][0])

{'question': "Thu·∫≠t ng·ªØ 'Cyclic Prefix' trong h·ªá th·ªëng OFDM th·ª±c ch·∫•t ƒë·∫°i di·ªán cho th·ª±c th·ªÉ v·∫≠t l√Ω n√†o?", 'answer': 'Cyclic Prefix l√† m·ªôt ƒëo·∫°n b·∫£n sao c·ªßa ph·∫ßn cu·ªëi k√Ω hi·ªáu OFDM ƒë∆∞·ª£c ch√®n v√†o tr∆∞·ªõc ch√≠nh k√Ω hi·ªáu ƒë√≥ nh·∫±m t·∫°o ra kho·∫£ng b·∫£o v·ªá gi·ªØa c√°c k√Ω hi·ªáu. N√≥ bi·∫øn ph√©p truy·ªÅn tuy·∫øn t√≠nh trong k√™nh truy·ªÅn th√†nh ph√©p cu·ªôn v√≤ng, gi√∫p lo·∫°i b·ªè nhi·ªÖu li√™n k√Ω hi·ªáu v√† ƒë∆°n gi·∫£n h√≥a vi·ªác c√¢n b·∫±ng k√™nh t·∫°i m√°y thu.', 'category': 'Lexicon'}


In [6]:
print(mcq_test['test'][0])

{'question': 'Th√†nh ph·∫ßn n√†o cung c·∫•p ph·∫£n h·ªìi Nudr_DM_Query? [3GPP Release 18]', 'choices': {'1': 'AF', '2': 'NEF', '3': 'UDR', '4': 'SMF', '5': None}, 'answer': 3, 'explanation': 'UDR cung c·∫•p ph·∫£n h·ªìi Nudr_DM_Query (M√£ ƒë·ªãnh danh ·ª©ng d·ª•ng, PFD) cho NEF (PFDF).', 'category': 'Standards specifications'}


In [7]:
MCQ_SYSTEM_PROMPT = (
    "B·∫°n l√† chuy√™n gia Vi·ªÖn th√¥ng. "
    "CH·ªà tr·∫£ l·ªùi b·∫±ng S·ªê TH·ª® T·ª∞ c·ªßa ƒë√°p √°n ƒë√∫ng (1, 2, 3, 4). "
    "KH√îNG gi·∫£i th√≠ch. KH√îNG th√™m ch·ªØ."
)

mcq_results = []
MAX_NEW_TOKENS_MCQ = 5

for sample in tqdm(mcq_test["test"], desc="MCQ inference"):
    question = sample["question"]
    choices = sample["choices"]

    formatted_choices = ""
    for k in sorted(choices.keys(), key=int):
        formatted_choices += f"{k}. {choices[k]}\n"

    user_content = f"{question}\n\nL·ª±a ch·ªçn:\n{formatted_choices}"

    messages = [
        {"role": "system", "content": MCQ_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        enable_thinking=False
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS_MCQ,
            do_sample=False,
            temperature=0.0,
        )

    gen_text = tokenizer.decode(
        outputs[0][input_ids.shape[-1]:],
        skip_special_tokens=True
    ).strip()

    match = re.findall(r"\d+", gen_text)
    model_answer = int(match[0]) if match else None

    mcq_results.append({
        **sample,
        "model_answer": model_answer,
        "raw_model_output": gen_text,
    })


MCQ inference:   0%|                                                                                                                                                | 0/2000 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
MCQ inference:   0%|‚ñç                                                                                                                                       | 6/2000 [00:04<22:27,  1.48it/s]


KeyboardInterrupt: 

In [10]:
QNA_SYSTEM_PROMPT = (
    "B·∫°n l√† chuy√™n gia Vi·ªÖn th√¥ng cao c·∫•p. "
    "H√£y tr·∫£ l·ªùi ƒë√∫ng tr·ªçng t√¢m c√¢u h·ªèi trong kho·∫£ng t·ª´ 2 ƒë·∫øn 3 c√¢u."
)

qna_results = []

MAX_NEW_TOKENS_QNA = 512
qna_dataset = qna_test["test"]

for sample in tqdm(qna_dataset, desc="QNA inference"):
    question = sample["question"]

    messages = [
        {"role": "system", "content": QNA_SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        enable_thinking=False
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS_QNA,
            do_sample=False,
            temperature=0.0,
        )

    gen_text = tokenizer.decode(
        outputs[0][input_ids.shape[-1]:],
        skip_special_tokens=True
    ).strip()

    qna_results.append({
        **sample,
        "model_answer": gen_text
    })


QNA inference:   0%|                                                                                                                                                 | 0/400 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [9]:
import pandas as pd

pd.DataFrame(mcq_results).to_csv(f"{RESULTS_PATH}/mcq_test_predictions.csv", index=False)
pd.DataFrame(qna_results).to_csv(f"{RESULTS_PATH}/qna_test_predictions.csv", index=False)