In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json, time, re
from pathlib import Path

In [5]:
# Hugging Face checkpoint id shared by the tokenizer and the seq2seq model.
MODEL_NAME = "google/flan-t5-small"

# Both objects are loaded from the same checkpoint, tokenizer first and model
# second (tuple elements are evaluated left to right).
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL_NAME),
    AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME),
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
def generate_answer(input_text, max_new_tokens=64):
    """Generate one answer string for ``input_text`` using beam search.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input_text: Prompt text. It is tokenized with truncation, so anything
            beyond 512 tokens is dropped.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The first returned sequence, decoded with special tokens removed and
        surrounding whitespace stripped.
    """
    inputs = tokenizer(
        input_text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)  # inputs must live on the same device as the model
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # deterministic decoding
        num_beams=3,
        length_penalty=0.6,
        early_stopping=True,
    )
    return tokenizer.decode(out[0], skip_special_tokens=True).strip()

In [7]:
def normalize_num(s: str) -> str:
    """Reduce a single numeric token to a canonical digit string.

    Everything except digits, ``-`` and ``.`` is removed (thousands separators,
    currency symbols, words). Trailing ``.``/``-`` left over from sentence
    punctuation is dropped, redundant leading zeros are removed, and a lone
    zero or a leading-zero decimal is kept intact ("0" -> "0", "0.5" -> "0.5").

    Intended for text holding ONE number: if ``s`` contains several numbers
    their digits are concatenated, so split the text into tokens first.

    Args:
        s: Text containing a number, e.g. "61,237." or "$2.75 billion".

    Returns:
        The normalized number as a string, or "" when ``s`` has no digits.
    """
    # remove commas/spaces and keep digits, minus, dot
    cleaned = re.sub(r"[^\d\-\.]", "", s)
    # "61,237." (end of a sentence) must compare equal to "61,237"
    cleaned = cleaned.rstrip(".-")
    if not re.search(r"\d", cleaned):
        return ""

    sign = "-" if cleaned.startswith("-") else ""
    magnitude = cleaned[len(sign):].lstrip("0")
    if not magnitude:
        # the value was zero ("0", "000", "-0"); keep one digit so it is truthy
        return "0"
    if magnitude.startswith("."):
        # restore the integer zero that lstrip removed from "0.5"
        magnitude = "0" + magnitude
    return sign + magnitude

In [8]:
# Load 10 test QAs (we saved them as JSON, not JSONL)
test_path = Path("../data/qa_datasets/test_qas.json")
# read_text opens and closes the file itself, so no handle is left open
tests = json.loads(test_path.read_text(encoding="utf-8"))


In [10]:
# One numeric token such as "61,237", "2.75" or "-5". A sentence-final period is
# not captured because "." only counts when a digit follows it.
_NUM_TOKEN = re.compile(r"-?\d[\d,]*(?:\.\d+)?")


def _numbers_in(text):
    """Return the set of normalized numeric tokens that occur in ``text``."""
    normalized = (normalize_num(tok) for tok in _NUM_TOKEN.findall(text))
    return {num for num in normalized if num}


def _is_correct(question, gold, pred):
    """Score one prediction: verbatim containment first, then numeric match."""
    # an empty gold string is contained in everything; never count it as a hit
    if gold and gold in pred:
        return True
    gold_nums = _numbers_in(gold)
    # Gold answers restate the question ("... in Q4 FY24 was 61,237."), so the
    # numbers shared with the question are context, not the answer value.
    # Fall back to all gold numbers if nothing is left after removing them.
    answer_nums = (gold_nums - _numbers_in(question)) or gold_nums
    # Numbers are compared token by token; normalizing the whole sentence would
    # fuse every digit in it into one meaningless string.
    return bool(answer_nums & _numbers_in(pred))


correct = 0
timings, preds = [], []

for ex in tests:
    q = ex["question"]
    gt = ex["answer"]

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time
    start = time.perf_counter()
    pred = generate_answer(q)
    timings.append(time.perf_counter() - start)

    ok = _is_correct(q, gt, pred)
    preds.append({"q": q, "pred": pred, "gold": gt, "ok": ok})
    correct += int(ok)

# both averages are guarded so an empty test set reports 0 instead of crashing
acc = correct / len(tests) if tests else 0.0
avg_time = sum(timings) / len(timings) if timings else 0.0
print(f"✅ Baseline accuracy on {len(tests)} test Qs: {acc:.2f}")
print(f"⚡ Avg inference time: {avg_time:.2f}s")
print("\nSample predictions:")
for p in preds[:3]:
    print(p)

✅ Baseline accuracy on 10 test Qs: 0.00
⚡ Avg inference time: 0.15s

Sample predictions:
{'q': 'What was the Revenue from operations in Q4 FY24?', 'pred': '$2.75 billion', 'gold': 'The Revenue from operations in Q4 FY24 was 61,237.', 'ok': False}
{'q': 'What was the Revenue from operations in Q3 FY23?', 'pred': '$2.75 billion', 'gold': 'The Revenue from operations in Q3 FY23 was 60,583.', 'ok': False}
{'q': 'What was the Revenue from operations in Q4 FY23?', 'pred': '$2.75 billion', 'gold': 'The Revenue from operations in Q4 FY23 was 59,162.', 'ok': False}
