In [None]:
from datasets import load_dataset, DatasetDict

# Load the "train" split of the MedQuad medical Q&A dataset from the Hub.
ds = load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")

# Hold out 10 % of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
split = ds.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({part: split[part] for part in ("train", "test")})

# Persist both parts so the evaluation cells can reload them from disk.
dataset_dict.save_to_disk("../../medquad_dataset")

print(dataset_dict)


In [None]:
import torch, json, tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer
from litgpt.lora import GPT
from litgpt.prompts import PromptStyle

# ---------- config ------------------------------------------------------
# Weights file that is handed to torch.load() when the model is built below.
CKPT        = "../../checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/lit_model.pth"
# Name used both for the Hugging Face tokenizer and for GPT.from_name().
MODEL_NAME  = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_NEW     = 256                                    # passed to generate() as max_new_tokens

# ---------- tokenizer & prompt ------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Prompt template applied to every question in generate_answer().
prompt_style = PromptStyle.from_name("alpaca")

# ---------- load model weights ------------------------------------------
# (No quantisation happens here: the checkpoint is loaded as stored.)
model = GPT.from_name(name=MODEL_NAME)
# Use .to() rather than .to_empty(): to_empty() re-allocates every parameter
# AND buffer as uninitialised memory, so anything the checkpoint does not
# restore (e.g. buffers that are not part of the state dict) would hold garbage.
model = model.to(DEVICE).eval()

# NOTE(review): torch.load() unpickles the file — only load trusted checkpoints.
state_dict = torch.load(CKPT, map_location="cpu")
# Some checkpoints wrap the weights as {"model": state_dict}; unwrap if so.
state_dict = state_dict.get("model", state_dict)

# strict=False tolerates key mismatches, so surface them instead of silently
# evaluating a partially loaded model.
load_report = model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"WARNING: checkpoint/model key mismatch - "
          f"{len(load_report.missing_keys)} missing (e.g. {load_report.missing_keys[:3]}), "
          f"{len(load_report.unexpected_keys)} unexpected (e.g. {load_report.unexpected_keys[:3]})")


# ---------- fallback generate() if old Lit-GPT --------------------------
try:
    from litgpt.generate import generate           # modern Lit-GPT
except ImportError:
    def generate(model, input_ids, *, max_new_tokens, eos_id):
        """Greedy decoding used when Lit-GPT does not export ``generate``.

        Calls ``model`` on the whole running sequence each step, appends the
        arg-max token of the last position, and stops after
        ``max_new_tokens`` steps or once the appended token equals ``eos_id``
        (the EOS token is kept). ``.item()`` means a single sequence per call.

        Returns the prompt ids followed by the generated ids.
        """
        sequence = input_ids
        steps_left = max_new_tokens
        with torch.no_grad():
            while steps_left > 0:
                steps_left -= 1
                last_logits = model(sequence)[:, -1, :]
                chosen = last_logits.argmax(dim=-1, keepdim=True)
                sequence = torch.cat((sequence, chosen), dim=-1)
                if chosen.item() == eos_id:
                    break
        return sequence

# ---------- helper ------------------------------------------------------
def generate_answer(question: str) -> str:
    """Return the model's answer text for one question.

    The question is wrapped in the Alpaca prompt template, tokenised, and
    passed to ``generate``. Only the newly generated tokens are decoded, so
    the result can be compared directly against a reference answer.
    """
    prompt = prompt_style.apply(f"Symptoms/Question: {question}\nAnswer:")
    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
    out = generate(model, input_ids=ids,
                   max_new_tokens=MAX_NEW, eos_id=tokenizer.eos_token_id)
    # The fallback generate() returns prompt + continuation; drop the prompt
    # tokens so the instruction/question text does not leak into the metrics.
    new_tokens = out[0][ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)



# ---------- load evaluation split ---------------------------------------
full_test_set = load_from_disk("../../medquad_dataset")["test"]

# Only the first 10 held-out rows are scored in this cell.
test_set = full_test_set.select(range(10))

# Map lower-cased column names to their real spelling so the lookup works
# whatever the capitalisation of the question/answer columns.
cols = {c.lower(): c for c in test_set.column_names}
q_key = cols["question"]
a_key = cols["answer"]

# ---------- generate -----------------------------------------------------
preds = []
refs = []
progress = tqdm.tqdm(test_set, desc="Generating")
for ex in progress:
    preds.append(generate_answer(ex[q_key]))
    refs.append(ex[a_key])

# Predictions and references are stored as two parallel lists.
payload = {"predictions": preds, "references": refs}
with open("../../test_predictions.json", "w") as f:
    json.dump(payload, f, indent=2)

print(f"✓ saved {len(preds)} predictions to test_predictions.json")


Eval Metrics

In [None]:
import json, sacrebleu
from rouge_score import rouge_scorer

# Read the predictions/references written by the generation cell; the context
# manager makes sure the file handle is closed.
with open("../../test_predictions.json") as f:
    data = json.load(f)
preds, refs = data["predictions"], data["references"]

# BLEU: sacrebleu takes the hypotheses plus a list of reference streams.
# `.score` is left on sacrebleu's own scale here (the slice cells divide by 100).
bleu = sacrebleu.corpus_bleu(preds, [refs]).score

# ROUGE-L F-measure averaged over (prediction, reference) pairs.
# max(..., 1) avoids a ZeroDivisionError when the prediction list is empty.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
rouge_l = sum(scorer.score(r, p)["rougeL"].fmeasure
              for p, r in zip(preds, refs)) / max(len(preds), 1)

print(f"BLEU {bleu:.4f}")
print(f"ROUGE-L {rouge_l:.4f}")


Human Judgement

In [None]:
import json, random, torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from lime.lime_text import LimeTextExplainer

# ---------- 1.  Load held-out test set & model ----------

# Same directory the split cell passes to save_to_disk(); the bare
# "medquad_dataset" used before does not match that location.
DATA_DIR   = "../../medquad_dataset"
# Placeholder: must be replaced with a local path or a Hub repo id.
MODEL_NAME = "your-trained-model-name"

test_set  = load_from_disk(DATA_DIR)["test"]
tok       = AutoTokenizer.from_pretrained(MODEL_NAME)
# Inference only: eval mode, on the GPU when one is available.
model     = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval().to(
              torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# ---------- 2.  Generate answers on the full test set ----------

def answer(q: str, max_len: int = 256) -> str:
    """Greedily generate an answer for question ``q``.

    ``max_len`` is forwarded as ``max_length``, i.e. it bounds prompt plus
    generated tokens together. Only the generated continuation is decoded;
    the echoed question is dropped so it does not end up in the prediction.
    """
    ids = tok.encode(q, return_tensors="pt").to(model.device)
    out = model.generate(ids, max_length=max_len, do_sample=False)
    # For a causal LM the returned sequence starts with the prompt ids.
    continuation = out[0][ids.shape[1]:]
    return tok.decode(continuation, skip_special_tokens=True)

# Resolve the question/answer columns case-insensitively: other cells of this
# notebook read them as "Question"/"Answer", so a lower-case lookup would fail.
cols = {c.lower(): c for c in test_set.column_names}
q_key, a_key = cols["question"], cols["answer"]

questions, preds, refs = [], [], []
for ex in test_set:
    questions.append(ex[q_key])
    preds.append(answer(ex[q_key]))
    refs.append(ex[a_key])

with open("test_predictions.json", "w") as f:
    json.dump({"predictions": preds, "references": refs}, f, indent=2)

# ---------- 3.  Sample ~50 Q-A pairs for human review ----------

SAMPLE_N = 50
# Private, seeded RNG: the review sample is reproducible and the global
# `random` state is left untouched. min() avoids a ValueError when fewer than
# SAMPLE_N predictions exist.
idxs = random.Random(42).sample(range(len(preds)), min(SAMPLE_N, len(preds)))
# "question" now holds the actual question (it used to hold the reference
# answer); the reference is kept under its own key.
sampled = [{"question": questions[i], "answer": preds[i], "reference": refs[i]}
           for i in idxs]

with open("human_review_samples.json", "w") as f:
    json.dump(sampled, f, indent=2)

# ---------- 4.  Local interpretability with LIME ----------

# LIME pairs column i of the probability matrix with class_names[i], so the
# token strings must be ordered by token id; get_vocab() does not promise that.
vocab = tok.get_vocab()
explainer = LimeTextExplainer(class_names=sorted(vocab, key=vocab.get))

def proba_func(questions: list[str]):
    """Return next-token probabilities for each text.

    The result is a 2-D array of shape (len(questions), vocab_size), the
    layout LIME requires from its classifier function.
    """
    import numpy as np  # local import: this cell does not import numpy

    rows = []
    # no_grad: the outputs would otherwise require grad and .numpy() would raise.
    with torch.no_grad():
        for q in questions:
            enc = tok(q, return_tensors="pt").to(model.device)
            # Probabilities only - no labels, so no loss is computed.
            logits = model(**enc).logits[:, -1, :]
            rows.append(torch.softmax(logits, dim=-1).cpu().numpy().ravel())
    return np.vstack(rows)

sample_q = "What are the early signs of stroke?"
# top_labels=1 explains the most probable next token instead of LIME's default
# label index 1, which is just whichever token happens to have id 1.
lime_exp = explainer.explain_instance(
    sample_q,
    proba_func,
    num_features=10,
    num_samples=100,
    top_labels=1,
)

lime_exp.save_to_file("lime_explanation.html")  # open in browser
print(lime_exp.as_list(label=lime_exp.available_labels()[0]))  # console preview


Template Based Tests

In [None]:
import json, random, torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from lime.lime_text import LimeTextExplainer

# ---------- 1.  Load held-out test set & model ----------
# NOTE(review): this cell repeats the steps of the "Human Judgement" cell and
# writes the same output files; consider keeping only one copy.

# Same directory the split cell passes to save_to_disk().
DATA_DIR   = "../../medquad_dataset"
# Placeholder: must be replaced with a local path or a Hub repo id.
MODEL_NAME = "your-trained-model-name"

test_set  = load_from_disk(DATA_DIR)["test"]
tok       = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval().to(
              torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# ---------- 2.  Generate answers on the full test set ----------

def answer(q: str, max_len: int = 256) -> str:
    """Greedily generate an answer for ``q`` and return only the continuation.

    ``max_len`` is forwarded as ``max_length`` (prompt plus generated tokens).
    """
    ids = tok.encode(q, return_tensors="pt").to(model.device)
    out = model.generate(ids, max_length=max_len, do_sample=False)
    # For a causal LM the returned sequence starts with the prompt ids.
    return tok.decode(out[0][ids.shape[1]:], skip_special_tokens=True)

# Resolve the question/answer columns case-insensitively: other cells of this
# notebook read them as "Question"/"Answer".
cols = {c.lower(): c for c in test_set.column_names}
q_key, a_key = cols["question"], cols["answer"]

questions, preds, refs = [], [], []
for ex in test_set:
    questions.append(ex[q_key])
    preds.append(answer(ex[q_key]))
    refs.append(ex[a_key])

with open("test_predictions.json", "w") as f:
    json.dump({"predictions": preds, "references": refs}, f, indent=2)

# ---------- 3.  Sample ~50 Q-A pairs for human review ----------

SAMPLE_N = 50
# Seeded private RNG for a reproducible sample; min() guards small test sets.
idxs = random.Random(42).sample(range(len(preds)), min(SAMPLE_N, len(preds)))
# "question" holds the real question (it used to hold the reference answer).
sampled = [{"question": questions[i], "answer": preds[i], "reference": refs[i]}
           for i in idxs]

with open("human_review_samples.json", "w") as f:
    json.dump(sampled, f, indent=2)

# ---------- 4.  Local interpretability with LIME ----------

# class_names[i] must describe probability column i, so order tokens by id.
vocab = tok.get_vocab()
explainer = LimeTextExplainer(class_names=sorted(vocab, key=vocab.get))

def proba_func(questions: list[str]):
    """Return a (len(questions), vocab_size) array of next-token probabilities."""
    import numpy as np  # local import: this cell does not import numpy

    rows = []
    # no_grad: the outputs would otherwise require grad and .numpy() would raise.
    with torch.no_grad():
        for q in questions:
            enc = tok(q, return_tensors="pt").to(model.device)
            logits = model(**enc).logits[:, -1, :]
            rows.append(torch.softmax(logits, dim=-1).cpu().numpy().ravel())
    return np.vstack(rows)

sample_q = "What are the early signs of stroke?"
# top_labels=1 explains the most probable next token rather than label index 1.
lime_exp = explainer.explain_instance(
    sample_q,
    proba_func,
    num_features=10,
    num_samples=100,
    top_labels=1,
)

lime_exp.save_to_file("lime_explanation.html")  # open in browser
print(lime_exp.as_list(label=lime_exp.available_labels()[0]))  # console preview


In [None]:

import json, torch, tqdm, re, numpy as np
from pathlib import Path
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer
from litgpt.lora import GPT
from litgpt.prompts import PromptStyle

# ---------- paths & constants ------------------------------------------
# JSON file with the paraphrase sets; this cell reads it and later writes the
# per-entry results back to the same path.
TEMPLATE_PATH = Path("../../offline_eval/templates/paraphrase_sets.json")
# Weights file handed to torch.load() below.
CKPT        = "../../checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/lit_model.pth"
# Name used for both the tokenizer and GPT.from_name().
MODEL_NAME    = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# Use the GPU when CUDA is available, otherwise the CPU.
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_NEW       = 256         # passed to generate() as max_new_tokens
SIM_THRESHOLD = 0.6         # minimum BLEU-1 for a paraphrase answer to count as a pass
PASS_FLOOR    = 0.80        # overall pass-rate required for a PASS result (≥80 %)

# ---------- load model & tokenizer -------------------------------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
tok.pad_token = tok.eos_token          # reuse EOS as the padding token
style = PromptStyle.from_name("alpaca")

model = GPT.from_name(name=MODEL_NAME)
# Use .to() rather than .to_empty(): to_empty() re-allocates every parameter
# AND buffer as uninitialised memory, so anything the checkpoint does not
# restore would hold garbage.
model = model.to(DEVICE).eval()

# NOTE(review): torch.load() unpickles the file — only load trusted checkpoints.
state_dict = torch.load(CKPT, map_location="cpu")
# Some checkpoints wrap the weights as {"model": state_dict}; unwrap if so.
state_dict = state_dict.get("model", state_dict)

# strict=False tolerates key mismatches, so report them instead of silently
# testing a partially loaded model.
load_report = model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"WARNING: checkpoint/model key mismatch - "
          f"{len(load_report.missing_keys)} missing (e.g. {load_report.missing_keys[:3]}), "
          f"{len(load_report.unexpected_keys)} unexpected (e.g. {load_report.unexpected_keys[:3]})")

try:
    from litgpt.generate import generate
except ImportError:
    def generate(model, input_ids, *, max_new_tokens, eos_id):
        """Greedy decoding used when Lit-GPT does not export ``generate``.

        Appends the arg-max token of the last position until
        ``max_new_tokens`` tokens were added or the appended token equals
        ``eos_id`` (which is kept). Handles one sequence per call.

        Returns the prompt ids followed by the generated ids.
        """
        sequence = input_ids
        steps_left = max_new_tokens
        with torch.no_grad():
            while steps_left > 0:
                steps_left -= 1
                last_logits = model(sequence)[:, -1, :]
                chosen = last_logits.argmax(dim=-1, keepdim=True)
                sequence = torch.cat((sequence, chosen), dim=-1)
                if chosen.item() == eos_id:
                    break
        return sequence

def predict(q: str) -> str:
    """Return the model's answer to ``q``, stripped and lower-cased.

    Only the generated continuation is decoded. Keeping the prompt would add
    the same template text to every answer and inflate the BLEU-1 overlap
    between the canonical and the paraphrase answers.
    """
    prompt = style.apply(f"Question: {q}\nAnswer:")
    ids = tok(prompt, return_tensors="pt").input_ids.to(DEVICE)
    out = generate(model, input_ids=ids,
                   max_new_tokens=MAX_NEW, eos_id=tok.eos_token_id)
    # The fallback generate() returns prompt + continuation; drop the prompt.
    new_tokens = out[0][ids.shape[1]:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip().lower()

# ---------- similarity helper (BLEU-1) ----------------------------------
smooth = SmoothingFunction().method4
def sim(a: str, b: str) -> float:
    """Unigram BLEU of ``b`` (hypothesis) against ``a`` (single reference).

    Both strings are split on whitespace; all weight is put on 1-grams and
    NLTK's smoothing method 4 is applied.
    """
    reference_tokens = a.split()
    hypothesis_tokens = b.split()
    unigram_only = (1, 0, 0, 0)
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        weights=unigram_only,
        smoothing_function=smooth,
    )

# ---------- run test ----------------------------------------------------
# Read the paraphrase sets through a context manager so the handle is closed.
with TEMPLATE_PATH.open(encoding="utf-8") as fh:
    tpls = json.load(fh)
total, passed = 0, 0

for entry in tqdm.tqdm(tpls, desc="Template sets"):
    # The answer to the canonical wording is the reference for its paraphrases.
    canon_ans = predict(entry["canonical"])
    # Count paraphrases whose answer is similar enough to the canonical answer.
    good = sum(sim(canon_ans, predict(p)) >= SIM_THRESHOLD
               for p in entry["paraphrases"])
    entry["n"] = len(entry["paraphrases"])
    entry["passed"] = good
    total += entry["n"]
    passed += good
    print(f"{entry['id']}: {good}/{entry['n']} paraphrases ≥ {SIM_THRESHOLD}")

# max(total, 1) keeps the ratio defined when no paraphrases were found.
overall = passed / max(total,1)
print(f"\nOverall pass-rate: {overall:.2%} (threshold {PASS_FLOOR:.0%})")
print("RESULT:", "✅ PASS" if overall >= PASS_FLOOR else "❌ FAIL")

# ---------- save results back to JSON -----------------------------------
# The per-entry "n"/"passed" fields are written into the template file itself.
TEMPLATE_PATH.write_text(json.dumps(tpls, indent=2), encoding="utf-8")
print(f"Updated results saved to {TEMPLATE_PATH}")


Failure Models

In [None]:


import os, json, sys, re, torch
from litgpt.lora import GPT
from litgpt.prompts import PromptStyle
from transformers import AutoTokenizer

# ---------- paths (string) ----------------------------------------------
# Weights file handed to torch.load() below.
CKPT        = "../../checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/lit_model.pth"
# Name used for both the tokenizer and GPT.from_name().
MODEL_NAME  = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# JSON list of failure cases; read below and rewritten with the results.
FM_PATH     = "../../offline_eval/templates/failure_models.json"

# Use the GPU when CUDA is available, otherwise the CPU.
DEVICE   = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Passed to generate() as max_new_tokens.
MAX_NEW  = 256

# ---------- load failure cases ------------------------------------------
# Context manager so the file handle is closed right after parsing.
with open(FM_PATH) as fh:
    f_cases = json.load(fh)

# ---------- model & tokenizer -------------------------------------------
tok   = AutoTokenizer.from_pretrained(MODEL_NAME)
tok.pad_token = tok.eos_token          # reuse EOS as the padding token
style = PromptStyle.from_name("alpaca")

model = GPT.from_name(name=MODEL_NAME)
# Use .to() rather than .to_empty(): to_empty() re-allocates every parameter
# AND buffer as uninitialised memory, so anything the checkpoint does not
# restore would hold garbage.
model = model.to(DEVICE).eval()

# NOTE(review): torch.load() unpickles the file — only load trusted checkpoints.
state_dict = torch.load(CKPT, map_location="cpu")
# Some checkpoints wrap the weights as {"model": state_dict}; unwrap if so.
state_dict = state_dict.get("model", state_dict)

# strict=False tolerates key mismatches, so report them instead of silently
# testing a partially loaded model.
load_report = model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print(f"WARNING: checkpoint/model key mismatch - "
          f"{len(load_report.missing_keys)} missing (e.g. {load_report.missing_keys[:3]}), "
          f"{len(load_report.unexpected_keys)} unexpected (e.g. {load_report.unexpected_keys[:3]})")

try:
    from litgpt.generate import generate
except ImportError:
    def generate(model, input_ids, *, max_new_tokens, eos_id):
        """Greedy decoding used when Lit-GPT does not export ``generate``.

        Appends the arg-max token of the last position until
        ``max_new_tokens`` tokens were added or the appended token equals
        ``eos_id`` (which is kept). Handles one sequence per call.

        Returns the prompt ids followed by the generated ids.
        """
        sequence = input_ids
        steps_left = max_new_tokens
        with torch.no_grad():
            while steps_left > 0:
                steps_left -= 1
                last_logits = model(sequence)[:, -1, :]
                chosen = last_logits.argmax(dim=-1, keepdim=True)
                sequence = torch.cat((sequence, chosen), dim=-1)
                if chosen.item() == eos_id:
                    break
        return sequence

def answer(q: str) -> str:
    """Return the model's lower-cased answer to ``q``.

    Only the generated continuation is decoded. If the prompt were kept,
    words of the question itself could satisfy a ``must_include`` keyword or
    trigger a ``must_not_include`` one.
    """
    prompt = style.apply(f"Question: {q}\nAnswer:")
    ids = tok(prompt, return_tensors="pt").input_ids.to(DEVICE)
    out = generate(model, input_ids=ids,
                   max_new_tokens=MAX_NEW, eos_id=tok.eos_token_id)
    # The fallback generate() returns prompt + continuation; drop the prompt.
    new_tokens = out[0][ids.shape[1]:]
    return tok.decode(new_tokens, skip_special_tokens=True).lower()





# ---------- evaluate -----------------------------------------------------
failed = []
for case in f_cases:
    ans = answer(case["question"]).lower()
    # Every required keyword must appear in the answer ...
    inc_ok = all(k.lower() in ans for k in case["must_include"])
    # ... and none of the forbidden ones (the list is optional).
    not_ok = any(k.lower() in ans for k in case.get("must_not_include", []))
    passed = inc_ok and not not_ok
    print(f"{case['id']:20s} → {'✅ pass' if passed else '❌ fail'}")
    if not passed:
        print("   model answer:", ans[:160], "...")
        failed.append(case["id"])
    case["passed"] = passed   # write back result

# ---------- summary & save ----------------------------------------------
# Write through a context manager so the file is flushed and closed before
# the interpreter is asked to exit below.
with open(FM_PATH, "w") as fh:
    json.dump(f_cases, fh, indent=2)   # update file

# NOTE(review): sys.exit() raises SystemExit in both branches, so when this
# runs as a notebook cell a "Run All" may stop here even on success — confirm
# that this is intended (e.g. when the cell is executed as a CI script).
if failed:
    print("\nFAILED cases:", ", ".join(failed))
    sys.exit(1)
else:
    print("\nAll failure modes passed.")
    sys.exit(0)


Dataset tagging

In [None]:

import json, pathlib, sacrebleu
from datasets import load_from_disk

DATA_DIR   = "../../medquad_dataset"
SLICE_COL  = "qtype"        # <-- change if you slice on something else
# "offline_eval" matches the directory the template/failure-case files live in
# (the previous "offile_eval" spelling would create a stray directory).
OUT_DIR    = "../../offline_eval/results"
PRED_PATH  = "../../test_predictions.json"
# Built with pathlib, which this cell imports; `os` is not imported here and
# would only resolve if an earlier cell happened to have been run.
OUT_PATH   = str(pathlib.Path(OUT_DIR) / "slice_bleu.json")
pathlib.Path(OUT_DIR).mkdir(parents=True, exist_ok=True)


# ---------- load data ---------------------------------------------------
test = load_from_disk(DATA_DIR)["test"]

with open(PRED_PATH, "r") as f:
    pred_ref = json.load(f)

preds = pred_ref["predictions"]
refs = pred_ref["references"]

# ---------- bucket by slice --------------------------------------------
# zip() stops at the shortest input, so only as many test rows as there are
# predictions are bucketed.
# NOTE(review): assumes prediction i belongs to test row i — confirm.
buckets = {}
for ex, pred, ref in zip(test, preds, refs):
    key = ex[SLICE_COL]
    if key not in buckets:
        buckets[key] = {"pred": [], "ref": []}
    bucket = buckets[key]
    bucket["pred"].append(pred)
    bucket["ref"].append(ref)

# ---------- compute BLEU (sacrebleu) -----------------------------------
# One corpus-level BLEU per slice, with sacrebleu's score divided by 100.
slice_bleu = {}
for k, d in buckets.items():
    result = sacrebleu.corpus_bleu(d["pred"], [d["ref"]])
    score = result.score / 100
    slice_bleu[k] = score
    print(f"{k:25s} BLEU={score:.3f}")

with open(OUT_PATH, "w") as f:
    json.dump(slice_bleu, f, indent=2)

print(f"✓ wrote {OUT_PATH}")

In [None]:
from datasets import load_from_disk
import json, sacrebleu, pandas as pd

DATA_DIR   = "../../medquad_dataset"
SLICE_COL  = "qtype"                 # whatever column you discovered
SLICE_SET  = ["inheritance"]   # slices you want to inspect

# Load predictions produced earlier (context manager closes the file).
with open("../../test_predictions.json") as fh:
    preds_refs = json.load(fh)
preds, refs = preds_refs["predictions"], preds_refs["references"]

full_test_set = load_from_disk(DATA_DIR)["test"]

# Take as many leading test rows as there are predictions, so the DataFrame
# columns always have equal length (this was a hard-coded 10).
# NOTE(review): assumes the predictions were generated for the first rows of
# the test split, in order — confirm against the generation cell.
test = full_test_set.select(range(len(preds)))

# Resolve the question column case-insensitively, as the generation cell does.
cols = {c.lower(): c for c in test.column_names}

df = pd.DataFrame({
    "question": test[cols["question"]],
    "slice":    test[SLICE_COL],
    "pred":     preds,
    "ref":      refs
})

import numpy as np
from rouge_score import rouge_scorer
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def _bleu_and_rouge_l(hyps, ref_streams):
    """Score hypotheses against ``ref_streams`` (a list holding one reference list).

    Returns ``(bleu, rouge_l)``: sacrebleu's corpus BLEU divided by 100 and
    the mean ROUGE-L F-measure over the (hypothesis, reference) pairs.
    """
    bleu_value = sacrebleu.corpus_bleu(hyps, ref_streams).score / 100
    f_measures = [
        rouge.score(r, p)["rougeL"].fmeasure
        for p, r in zip(hyps, ref_streams[0])
    ]
    return bleu_value, np.mean(f_measures)


# Per-slice scores, collected as (slice, n_rows, score) tuples.
bleu_rows, rouge_rows = [], []
for sl in SLICE_SET:
    sub = df[df["slice"] == sl]

    # sacrebleu wants plain lists, with the references wrapped in an outer list
    hyp = sub["pred"].tolist()
    ref = [sub["ref"].tolist()]

    # nothing to score for a slice without rows
    if not hyp:
        continue

    bleu, rouge_l = _bleu_and_rouge_l(hyp, ref)

    bleu_rows.append((sl, len(sub), bleu))
    rouge_rows.append((sl, len(sub), rouge_l))

print("BLEU")
for sl, n, b in bleu_rows:
    print(f"{sl:12s} n={n:4d}  BLEU={b:.3f}")

print("\nROUGE-L")
for sl, n, r in rouge_rows:
    print(f"{sl:12s} n={n:4d}  ROUGE-L={r:.3f}")

# Same two metrics over every row of the frame.
hypotheses = df["pred"].tolist()          # list[str]
references = [df["ref"].tolist()]         # list[list[str]]

global_bleu, global_rouge = _bleu_and_rouge_l(hypotheses, references)

print(f"\nGlobal BLEU  = {global_bleu:.3f}")
print(f"Global ROUGE = {global_rouge:.3f}")

