In [None]:

!pip -q install --upgrade "transformers>=4.38" peft datasets bitsandbytes accelerate pandas

import os, json, re, pandas as pd, torch
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          BitsAndBytesConfig)
from peft import PeftModel

# ───────────────────────── helpers ──────────────────────────────
EOS  = "<|endoftext|>"
PICK = re.compile(r"Pick:\s*([^(\n]+)", re.I)
STRIP_VORP = re.compile(r"\s*\|?\s*NBA_VORP_3yr:.*?(?=$|\n)", re.I)

def _split_text(t):
    t = STRIP_VORP.sub("", t)
    prompt, completion = t.split("### Response:", 1)
    return prompt.strip(), completion.strip()

def _top3(gen):
    names = []
    for m in PICK.finditer(gen):
        n = m.group(1).strip().lower()
        if n not in names:
            names.append(n)
        if len(names) == 3:
            break
    return names + [""] * (3 - len(names))

# ───────────────────────── main evaluator ───────────────────────
def run_eval(jsonl_path: str,
             engine: str = "mistral-base",
             adapter_path: str | None = None,
             first_n: int = 0,
             out_csv: str | Path = "results.csv",
             do_sample: bool = False) -> pd.DataFrame:
    """Benchmark Mistral-7B (base or LoRA) on your cleaned test set.

    Args:
        jsonl_path: JSONL file whose records carry a "text" field holding the
            prompt and the gold completion separated by "### Response:".
        engine: "mistral-lora" loads ``adapter_path`` on top of the base
            model; any other value evaluates the plain base model.
        adapter_path: directory of the PEFT adapter (required for
            "mistral-lora").
        first_n: evaluate only the first N records; 0 means all of them.
        out_csv: destination of the per-example result table.
        do_sample: when False (default) decoding is greedy, which is what the
            earlier version effectively did because ``temperature``/``top_p``
            are ignored unless sampling is switched on.  When True the run
            samples with temperature 0.4 / top_p 0.85.

    Returns:
        DataFrame with one row per example (idx, gold, pred1..3, hit flags).

    Raises:
        ValueError: ``engine`` is "mistral-lora" but no adapter path was
            given, or a record lacks the "### Response:" marker.
    """
    # Validate before the expensive model download / load.
    use_lora = engine == "mistral-lora"
    if use_lora and not adapter_path:
        raise ValueError("Must pass adapter_path for 'mistral-lora'")

    # 1. load dataset ------------------------------------------------------
    records = []
    with open(jsonl_path, encoding="utf-8") as f:
        for idx, line in enumerate(f, 1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            prompt, gold = _split_text(json.loads(line)["text"])
            records.append({"idx": idx, "prompt": prompt, "gold": gold})
            if first_n and len(records) >= first_n:
                break
    print(f"🔍  evaluating {len(records)} examples …")

    # 2. prepare model -----------------------------------------------------
    bnb_cfg = BitsAndBytesConfig(load_in_8bit=True,
                                 llm_int8_enable_fp32_cpu_offload=True)
    base = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-v0.1",
        device_map="auto",
        quantization_config=bnb_cfg,
        trust_remote_code=True)
    tok = AutoTokenizer.from_pretrained(
        "mistralai/Mistral-7B-v0.1", trust_remote_code=True)
    tok.pad_token = tok.eos_token

    if use_lora:
        model = PeftModel.from_pretrained(base, adapter_path, device_map="auto")
    else:
        model = base
    model.eval()

    gen_kwargs = dict(
        max_new_tokens=140,
        do_sample=do_sample,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.eos_token_id,
    )
    if do_sample:
        # Only meaningful (and only accepted without warnings) when sampling.
        gen_kwargs.update(top_p=0.85, temperature=0.4)

    def generate(prompt: str) -> str:
        """Return only the newly generated text for *prompt*."""
        inp = tok(prompt + "\n### Response:", return_tensors="pt").to(model.device)
        out = model.generate(**inp, **gen_kwargs)
        # Drop the echoed prompt tokens; keep the continuation.
        return tok.decode(out[0][inp.input_ids.shape[1]:], skip_special_tokens=True)

    # 3. loop & score ------------------------------------------------------
    rows, hit1, hit3 = [], 0, 0
    for rec in records:
        gold_match = PICK.search(rec["gold"])
        gold = gold_match.group(1).strip().lower() if gold_match else ""
        gen = generate(rec["prompt"])
        p1, p2, p3 = _top3(gen)

        # An empty gold must never match the "" padding produced by _top3.
        top1_ok = int(bool(gold) and p1 == gold)
        top3_ok = int(bool(gold) and gold in (p1, p2, p3))
        hit1 += top1_ok
        hit3 += top3_ok

        rows.append([rec["idx"], gold, p1, p2, p3, top1_ok, top3_ok])

    df = pd.DataFrame(
        rows,
        columns=[
            "idx",
            "gold",
            "pred1",
            "pred2",
            "pred3",
            "top1_correct",
            "top3_correct",
        ],
    )
    df.to_csv(out_csv, index=False)
    print(f"📄  saved → {out_csv}")

    n = len(records)
    if n:
        print(f"Top-1 accuracy : {hit1}/{n} = {hit1 / n * 100:.1f}%")
        print(f"Top-3 accuracy : {hit3}/{n} = {hit3 / n * 100:.1f}%")
    else:
        print("No examples loaded – accuracy not computed.")

    return df


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 1) In the very first cell of your notebook:
# Authenticate this session with the Hugging Face Hub before any later cell
# calls `from_pretrained` (presumably needed for the model download — confirm).
from huggingface_hub import notebook_login
notebook_login()  # This will prompt you to paste your HF token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from google.colab import drive
drive.mount('/content/drive')

TEST = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"

# Baseline run: the untouched Mistral-7B checkpoint on every row (first_n=0).
df_base = run_eval(
    jsonl_path=TEST,
    engine="mistral-base",
    first_n=0,
    out_csv="/content/drive/MyDrive/nbadraft/mistral_base.csv",
)

# Same test set, this time with the fine-tuned LoRA adapter attached.
df_lora = run_eval(
    jsonl_path=TEST,
    engine="mistral-lora",
    adapter_path="/content/drive/MyDrive/nbadraft/mistral-lora-v9",
    first_n=0,
    out_csv="/content/drive/MyDrive/nbadraft/mistral_lora.csv",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔍  evaluating 116 examples …


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



📄  saved → /content/drive/MyDrive/nbadraft/mistral_base.csv
Top-1 accuracy : 0/116 = 0.0%
Top-3 accuracy : 0/116 = 0.0%
🔍  evaluating 116 examples …


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

📄  saved → /content/drive/MyDrive/nbadraft/mistral_lora.csv
Top-1 accuracy : 38/116 = 32.8%
Top-3 accuracy : 38/116 = 32.8%


In [None]:
#!/usr/bin/env python3
# evaluate_mistral_top3.py
#
#   • works with BASE model or a PEFT adapter (set ADAPTER_PATH="")
#   • removes NBA_VORP_3yr from the prompt
#   • asks for exactly three picks
#   • stores per-example results in a CSV

import re, json, csv, torch, os, sys, random
from pathlib import Path
from typing import List

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ──────────── paths ────────────────────────────────────────────────
# Input JSONL: each record is read via its "text" field further down.
TEST_JSONL   = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"
BASE_MODEL   = "mistralai/Mistral-7B-v0.1"
# NOTE(review): this script points at adapter "v2" while the other cells in
# this notebook use "mistral-lora-v9" — confirm this is intentional.
ADAPTER_PATH = "/content/drive/MyDrive/nbadraft/mistral-lora-v2"        # set to LoRA dir or leave "" for vanilla model
# Relative path: the CSV lands in the current working directory.
OUT_CSV      = "results_mistral.csv"
N_SAMPLES    = 10        # how many rows to evaluate

# ──────────── build prompt helpers ─────────────────────────────────
# remove the VORP info (anything like " | NBA_VORP_3yr: ...")
# The "|" separator is mandatory here; the match runs to the end of the line.
STRIP_VORP = re.compile(r"\s*\|\s*NBA_VORP_3yr:[^\n]+")

# One worked example prepended to every prompt to demonstrate the
# numbered three-line answer format.
FEW_SHOT = (
    "### Example\n"
    "Question: Which player should they draft, and why?\n"
    "Answer:\n"
    "1. Victor Wembanyama (C)\n"
    "2. Oscar Tshiebwe (C)\n"
    "3. Liam Robbins (C)\n\n"
    "### Now do the following draft case:\n"
)

# Appended after the draft case.  It ends with "1." so the model's output
# continues item 1 directly — the generated text itself therefore does NOT
# start with a "1." prefix.
INSTRUCTION = (
    "\n### Answer\n"
    "List **exactly three** draft options, best first.\n"
    "Format:\n"
    "1. NAME (pos)\n2. NAME (pos)\n3. NAME (pos)\n"
    "1."
)

# "<digit>. NAME" at the start of a line; the name stops at "(" or at the end
# of the line so an entry without a "(pos)" suffix cannot swallow later lines.
line_rgx = re.compile(r"^\s*\d\.\s*([^\(\n]+)", re.M)

def top3_from(text:str)->List[str]:
    """Return up to three lowercase names, in order, from a numbered list.

    Each name is taken from a line shaped like "1. NAME (pos)"; entries whose
    name part is empty after stripping are skipped.
    """
    names = (m.group(1).strip().lower() for m in line_rgx.finditer(text))
    return [n for n in names if n][:3]

# Name after "Pick:", ending at "(" or at the end of that line (so a gold
# answer without a "(pos)" tag does not absorb the explanation that follows).
pick_rgx = re.compile(r"Pick:\s*([^\(\n]+)", re.I)

def gold_name(completion:str)->str:
    """Return the lowercase gold player name from a completion, or "" if none."""
    m = pick_rgx.search(completion)
    return m.group(1).strip().lower() if m else ""

# ──────────── load test set ───────────────────────────────────────
# Read at most N_SAMPLES records; each becomes a (prompt, gold completion) pair.
samples=[]
with open(TEST_JSONL,encoding="utf-8") as f:
    for i,line in enumerate(f):
        if i>=N_SAMPLES: break
        full=json.loads(line)["text"]
        # Strip the VORP field from the whole record before splitting.
        full=STRIP_VORP.sub("",full)
        # Raises ValueError if a record has no "### Response:" marker.
        prompt,completion=full.split("### Response:",1)
        samples.append((prompt.strip(),completion.strip()))

# ──────────── model & tokenizer ───────────────────────────────────
# 8-bit bitsandbytes quantisation with the fp32 CPU-offload option enabled.
bnb = BitsAndBytesConfig(load_in_8bit=True,
                         llm_int8_enable_fp32_cpu_offload=True)

base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        quantization_config=bnb,
        trust_remote_code=True)

# An empty ADAPTER_PATH evaluates the plain base model.
if ADAPTER_PATH:
    model = PeftModel.from_pretrained(base, ADAPTER_PATH, device_map="auto")
else:
    model = base
model.eval()

tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse EOS as the padding token (presumably the tokenizer defines none — confirm).
tok.pad_token = tok.eos_token

# ──────────── evaluate ────────────────────────────────────────────
# Seeds only matter if sampling is re-enabled below; kept so such a run stays
# repeatable.
random.seed(42)
torch.manual_seed(42)

rows = []
hits1 = hits3 = 0

for idx, (prompt, gold_part) in enumerate(samples, 1):
    gold = gold_name(gold_part)

    full_prompt = FEW_SHOT + prompt + INSTRUCTION
    inp = tok(full_prompt, return_tensors="pt").to(model.device)
    # Greedy decoding, stated explicitly: without do_sample=True the former
    # temperature=0.5 / top_p=0.9 arguments were ignored by generate().
    out = model.generate(**inp, max_new_tokens=80,
                         do_sample=False,
                         eos_token_id=tok.eos_token_id,
                         pad_token_id=tok.eos_token_id)
    # Keep only the continuation, not the echoed prompt tokens.
    gen = tok.decode(out[0][inp.input_ids.shape[1]:], skip_special_tokens=True)

    # INSTRUCTION already ends with "1.", so the continuation begins in the
    # middle of item 1.  Restore that prefix before parsing; otherwise the
    # first pick is dropped and item 2 is scored as the top-1 prediction.
    numbered = gen if re.match(r"\s*1\.", gen) else "1." + gen
    top3 = top3_from(numbered)
    pred = top3[0] if top3 else ""
    # A missing gold name must never count as a hit against an empty prediction.
    hit1 = bool(gold) and pred == gold
    hit3 = bool(gold) and gold in top3
    hits1 += hit1
    hits3 += hit3

    rows.append({
        "idx": idx,
        "gold": gold,
        "pred_top1": pred,
        "in_top3": hit3,
        "top3_str": "; ".join(top3),
        "raw_output": gen.replace("\n", " \\n ")[:200]  # truncate for csv
    })

    print(f"{idx:02d}: {gold:25s} | {pred:25s} "
          f"{'✅' if hit1 else '❌'}  (Top-3 {'✅' if hit3 else '❌'})")

total = len(samples)
if total:
    acc1 = hits1 / total * 100
    acc3 = hits3 / total * 100
    print(f"\nTop-1 accuracy : {hits1}/{total} = {acc1:.1f}%")
    print(f"Top-3 accuracy : {hits3}/{total} = {acc3:.1f}%")
else:
    print("\nNo samples loaded – accuracy not computed.")

# ──────────── save csv ────────────────────────────────────────────
# Fixed header so the file is still written (header only) when rows is empty.
csv_fields = ["idx", "gold", "pred_top1", "in_top3", "top3_str", "raw_output"]
with open(OUT_CSV, "w", newline="", encoding="utf-8") as fout:
    w = csv.DictWriter(fout, fieldnames=csv_fields)
    w.writeheader()
    w.writerows(rows)
print("Saved:", OUT_CSV)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

01: victor wembanyama         | oscar tshiebwe            ❌  (Top-3 ❌)
02: brandon miller            | ausar thompson            ❌  (Top-3 ❌)
03: scoot henderson           | markquis nowell           ❌  (Top-3 ❌)
04: amen thompson             | yuri collins              ❌  (Top-3 ❌)
05: ausar thompson            | keyontae johnson          ❌  (Top-3 ✅)
06: anthony black             | anthony black             ✅  (Top-3 ✅)
07: bilal coulibaly           | oscar tshiebwe            ❌  (Top-3 ❌)
08: jarace walker             | mouhamed gueye            ❌  (Top-3 ❌)
09: taylor hendricks          | gg jackson                ❌  (Top-3 ❌)
10: cason wallace             | cason wallace             ✅  (Top-3 ✅)

Top-1 accuracy : 2/10 = 20.0%
Top-3 accuracy : 3/10 = 30.0%
Saved: results_mistral.csv


In [None]:
# assuming the CSV sits in /content/results_mistral_top3.csv (the OUT_CSV of evaluate_three_pick_format.py)
!cp /content/results_mistral_top3.csv "/content/drive/MyDrive/nbadraft/results_mistral_top3.csv"


In [None]:
#!/usr/bin/env python3
# evaluate_three_pick_format.py
import json, re, csv, torch, os, random
from pathlib import Path
from typing import List
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ──────────────── paths ─────────────────────────────────────────────
# Input JSONL; each record is read via its "text" field by load_examples().
DATASET_PATH = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"
MODEL_NAME   = "mistralai/Mistral-7B-v0.1"
# A falsy value OR a path that does not exist makes the script run the base model.
ADAPTER_PATH = "/content/drive/MyDrive/nbadraft/mistral-lora-v9"  # None = base
OUT_CSV      = "/content/results_mistral_top3.csv"

N_EXAMPLES   = 10        # set -1 to run the whole file
MAX_NEW      = 120       # passed to generate() as max_new_tokens
SEED         = 42
# Seed both RNGs up front so a sampling run would be repeatable.
torch.manual_seed(SEED); random.seed(SEED)

# ──────────────── helpers ──────────────────────────────────────────
def load_examples(path, n=-1):
    """Read prompt/gold pairs from a JSONL file.

    Every line must be a JSON object with a "text" field containing the
    prompt, the marker "### Response:", and the reference completion.
    Returns a list of ``{"prompt": ..., "gold": ...}`` dicts, where "gold" is
    the name ``extract_name`` finds in the completion.  Reading stops after
    *n* records when *n* is positive; otherwise the whole file is read.
    """
    limit = n if n > 0 else None
    records = []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            text = json.loads(raw)["text"]
            head, tail = text.split("### Response:", 1)
            records.append({"prompt": head.strip(), "gold": extract_name(tail)})
            if limit is not None and len(records) == limit:
                break
    return records

def extract_name(blob: str) -> str:
    """Return the normalised name that follows "Pick:" in *blob*, or "".

    The capture stops at the first parenthesis, pipe, comma or line break, so
    "Pick: Victor Wembanyama (C)" yields "victor wembanyama" (the position
    tag is not part of the name).
    """
    m = re.search(r"pick\s*:\s*([^()|,\n\r]+)", blob, re.I)
    return normalise(m.group(1)) if m else ""

def normalise(name: str) -> str:
    """Lower-case *name*, keep only a-z and spaces, collapse repeated spaces."""
    letters_only = re.sub(r"[^a-z ]", "", name.lower())
    return re.sub(r" +", " ", letters_only).strip()

def extract_predicted_names(text: str, k: int = 3) -> List[str]:
    """
    Accepts a chunk like
        Pick1: Victor Wembanyama (C)
        Pick2: Liam Robbins – C
        3) Drew Timme
    and returns up to *k* *normalised* names, in order of appearance.

    Fragments that contain no name at all (e.g. a "---" separator line) are
    skipped instead of occupying one of the *k* slots as an empty string.
    """
    # split on lines / commas / semicolons
    pieces = re.split(r"[,\n;]+", text)
    names  = []
    for p in pieces:
        # remove leading bullets such as '•', '-', '*'
        p = re.sub(r"^\s*[•*\-–—]+\s*", "", p)
        # remove enumeration like '1)', 'Pick2:' etc.
        p = re.sub(r"^\s*(pick)?\s*[\d\-a-z]+\s*[:\)\.]\s*", "", p, flags=re.I)
        # strip position in parentheses
        p = re.sub(r"\(.*?\)", "", p)
        # strip a dash-separated suffix such as "– C"; hyphens inside a name
        # (no surrounding spaces) are left alone
        p = re.split(r"\s+[–—-]\s+", p, maxsplit=1)[0]
        cleaned = normalise(p)
        if not cleaned:
            continue
        names.append(cleaned)
        if len(names) == k:
            break
    return names

# ------------- load model + tokenizer -------------------------------
# 8-bit bitsandbytes quantisation with the fp32 CPU-offload option enabled.
bnb = BitsAndBytesConfig(load_in_8bit=True,
                         llm_int8_enable_fp32_cpu_offload=True)
base = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME, device_map="auto",
            quantization_config=bnb, trust_remote_code=True)

# NOTE(review): if ADAPTER_PATH is set but the directory is missing (e.g. Drive
# not mounted) this silently evaluates the base model; only the printed
# message tells the two cases apart.
if ADAPTER_PATH and Path(ADAPTER_PATH).exists():
    model = PeftModel.from_pretrained(base, ADAPTER_PATH, device_map="auto")
    print("Loaded fine-tuned adapter ✅")
else:
    model = base
    print("Running raw Mistral-7B ✅")

model.eval()
tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token so generate() has a pad id to work with.
tok.pad_token = tok.eos_token     # safety

# ------------- evaluation loop --------------------------------------
examples = load_examples(DATASET_PATH, N_EXAMPLES)
print(f"Evaluating {len(examples)} examples …\n")

rows = []
top1_hits = top3_hits = 0

for idx, ex in enumerate(examples, 1):
    # ---------- build prompt as a plain string ----------------------
    prompt = (
        "System: You are an expert NBA draft assistant.\n\n"
        + ex["prompt"]
        + "\n\n### Task:\n"
          "List **exactly three** draft candidates, ranked 1-3, each on its own line:\n"
          "Pick1: <name>\nPick2: <name>\nPick3: <name>\n"
          "Do NOT add analysis or extra lines.\n\nAssistant:"
    )

    inputs = tok(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding, stated explicitly: without do_sample=True the former
    # temperature=0.2 / top_p=0.9 arguments were ignored by generate().
    gen = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW,
        do_sample=False,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.eos_token_id
    )
    # Decode only the continuation, not the echoed prompt tokens.
    reply = tok.decode(gen[0][inputs.input_ids.shape[1]:],
                       skip_special_tokens=True).strip()

    preds = extract_predicted_names(reply, 3)
    gold  = ex["gold"]
    top1  = preds[0] if preds else ""

    # An empty gold name must not score a hit against an empty prediction.
    top1_ok = bool(gold) and top1 == gold
    top3_ok = bool(gold) and gold in preds

    top1_hits += top1_ok
    top3_hits += top3_ok

    rows.append({
        "idx": idx,
        "gold": gold,
        "pred1": top1,
        "pred2": preds[1] if len(preds) > 1 else "",
        "pred3": preds[2] if len(preds) > 2 else "",
        "top1_hit": int(top1_ok),
        "top3_hit": int(top3_ok),
        "raw_generation": reply
    })

    s1 = "✅" if top1_ok else "❌"
    s3 = "✅" if top3_ok else "❌"
    print(f"{idx:>3}. {gold:22} → {top1:22} {s1} (Top-3 {s3})")

# ----------- summary & CSV ------------------------------------------
tot = len(examples)
if tot:
    print(f"\nTop-1 accuracy : {top1_hits}/{tot} = {100*top1_hits/tot:.1f}%")
    print(f"Top-3 accuracy : {top3_hits}/{tot} = {100*top3_hits/tot:.1f}%")
else:
    print("\nNo examples loaded – accuracy not computed.")

with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    fieldnames = ["idx","gold","pred1","pred2","pred3","top1_hit","top3_hit","raw_generation"]
    # One writer for both header and rows.
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print("CSV saved →", OUT_CSV)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded fine-tuned adapter ✅
Evaluating 10 examples …





  1. victor wembanyama c    → the san antonio spurs should draft liam robbins his versatile scoring ability and strong rim protection ❌ (Top-3 ❌)
  2. brandon miller sf      → the charlotte hornets should draft jalen wilson his versatile frontcourt play and strong rebounding ability ❌ (Top-3 ❌)
  3. scoot henderson pg     → the portland trail blazers should draft amen thompson his excellent transition play and versatile defensive skills make him a perfect fit for a team looking to bolster its perimeter defense and create dynamic ballhandling options with an impressive  assists per game ❌ (Top-3 ❌)
  4. amen thompson pg       → the houston rockets should draft amen thompson his versatile defensive skills and excellent transition play make him an ideal fit for a rebuilding team looking to establish a strong defensive identity with an impressive  assists per game ❌ (Top-3 ❌)
  5. ausar thompson sf      → the detroit pistons should draft hunter tyson his elite rebounding ability ❌ (Top-3 ❌

In [None]:
#!/usr/bin/env python3
# evaluate_single_pick.py
# -----------------------------------------------------------
# Evaluate 1-pick accuracy on the first N test examples
# for a LoRA-tuned Mistral-7B checkpoint.
# -----------------------------------------------------------

import json, re, csv, torch, math
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          BitsAndBytesConfig)
from peft import PeftModel

# ------------- paths ---------------------------------------
# Input JSONL; each record is read via its "text" field by read_examples().
TEST_JSONL = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"
BASE_MODEL = "mistralai/Mistral-7B-v0.1"
# Directory of the PEFT adapter loaded on top of BASE_MODEL.
LORA_DIR   = "/content/drive/MyDrive/nbadraft/mistral-lora-v9"
OUT_CSV    = "/content/nbadraft_eval_single.csv"

N_EXAMPLES = 10          # how many rows to check
MAX_NEW    = 64          # passed to generate() as max_new_tokens
SEED       = 42          # fed to torch.manual_seed before the sampling loop

# ------------- load test prompts ---------------------------
def read_examples(path, n):
    """Load the first *n* prompt/gold pairs from a JSONL test file.

    Each non-blank line must be a JSON object whose "text" field holds the
    prompt, the marker "### Response:", and a completion containing
    "Pick: NAME".  The gold name is lower-cased and ends at the first "(" or
    at the end of its line.

    Returns a list of ``{"prompt": ..., "gold": ...}`` dicts.  If *n* never
    equals the running count (e.g. n <= 0) the whole file is read.

    Raises:
        ValueError: a record has no "### Response:" marker or no "Pick:" in
            its completion (the message names the offending line).
    """
    out = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            rec = json.loads(line)["text"]
            prompt, completion = rec.split("### Response:", 1)
            # "\n" is excluded so a pick without "(pos)" cannot span lines.
            m = re.search(r"Pick:\s*([^\(\n]+)", completion, re.I)
            if m is None:
                raise ValueError(
                    f"{path}: line {lineno} has no 'Pick:' in its completion")
            gold = m.group(1).strip().lower()
            out.append({"prompt": prompt.strip(), "gold": gold})
            if len(out) == n:
                break
    return out

# Parse the evaluation subset before the (slow) model load.
examples = read_examples(TEST_JSONL, N_EXAMPLES)

# ------------- load tokenizer & model ----------------------
# 8-bit bitsandbytes quantisation with the fp32 CPU-offload option enabled.
bnb = BitsAndBytesConfig(load_in_8bit=True,
                         llm_int8_enable_fp32_cpu_offload=True)

base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL, device_map="auto",
            quantization_config=bnb, trust_remote_code=True)

# The adapter is always applied here; this script has no base-model mode.
model = PeftModel.from_pretrained(base, LORA_DIR, device_map="auto")
model.eval()

tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse EOS as the padding token so generate() has a pad id to work with.
tok.pad_token = tok.eos_token

# ------------- regex helper --------------------------------
# Name after "Pick:", ending at the first "(" or line break.
name_pat = re.compile(r"Pick:\s*([^\n\(]+)", re.I)


def extract_pick(text):
    """Return the lower-cased name following "Pick:" in *text*, or "" if absent."""
    found = name_pat.search(text)
    if found is None:
        return ""
    return found.group(1).strip().lower()

# ------------- evaluation ----------------------------------
# Sampling is enabled below, so seed the RNG to make the run repeatable.
torch.manual_seed(SEED)
hits, rows = 0, []

for idx, ex in enumerate(examples, 1):
    full_prompt = ex["prompt"] + "\n\n### Response:"
    inpt = tok(full_prompt, return_tensors="pt").to(model.device)

    gen_ids = model.generate(
        **inpt,
        max_new_tokens=MAX_NEW,
        temperature=0.4,
        top_p=0.85,
        repetition_penalty=1.05,
        do_sample=True,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.eos_token_id
    )
    # Decode only the continuation, not the echoed prompt tokens.
    ans = tok.decode(gen_ids[0][inpt.input_ids.shape[1]:],
                     skip_special_tokens=True).strip()

    pred = extract_pick(ans)
    # Guard against an empty gold matching an empty ("no Pick: found") prediction.
    correct = bool(ex["gold"]) and pred == ex["gold"]
    hits += correct

    rows.append({
        "idx": idx,
        "gold": ex["gold"],
        "pred": pred,
        "hit": int(correct),
        "raw": ans
    })

    mark = "✅" if correct else "❌"
    print(f"{idx:>3}. {ex['gold']:<22} → {pred or '(none)'} {mark}")

# ------------- summary -------------------------------------
if examples:
    acc = 100 * hits / len(examples)
    print(f"\nExact-match accuracy: {hits}/{len(examples)} = {acc:.1f}%")
else:
    print("\nNo examples loaded – accuracy not computed.")

# Fixed header so an empty run still produces a valid (header-only) CSV
# instead of failing on rows[0].
csv_fields = ["idx", "gold", "pred", "hit", "raw"]
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=csv_fields)
    w.writeheader()
    w.writerows(rows)
print("CSV saved to:", OUT_CSV)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  1. victor wembanyama      → victor wembanyama ✅
  2. brandon miller         → jalen wilson ❌
  3. scoot henderson        → markquis nowell ❌
  4. amen thompson          → markquis nowell ❌
  5. ausar thompson         → jalen wilson ❌
  6. anthony black          → jalen hood-schifino ❌
  7. bilal coulibaly        → leonard miller ❌
  8. jarace walker          → jarace walker ✅
  9. taylor hendricks       → kris murray ❌
 10. cason wallace          → cason wallace ✅

Exact-match accuracy: 3/10 = 30.0%
CSV saved to: /content/nbadraft_eval_single.csv


In [None]:
#!/usr/bin/env python3
"""
quick_eval_top3.py – fast sanity-check of a LoRA model on the first N samples.

• shows GOLD name, model’s Top-3 names, ✓/✗ for Top-1 and Top-3
• prints final Top-1 and Top-3 accuracy

Assumes the JSONL test file has only a "text" field (prompt + completion).
"""

import json, re, unicodedata, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ─────────────────────────── config ──────────────────────────────
# Input JSONL; each record is read via its "text" field by load_examples().
DATASET_PATH = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"
MODEL_NAME   = "mistralai/Mistral-7B-v0.1"
# Directory of the PEFT adapter loaded on top of MODEL_NAME.
ADAPTER_PATH = "/content/drive/MyDrive/nbadraft/mistral-lora-v9"

N_EXAMPLES   = 58           # how many test rows to evaluate
MAX_NEW_TOK  = 120           # generation length
SEED         = 42            # fed to torch.manual_seed after the model load

# optional system-prompt (kept short because this is SFT – we just prepend it)
# It is placed before every test prompt, separated by a blank line.
SYS_PROMPT = "You are an NBA draft assistant. Answer with 'Pick: <name>' only."
# ─────────────────────────────────────────────────────────────────

# ========== utilities =======================================================

pick_line_re = re.compile(r"pick\s*:\s*([^\n]+)", re.I)

def _clean(name: str) -> str:
    """
    Strip position tags, dashes, commas; normalise spaces; lowercase.
    """
    name = name.split(",", 1)[0]       # drop “, SG” or similar
    name = name.split("(", 1)[0]       # drop parentheses
    name = name.split("–", 1)[0]       # drop long dash parts
    name = unicodedata.normalize("NFKD", name)
    return re.sub(r"\s+", " ", name).strip().lower()

def extract_topk_picks(answer: str, k: int = 3) -> list[str]:
    """
    Return up to *k* candidate names from the model answer, keeping order.
    Handles 'A or B / C' etc.
    """
    m = pick_line_re.search(answer)
    if not m:
        return []
    segment = m.group(1)
    parts = re.split(r"\s*(?:,|/|\bor\b|&)\s*", segment)
    out, seen = [], set()
    for p in parts:
        n = _clean(p)
        if n and n not in seen:
            out.append(n); seen.add(n)
            if len(out) == k:
                break
    return out

def gold_name(sample_completion: str) -> str:
    m = pick_line_re.search(sample_completion)
    return _clean(m.group(1)) if m else ""

def load_examples(path: str, n: int):
    """Read up to *n* records from a JSONL file as prompt/completion dicts.

    Every line must be a JSON object whose "text" field contains the marker
    "### Response:"; the text before it becomes "prompt", the text after it
    "completion" (both whitespace-stripped).  If the running count never
    equals *n* (e.g. n == 0) the whole file is returned.
    """
    def _parse(raw_line):
        head, tail = json.loads(raw_line)["text"].split("### Response:", 1)
        return {"prompt": head.strip(), "completion": tail.strip()}

    collected = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            collected.append(_parse(raw_line))
            if len(collected) == n:
                break
    return collected

# ========== load model ======================================================
print("⌛  loading model …")
# 8-bit bitsandbytes quantisation with the fp32 CPU-offload option enabled.
bnb_cfg = BitsAndBytesConfig(load_in_8bit=True,
                             llm_int8_enable_fp32_cpu_offload=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_cfg,
    trust_remote_code=True
)
# The adapter is always applied here; this script has no base-model mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH, device_map="auto")
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token so generate() has a pad id to work with.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the generate() call in this script does not pass do_sample,
# so this seed presumably has no effect on decoding — confirm.
torch.manual_seed(SEED)

# ========== run evaluation ==================================================
examples = load_examples(DATASET_PATH, N_EXAMPLES)

hits_top1 = hits_top3 = 0
# Report the number actually loaded; the file may hold fewer than N_EXAMPLES rows.
print(f"\n=== Testing first {len(examples)} examples ===\n")

for idx, ex in enumerate(examples, 1):
    prompt_text = ex["prompt"]
    gold = gold_name(ex["completion"])

    # prepend system prompt
    full_prompt = SYS_PROMPT + "\n\n" + prompt_text + "\n### Response:"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    # Greedy decoding, stated explicitly: without do_sample=True the former
    # top_p=0.85 / temperature=0.4 arguments were ignored by generate().
    out_ids = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOK,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the continuation, not the echoed prompt tokens.
    answer = tokenizer.decode(out_ids[0][inputs.input_ids.shape[1]:],
                              skip_special_tokens=True).strip()

    preds = extract_topk_picks(answer, k=3)
    # Force real booleans: `preds and ...` evaluates to [] when nothing was
    # parsed, and adding a list to the integer counter raises TypeError.
    top1_correct = bool(gold) and bool(preds) and preds[0] == gold
    top3_correct = bool(gold) and gold in preds

    hits_top1 += top1_correct
    hits_top3 += top3_correct

    # ---------- minimal console output ----------------------------
    pdisp = ", ".join(preds) if preds else "(none)"
    print(f"--- Example {idx:>2} ---")
    print(f"GOLD : {gold}")
    print(f"PRED : {pdisp:<40}  "
          f"{'✅' if top1_correct else '❌'}  "
          f"(Top-3 {'✅' if top3_correct else '❌'})\n")

# ========== summary =========================================================
tot = len(examples)
if tot:
    print(f"Top-1 exact-match accuracy : {hits_top1}/{tot}  = {hits_top1/tot*100:.1f}%")
    print(f"Top-3 soft accuracy       : {hits_top3}/{tot}  = {hits_top3/tot*100:.1f}%")
else:
    print("No examples loaded – accuracy not computed.")


⌛  loading model …


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


=== Testing first 58 examples ===

--- Example  1 ---
GOLD : victor wembanyama
PRED : victor wembanyama                         ✅  (Top-3 ✅)

--- Example  2 ---
GOLD : brandon miller
PRED : jalen wilson                              ❌  (Top-3 ❌)

--- Example  3 ---
GOLD : scoot henderson
PRED : scoot henderson                           ✅  (Top-3 ✅)

--- Example  4 ---
GOLD : amen thompson
PRED : amen thompson                             ✅  (Top-3 ✅)

--- Example  5 ---
GOLD : ausar thompson
PRED : keyontae johnson                          ❌  (Top-3 ❌)

--- Example  6 ---
GOLD : anthony black
PRED : anthony black, sg)                        ✅  (Top-3 ✅)

--- Example  7 ---
GOLD : bilal coulibaly
PRED : bilal coulibaly                           ✅  (Top-3 ✅)

--- Example  8 ---
GOLD : jarace walker
PRED : jarace walker                             ✅  (Top-3 ✅)

--- Example  9 ---
GOLD : taylor hendricks
PRED : gg jackson                                ❌  (Top-3 ❌)

--- Example 10 ---
GOLD 

In [None]:
#!/usr/bin/env python3
"""
quick_eval_top3.py – fast sanity-check of a LoRA model on the first N samples.

• shows GOLD name, model’s Top-3 names, ✓/✗ for Top-1 and Top-3
• prints final Top-1 and Top-3 accuracy

Assumes the JSONL test file has only a "text" field (prompt + completion).
"""

import json, re, unicodedata, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ─────────────────────────── config ──────────────────────────────
# Input JSONL; each record is read via its "text" field by load_examples().
DATASET_PATH = "/content/drive/MyDrive/nbadraft/test_samples.jsonl"
MODEL_NAME   = "mistralai/Mistral-7B-v0.1"
# Directory of the PEFT adapter loaded on top of MODEL_NAME.
ADAPTER_PATH = "/content/drive/MyDrive/nbadraft/mistral-lora-v9"

N_EXAMPLES   = 116           # how many test rows to evaluate
MAX_NEW_TOK  = 120           # generation length
SEED         = 42            # fed to torch.manual_seed after the model load

# optional system-prompt (kept short because this is SFT – we just prepend it)
# It is placed before every test prompt, separated by a blank line.
SYS_PROMPT = "You are an NBA draft assistant. Answer with 'Pick: <name>' only."
# ─────────────────────────────────────────────────────────────────

# ========== utilities =======================================================

pick_line_re = re.compile(r"pick\s*:\s*([^\n]+)", re.I)

def _clean(name: str) -> str:
    """
    Strip position tags, dashes, commas; normalise spaces; lowercase.
    """
    name = name.split(",", 1)[0]       # drop “, SG” or similar
    name = name.split("(", 1)[0]       # drop parentheses
    name = name.split("–", 1)[0]       # drop long dash parts
    name = unicodedata.normalize("NFKD", name)
    return re.sub(r"\s+", " ", name).strip().lower()

def extract_topk_picks(answer: str, k: int = 3) -> list[str]:
    """
    Return up to *k* candidate names from the model answer, keeping order.
    Handles 'A or B / C' etc.
    """
    m = pick_line_re.search(answer)
    if not m:
        return []
    segment = m.group(1)
    parts = re.split(r"\s*(?:,|/|\bor\b|&)\s*", segment)
    out, seen = [], set()
    for p in parts:
        n = _clean(p)
        if n and n not in seen:
            out.append(n); seen.add(n)
            if len(out) == k:
                break
    return out

def gold_name(sample_completion: str) -> str:
    m = pick_line_re.search(sample_completion)
    return _clean(m.group(1)) if m else ""

def load_examples(path: str, n: int):
    """Read up to *n* records from a JSONL file as prompt/completion dicts.

    Every line must be a JSON object whose "text" field contains the marker
    "### Response:"; the text before it becomes "prompt", the text after it
    "completion" (both whitespace-stripped).  If the running count never
    equals *n* (e.g. n == 0) the whole file is returned.
    """
    def _parse(raw_line):
        head, tail = json.loads(raw_line)["text"].split("### Response:", 1)
        return {"prompt": head.strip(), "completion": tail.strip()}

    collected = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            collected.append(_parse(raw_line))
            if len(collected) == n:
                break
    return collected

# ========== load model ======================================================
print("⌛  loading model …")
# 8-bit bitsandbytes quantisation with the fp32 CPU-offload option enabled.
bnb_cfg = BitsAndBytesConfig(load_in_8bit=True,
                             llm_int8_enable_fp32_cpu_offload=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_cfg,
    trust_remote_code=True
)
# The adapter is always applied here; this script has no base-model mode.
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH, device_map="auto")
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Reuse EOS as the padding token so generate() has a pad id to work with.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the generate() call in this script does not pass do_sample,
# so this seed presumably has no effect on decoding — confirm.
torch.manual_seed(SEED)

# ========== run evaluation ==================================================
examples = load_examples(DATASET_PATH, N_EXAMPLES)

hits_top1 = hits_top3 = 0
# Report the number actually loaded; the file may hold fewer than N_EXAMPLES rows.
print(f"\n=== Testing first {len(examples)} examples ===\n")

for idx, ex in enumerate(examples, 1):
    prompt_text = ex["prompt"]
    gold = gold_name(ex["completion"])

    # prepend system prompt
    full_prompt = SYS_PROMPT + "\n\n" + prompt_text + "\n### Response:"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    # Greedy decoding, stated explicitly: without do_sample=True the former
    # top_p=0.85 / temperature=0.4 arguments were ignored by generate().
    out_ids = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOK,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the continuation, not the echoed prompt tokens.
    answer = tokenizer.decode(out_ids[0][inputs.input_ids.shape[1]:],
                              skip_special_tokens=True).strip()

    preds = extract_topk_picks(answer, k=3)
    # Force real booleans: `preds and ...` evaluates to [] when nothing was
    # parsed, and adding a list to the integer counter raises TypeError.
    top1_correct = bool(gold) and bool(preds) and preds[0] == gold
    top3_correct = bool(gold) and gold in preds

    hits_top1 += top1_correct
    hits_top3 += top3_correct

    # ---------- minimal console output ----------------------------
    pdisp = ", ".join(preds) if preds else "(none)"
    print(f"--- Example {idx:>2} ---")
    print(f"GOLD : {gold}")
    print(f"PRED : {pdisp:<40}  "
          f"{'✅' if top1_correct else '❌'}  "
          f"(Top-3 {'✅' if top3_correct else '❌'})\n")

# ========== summary =========================================================
tot = len(examples)
if tot:
    print(f"Top-1 exact-match accuracy : {hits_top1}/{tot}  = {hits_top1/tot*100:.1f}%")
    print(f"Top-3 soft accuracy       : {hits_top3}/{tot}  = {hits_top3/tot*100:.1f}%")
else:
    print("No examples loaded – accuracy not computed.")


⌛  loading model …


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]




=== Testing first 116 examples ===

--- Example  1 ---
GOLD : victor wembanyama
PRED : victor wembanyama                         ✅  (Top-3 ✅)

--- Example  2 ---
GOLD : brandon miller
PRED : jalen wilson                              ❌  (Top-3 ❌)

--- Example  3 ---
GOLD : scoot henderson
PRED : scoot henderson                           ✅  (Top-3 ✅)

--- Example  4 ---
GOLD : amen thompson
PRED : markquis nowell                           ❌  (Top-3 ❌)

--- Example  5 ---
GOLD : ausar thompson
PRED : jalen wilson                              ❌  (Top-3 ❌)

--- Example  6 ---
GOLD : anthony black
PRED : anthony black, sg)                        ✅  (Top-3 ✅)

--- Example  7 ---
GOLD : bilal coulibaly
PRED : jalen wilson                              ❌  (Top-3 ❌)

--- Example  8 ---
GOLD : jarace walker
PRED : jarace walker                             ✅  (Top-3 ✅)

--- Example  9 ---
GOLD : taylor hendricks
PRED : kris murray                               ❌  (Top-3 ❌)

--- Example 10 ---
GOLD