# In [None]:
from pathlib import Path
import json

DATA_DIR = Path("./LLM_NewDataset_phi/Data")


def validate_json_file(path):
    """Try to parse *path* as UTF-8 encoded JSON.

    Args:
        path: Filesystem path (``str`` or ``Path``) of the file to check.

    Returns:
        ``None`` when the file opens and parses cleanly, otherwise the
        exception that was raised while opening or parsing it.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Only validity matters here, so the parsed payload is discarded.
            json.load(f)
    except (OSError, ValueError, RecursionError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return exc
    return None


# Sanity pass: report, file by file, whether each dataset file is valid JSON.
for jf in sorted(DATA_DIR.glob("*.json")):
    print(f"\n[CHECK] {jf}")
    error = validate_json_file(jf)
    if error is None:
        print("  OK – valid JSON")
    else:
        print("  JSON ERROR:", error)


# In [None]:


import os
# These flags are set before `transformers` is imported further down.
os.environ.update(
    {
        "TRANSFORMERS_NO_TF": "1",
        "TRANSFORMERS_NO_FLAX": "1",
        "TRANSFORMERS_NO_JAX": "1",
    }
)

import json
import re
from pathlib import Path
import random
from typing import Dict, Any

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, PeftModel

# ---- Filesystem layout ----
BASE_DIR = Path("./LLM_NewDataset_phi")
DATA_DIR = BASE_DIR / "Data"
RESULT_DIR = BASE_DIR / "result"
MODEL_DIR = BASE_DIR / "models" / "ft_tinyllama_pmp_manual"

# Alternative (disabled) layout for the Qwen2.5-1.5B variant:
# BASE_DIR   = Path("./LLM_Dataset")
# DATA_DIR   = BASE_DIR / "Data"
# RESULT_DIR = BASE_DIR / "result"
# MODEL_DIR  = BASE_DIR / "models/ft_qwen2.5_1.5B_pmp_manual"

# Output folders are created up front so the later save calls cannot fail on a missing directory.
for _out_dir in (RESULT_DIR, MODEL_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# ---- Base model ----
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# MODEL_ID = "Qwen/Qwen2.5-1.5B"

# ---- Generation / tokenisation limits ----
MAX_NEW_TOKENS = 64   # default generation budget for ask_model
N_SHOW = 10           # number of TEST examples echoed to stdout
MAX_LENGTH = 512      # tokenised training sequence length (pad / truncate)

# ---- Split ratios (applied per template_type) ----
TRAIN_RATIO = 0.7
VAL_RATIO = 0.1
TEST_RATIO = 0.2
RANDOM_SEED = 42

# ---- Optimisation hyper-parameters ----
NUM_EPOCHS = 50
BATCH_SIZE = 2
LR = 5e-5

# ---- Global TEST result files ----
_GLOBAL_RESULT_STEM = "pmp_ft_tinyllama_global_TEST_results"
overall_csv_path = RESULT_DIR / f"{_GLOBAL_RESULT_STEM}.csv"
overall_xlsx_path = RESULT_DIR / f"{_GLOBAL_RESULT_STEM}.xlsx"
# overall_csv_path  = RESULT_DIR / "pmp_ft_qwen_global_TEST_results.csv"
# overall_xlsx_path = RESULT_DIR / "pmp_ft_qwen_global_TEST_results.xlsx"


# Collect every question from every *.json file in DATA_DIR into one flat list,
# tagging each question with the name of the file it came from.
json_files = sorted(DATA_DIR.glob("*.json"))
if not json_files:
    raise FileNotFoundError(
        f"No JSON files found in {DATA_DIR}. "
        "Put your question_*.json or Question_*_trick_160.json files there."
    )

all_questions = []
for jf in json_files:
    print(f"[INFO] Loading {jf}")
    with jf.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    # Each file must hold a JSON array of question objects.
    if not isinstance(data, list):
        raise ValueError(f"File {jf} must contain a list of question objects (JSON array).")

    src_name = jf.name
    for q in data:
        q["source_file"] = src_name
    all_questions.extend(data)

print(f"[INFO] Total questions loaded from {len(json_files)} files: {len(all_questions)}")

# Light structural check: only the first (up to) ten questions are inspected,
# and a missing field produces a warning rather than an error.
required_fields = {"question_id", "question", "type", "options", "answer", "template_type"}
missing_fields = {
    field_name
    for sample_q in all_questions[:10]
    for field_name in required_fields
    if field_name not in sample_q
}
if not missing_fields:
    print("[INFO] Sample questions look structurally OK.")
else:
    print(f"[WARN] Some required fields are missing in sample questions: {missing_fields}")

# Matches a stand-alone true/false marker ("T", "TF", "T/F", "TRUE", "TRUE/FALSE",
# "TRUE OR FALSE", "TRUE_FALSE", ...) in an upper-cased type string. The look-arounds
# stop a 'T' that sits inside another word (e.g. "MULTIPLE") from counting.
_TF_TYPE_RE = re.compile(
    r"(?<![A-Z])T(?:RUE)?(?:[\W_]*(?:OR[\W_]*)?F(?:ALSE)?)?(?![A-Z])"
)


def determine_format(q_obj: Dict[str, Any]) -> str:
    """Classify a question as true/false ("TF") or multiple choice ("MCQ").

    Args:
        q_obj: Question dict; its optional "type" entry is inspected.

    Returns:
        "TF" when the type names a true/false question, otherwise "MCQ".
        A missing, empty or ``None`` type yields "MCQ".
    """
    raw_type = q_obj.get("type") or ""
    normalized = str(raw_type).strip().upper()
    # A plain substring test ('"T" in t') would also fire on types such as
    # "MULTIPLE CHOICE"; only a stand-alone marker counts as true/false.
    if _TF_TYPE_RE.search(normalized):
        return "TF"
    return "MCQ"

def build_prompt_tf_for_train(q_obj, answer_letter: str) -> str:
    """Build the true/false training text: instruction, question, fixed options and the answer.

    Args:
        q_obj: Question dict; only ``q_obj["question"]`` is read.
        answer_letter: Letter written after "Answer: ".

    Returns:
        The full training string, terminated by a newline.
    """
    instruction = (
        "You are a historian. Answer the following True/False question using ONLY "
        "the letter 'a' for True or 'b' for False."
    )
    lines = [
        instruction,
        "",
        f"Question: {q_obj['question']}",
        "Options:",
        "A. True",
        "B. False",
        "",
        f"Answer: {answer_letter}",
        "",  # yields the trailing newline after the answer
    ]
    return "\n".join(lines)

def build_prompt_mcq_for_train(q_obj, answer_letter: str) -> str:
    """Build the multiple-choice training text: instruction, question, options and the answer.

    Args:
        q_obj: Question dict; ``q_obj["options"]`` (key -> text mapping) and
            ``q_obj["question"]`` are read.
        answer_letter: Letter written after "Answer: ".

    Returns:
        The full training string, terminated by a newline. Option keys are
        rendered upper-cased, one option per line.
    """
    rendered_options = "\n".join(
        f"{key.upper()}. {value}" for key, value in q_obj["options"].items()
    )
    template = (
        "You are a historian. Choose the correct option and answer using ONLY the "
        "letter (a, b, c, or d).\n\n"
        "Question: {question}\n{options}\n\n"
        "Answer: {answer}\n"
    )
    return template.format(
        question=q_obj["question"],
        options=rendered_options,
        answer=answer_letter,
    )

def build_prompt_tf_for_infer(q_obj) -> str:
    """Build the true/false inference prompt, ending at "Answer:" so the model supplies the letter.

    Args:
        q_obj: Question dict; only ``q_obj["question"]`` is read.

    Returns:
        The prompt string without a trailing newline or answer.
    """
    template = (
        "You are a historian. Answer the following True/False question using ONLY "
        "the letter 'a' for True or 'b' for False.\n\n"
        "Question: {question}\n"
        "Options:\n"
        "A. True\n"
        "B. False\n\n"
        "Answer:"
    )
    return template.format(question=q_obj["question"])

def build_prompt_mcq_for_infer(q_obj) -> str:
    """Build the multiple-choice inference prompt, ending at "Answer:" so the model supplies the letter.

    Args:
        q_obj: Question dict; ``q_obj["options"]`` (key -> text mapping) and
            ``q_obj["question"]`` are read.

    Returns:
        The prompt string without a trailing newline or answer. Option keys
        are rendered upper-cased, one option per line.
    """
    option_lines = [f"{key.upper()}. {value}" for key, value in q_obj["options"].items()]
    instruction = (
        "You are a historian. Choose the correct option and answer using ONLY the "
        "letter (a, b, c, or d)."
    )
    question_line = f"Question: {q_obj['question']}"
    # The options are joined first and inserted as ONE element so that an empty
    # option mapping still contributes its (empty) line to the prompt.
    pieces = [instruction, "", question_line, "\n".join(option_lines), "", "Answer:"]
    return "\n".join(pieces)

def extract_letter_from_output(text: str) -> str:
    """Parse the answer letter (a-d) out of raw model output.

    The search runs in three steps, returning on the first hit:

    1. a stand-alone letter at the very start (optionally preceded by
       whitespace, brackets, quotes or ``*``), e.g. "b", "B. False", "(c)";
    2. a stand-alone letter following the word "answer", e.g.
       "Answer: c", "the answer is d";
    3. any stand-alone letter anywhere in the text.

    Args:
        text: Raw text produced by the model (``None`` is treated as empty).

    Returns:
        The lower-cased letter, or "unknown" when none is found.
    """
    t = (text or "").strip()

    # Step 1: the letter must be a whole token. Looking only at the first
    # character would read "Answer: B" as 'a' and "Constantinople" as 'c'.
    m = re.match(r"[\s(\[\"'*]*([A-D])\b", t, re.IGNORECASE)
    if m:
        return m.group(1).lower()

    # Step 2: an explicit "answer ..." statement beats an arbitrary letter.
    m = re.search(
        r"\banswer\b\s*(?:is)?\s*[:=]?\s*[(\[\"'*]*([A-D])\b", t, re.IGNORECASE
    )
    if m:
        return m.group(1).lower()

    # Step 3: last resort, first stand-alone letter anywhere.
    m = re.search(r"\b([A-D])\b", t, re.IGNORECASE)
    if m:
        return m.group(1).lower()
    return "unknown"

# Seeded train/val/test split, done separately inside every template_type
# group so each group contributes to all three partitions.
random.seed(RANDOM_SEED)

by_template: Dict[str, list] = {}
for q in all_questions:
    by_template.setdefault(q.get("template_type", "unknown"), []).append(q)

train_data, val_data, test_data = [], [], []

for ttype, qlist in by_template.items():
    random.shuffle(qlist)  # in-place shuffle; order depends on RANDOM_SEED
    n = len(qlist)

    # Train and val sizes are truncated; the test slice takes whatever remains.
    train_end = int(TRAIN_RATIO * n)
    val_end = train_end + int(VAL_RATIO * n)

    t_train = qlist[:train_end]
    t_val = qlist[train_end:val_end]
    t_test = qlist[val_end:]

    train_data += t_train
    val_data += t_val
    test_data += t_test

    print(
        f"[SPLIT] template_type={ttype}: "
        f"train={len(t_train)}, val={len(t_val)}, test={len(t_test)} (total={n})"
    )

print(
    f"[SPLIT] TOTAL train={len(train_data)}, "
    f"val={len(val_data)}, test={len(test_data)}, all={len(all_questions)}"
)

print(f"[INFO] Loading base model: {MODEL_ID}")

# Tokenizer: fall back to EOS as the padding token when none is defined, and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Using device: {device}")

# Half precision when CUDA is available, full precision otherwise.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)
base_model = base_model.to(device)

# (Optional, but recommended to avoid warnings during training)
if hasattr(base_model.config, "use_cache"):
    base_model.config.use_cache = False

# Names of the projection modules that receive LoRA adapters.
_LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=_LORA_TARGET_MODULES,
)

# Wrap the base model with the adapters and report the trainable-parameter count.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

class PMPDataset(Dataset):
    """Map-style dataset that turns question dicts into causal-LM training tensors.

    Each item is the full training text (prompt plus gold answer letter),
    tokenised and padded/truncated to ``max_length``.

    Args:
        questions: Sequence of question dicts (each needs "question" and
            "answer"; MCQ items also need "options").
        tokenizer: Tokenizer called on the training text; it must return
            "input_ids" and "attention_mask".
        max_length: Fixed sequence length used for padding and truncation.
    """

    # Label value that the cross-entropy loss skips (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, questions, tokenizer, max_length=512):
        self.questions = questions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    @staticmethod
    def build_labels(input_ids, attention_mask, supervise_first_pad=False):
        """Return training labels with padding positions excluded from the loss.

        Args:
            input_ids: 1-D tensor of token ids.
            attention_mask: 1-D tensor of the same length, 1 for real tokens
                and 0 for padding.
            supervise_first_pad: When True, the first padding position keeps
                its token id as a label. This is meant for tokenizers whose
                padding token is the EOS token: the model is then still taught
                to emit one EOS after the answer. Assumes right padding.

        Returns:
            A new tensor; ``input_ids`` is left untouched.
        """
        labels = input_ids.clone()
        labels[attention_mask == 0] = PMPDataset.IGNORE_INDEX
        if supervise_first_pad:
            n_real = int(attention_mask.sum())
            if n_real < labels.numel():
                labels[n_real] = input_ids[n_real]
        return labels

    def __getitem__(self, idx):
        q_obj = self.questions[idx]
        fmt = determine_format(q_obj)
        gold_letter = q_obj["answer"].lower()

        if fmt == "TF":
            text = build_prompt_tf_for_train(q_obj, gold_letter)
        else:
            text = build_prompt_mcq_for_train(q_obj, gold_letter)

        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = enc["input_ids"].squeeze(0)
        attention_mask = enc["attention_mask"].squeeze(0)

        # Every example is padded to max_length, so cloning input_ids as labels
        # would make the loss count all the padding positions. Mask them out,
        # keeping a single EOS target when padding is done with the EOS token.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        pad_is_eos = pad_id is not None and pad_id == eos_id
        right_padded = getattr(self.tokenizer, "padding_side", "right") == "right"
        labels = self.build_labels(
            input_ids,
            attention_mask,
            supervise_first_pad=pad_is_eos and right_padded,
        )

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Datasets and loaders: only the training loader is shuffled.
train_dataset = PMPDataset(train_data, tokenizer, max_length=MAX_LENGTH)
val_dataset = PMPDataset(val_data, tokenizer, max_length=MAX_LENGTH)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=LR)

# Linear schedule with the first 10% of all optimisation steps used as warm-up.
# An empty training loader is counted as one step per epoch.
steps_per_epoch = max(len(train_loader), 1)
total_steps = NUM_EPOCHS * steps_per_epoch
if total_steps > 0:
    warmup_steps = int(0.1 * total_steps)
    schedule_steps = total_steps
else:
    warmup_steps = 0
    schedule_steps = 1
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=schedule_steps,
)

def _forward_loss(batch):
    """Move one batch to `device`, run the model with labels, and return its loss tensor."""
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    return outputs.loss


print("[INFO] Starting manual LoRA fine-tuning...")

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    total_train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = _forward_loss(batch)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimiser step
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / max(len(train_loader), 1)

    # ---- validation pass (no gradients) ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _forward_loss(batch).item()
    avg_val_loss = total_val_loss / max(len(val_loader), 1)

    print(
        f"[EPOCH {epoch+1}/{NUM_EPOCHS}] "
        f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}"
    )

print("[INFO] Training completed.")

# Persist the trained LoRA adapter together with the tokenizer files.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"[INFO] Saved fine-tuned LoRA model to {MODEL_DIR}")

print("[INFO] Reloading base model + LoRA adapter for TEST evaluation...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
).to(device)

# NOTE: `use_cache` is deliberately NOT disabled here. Turning it off was a
# training-time setting; for generation the key/value cache should stay on.

# Load the saved adapter directly onto the plain base model. Wrapping the base
# model with get_peft_model() first and then calling PeftModel.from_pretrained()
# on that wrapper stacks two PEFT wrappers, in which case the saved adapter
# weights may not line up with the wrapped parameter names.
model = PeftModel.from_pretrained(base_model, MODEL_DIR).to(device)
model.eval()
print("[INFO] Fine-tuned model ready for TEST set.")

def ask_model(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Greedily generate a continuation of *prompt* and return only the new text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding; temperature does not apply
            pad_token_id=tokenizer.pad_token_id,
        )
    # Cut the prompt off by token count. Slicing the decoded string by
    # len(prompt) is only right when decoding reproduces the prompt
    # character-for-character, which tokenizers do not guarantee.
    new_tokens = out[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Accumulators for the TEST evaluation: global counts, three per-key
# breakdowns ({"correct": int, "total": int} per key) and one row per question.
overall_correct = 0
overall_total = 0
per_template_stats: Dict[str, Dict[str, int]] = {}
per_format_stats: Dict[str, Dict[str, int]] = {}
per_file_stats: Dict[str, Dict[str, int]] = {}
results_rows = []
examples_shown = 0

print(f"[INFO] Evaluating on TEST set of size {len(test_data)}")

for q in test_data:
    qid = q.get("question_id")
    q_type = q.get("template_type", "unknown")
    fmt = determine_format(q)
    question_text = q["question"]
    # Strip as well as lower-case so a gold answer such as "A " still matches.
    gold_letter = q["answer"].strip().lower()
    src_file = q.get("source_file", "unknown")
    # True/false prompts do not read the options, so tolerate them being absent.
    options = q.get("options") or {}

    if fmt == "TF":
        prompt = build_prompt_tf_for_infer(q)
    else:
        prompt = build_prompt_mcq_for_infer(q)

    raw_answer = ask_model(prompt)
    pred_letter = extract_letter_from_output(raw_answer)
    is_correct = int(pred_letter == gold_letter)

    overall_correct += is_correct
    overall_total += 1

    # Update the three breakdown tables in the same way.
    for stats_table, stats_key in (
        (per_template_stats, q_type),
        (per_format_stats, fmt),
        (per_file_stats, src_file),
    ):
        bucket = stats_table.setdefault(stats_key, {"correct": 0, "total": 0})
        bucket["correct"] += is_correct
        bucket["total"] += 1

    results_rows.append({
        "question_id": qid,
        "source_file": src_file,
        "template_type": q_type,
        "question_format": fmt,
        "question": question_text,
        "gold_answer_letter": gold_letter,
        # .get(): a gold letter missing from the options must not abort the
        # whole evaluation (and lose every result) with a KeyError.
        "gold_answer_text": options.get(gold_letter, ""),
        "model_raw_answer": raw_answer,
        "model_parsed_letter": pred_letter,
        "model_parsed_text": options.get(pred_letter, ""),
        "is_correct": is_correct,
    })

    # Echo the first N_SHOW examples for a quick eyeball check.
    if examples_shown < N_SHOW:
        print("\n==============================")
        print(f"[TEST EXAMPLE] QID {qid} | File: {src_file} | Type: {q_type} | Format: {fmt}")
        print("Q:", question_text)
        for k, v in options.items():
            print(f"  {k.upper()}. {v}")
        print("MODEL RAW:", raw_answer)
        print("PRED LETTER:", pred_letter, "| GOLD LETTER:", gold_letter, "| CORRECT?", bool(is_correct))
        examples_shown += 1

# ========= 13. Summary =========
def _print_accuracy_table(header, stats_by_key):
    """Print *header*, then one accuracy line per key of a {"correct", "total"} stats mapping."""
    print(header)
    for key, stats in stats_by_key.items():
        total_q = stats["total"]
        acc = stats["correct"] / max(total_q, 1)  # max() guards against a zero total
        print(f"  - {key}: {acc:.3f}  (n={total_q})")


overall_acc = overall_correct / max(overall_total, 1)
print("\n==============================")
print(f"[RESULT] GLOBAL TEST accuracy: {overall_acc:.3f}")
print(f"[RESULT] Total TEST questions evaluated: {overall_total}")

_print_accuracy_table("\n[RESULT] Accuracy per template_type (TEST):", per_template_stats)
_print_accuracy_table("\n[RESULT] Accuracy per question_format (TEST):", per_format_stats)
_print_accuracy_table("\n[RESULT] Accuracy per source_file (TEST):", per_file_stats)

# ========= 14. Save GLOBAL TEST results =========
# One row per evaluated TEST question, written as both CSV and Excel.
results_df = pd.DataFrame(results_rows)
for _write, _destination in (
    (results_df.to_csv, overall_csv_path),
    (results_df.to_excel, overall_xlsx_path),
):
    _write(_destination, index=False)

print(f"\n[INFO] Saved GLOBAL TEST per-question results to CSV:   {overall_csv_path}")
print(f"[INFO] Saved GLOBAL TEST per-question results to Excel: {overall_xlsx_path}")

# ========= 15. Save PER-FILE TEST results =========
# The same rows again, split by originating JSON file.
for fname, group_df in results_df.groupby("source_file"):
    safe_name = fname.replace(".json", "")
    per_file_stem = f"TEST_results_{safe_name}"
    file_csv = RESULT_DIR / (per_file_stem + ".csv")
    file_xlsx = RESULT_DIR / (per_file_stem + ".xlsx")
    group_df.to_csv(file_csv, index=False)
    group_df.to_excel(file_xlsx, index=False)
    print(f"[INFO] Saved TEST results for {fname} -> {file_csv}, {file_xlsx}")
