In [6]:
import os, time
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report
from langchain_ollama import OllamaLLM

# ─── CONFIG ───
DATASET_DIRS = {
    "pan": "stratified_splits_80_10_10/pan",
    "maalej": "stratified_splits_80_10_10/maalej",
    "scalabrino": "stratified_splits_80_10_10/scalabrino",
}

OLLAMA_MODEL = "llama3:70b"
# Sanitize model name to make it a valid folder name
MODEL_NAME_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"zero_shot_folds_{MODEL_NAME_CLEAN}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)

llm = OllamaLLM(model=OLLAMA_MODEL)

# ─── PROMPT TEMPLATES ───
LLAMA3_TEMPLATES = {
    "Pan": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Information Giving: factual descriptions, app details, or user opinions.\n"
        "Problem Discovery: issues, bugs, crashes, or errors.\n"
        "Feature Request: suggestions or desired improvements.\n"
        "Information Seeking: questions or asking for help.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Maalej": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Feature Request: suggestions or desires for new features, functionality, or improvements.\n"
        "Rating: numeric scores or general feedback like 'great app' or 'bad service'.\n"
        "User Experience: opinions about usability, UI, speed, or ease of use.\n"
        "Problem Discovery: reports of bugs, crashes, or errors.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Scalab": (
        "You are a strict classification assistant.\n"
        "Classify the following app review into ONE and ONLY ONE of these categories:\n\n"
        "BUG\nFEATURE\nPERFORMANCE\nENERGY\nSECURITY\nUSABILITY\nOTHER\n\n"
        "- If unclear, choose OTHER.\n"
        "- Output the label name exactly as written above.\n\n"
        "Review: {req_text}\n"
        "Label:"
    )
}

# ─── EVALUATION ───
all_results = []

def run_fold_zero_shot(df, labels, template, dataset_name, fold_id):
    y_true = df["class"].tolist()
    req_ids = df["id"].tolist()
    texts = df["review"].tolist() if "review" in df.columns else df["text"].tolist()

    preds = []
    for txt in tqdm(texts, desc=f"{dataset_name} Fold-{fold_id}"):
        prompt = template.format(req_text=txt)
        pred = llm.invoke(prompt).strip()
        pred = next((lab for lab in labels if lab.lower() in pred.lower()), labels[0])
        preds.append(pred)

    pd.DataFrame({
        "id": req_ids,
        "text": texts,
        "gold": y_true,
        "pred": preds
    }).to_csv(os.path.join(PRED_DIR, f"{dataset_name}_fold{fold_id}_preds.csv"), index=False)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_true, preds, average="macro", zero_division=0)
    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(y_true, preds, average="micro", zero_division=0)

    return {
        "dataset": dataset_name, "fold": fold_id,
        "macro_precision": macro_p, "macro_recall": macro_r, "macro_f1": macro_f1,
        "micro_precision": micro_p, "micro_recall": micro_r, "micro_f1": micro_f1
    }

# ─── MAIN FOLD LOOP ───
for dataset_name, folder in DATASET_DIRS.items():
    print(f"\n=== {dataset_name} ===")
    labels = sorted(pd.read_csv(os.path.join(folder, "test_fold_0.csv"))["class"].unique())
    template = LLAMA3_TEMPLATES[dataset_name if dataset_name != "Scalabrino" else "Scalab"]

    for i in range(10):
        path = os.path.join(folder, f"test_fold_{i}.csv")
        df = pd.read_csv(path)
        metrics = run_fold_zero_shot(df, labels, template, dataset_name, i)
        all_results.append(metrics)

summary_df = pd.DataFrame(all_results)
summary_df.to_csv(os.path.join(OUT_DIR, "zero_shot_folded_summary.csv"), index=False)
print("\n📁 Saved per-fold macro/micro results.")
# ─── FINAL FULL SET EVALUATION ───
print("\n📊 Computing final combined weighted metrics...")

def safe_metric(report, key, metric):
    return report.get(key, {}).get(metric, 0.0)

for dataset_name in DATASET_DIRS:
    fold_preds = []
    for i in range(10):
        fold_path = os.path.join(PRED_DIR, f"{dataset_name}_fold{i}_preds.csv")
        fold_df = pd.read_csv(fold_path)
        fold_preds.append(fold_df)

    all_df = pd.concat(fold_preds, ignore_index=True)
    y_true = all_df["gold"].str.lower()
    y_pred = all_df["pred"].str.lower()

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    final_metrics = {
        "dataset": dataset_name,
        "macro_precision":   safe_metric(report, "macro avg", "precision"),
        "macro_recall":      safe_metric(report, "macro avg", "recall"),
        "macro_f1":          safe_metric(report, "macro avg", "f1-score"),
        "micro_precision":   safe_metric(report, "micro avg", "precision"),
        "micro_recall":      safe_metric(report, "micro avg", "recall"),
        "micro_f1":          safe_metric(report, "micro avg", "f1-score"),
        "weighted_precision": safe_metric(report, "weighted avg", "precision"),
        "weighted_recall":    safe_metric(report, "weighted avg", "recall"),
        "weighted_f1":        safe_metric(report, "weighted avg", "f1-score")
    }

    pd.DataFrame([final_metrics]).to_csv(
        os.path.join(OUT_DIR, f"{dataset_name}_final_combined_metrics.csv"),
        index=False
    )

    print(f"✅ {dataset_name}: macro-F1={final_metrics['macro_f1']:.3f}, weighted-F1={final_metrics['weighted_f1']:.3f}")

print("\n🎯 Done!")



=== Pan ===


Pan Fold-0: 100%|██████████| 139/139 [00:44<00:00,  3.14it/s]
Pan Fold-1: 100%|██████████| 139/139 [00:44<00:00,  3.13it/s]
Pan Fold-2: 100%|██████████| 139/139 [00:44<00:00,  3.12it/s]
Pan Fold-3: 100%|██████████| 139/139 [00:44<00:00,  3.10it/s]
Pan Fold-4: 100%|██████████| 139/139 [00:44<00:00,  3.12it/s]
Pan Fold-5: 100%|██████████| 139/139 [00:44<00:00,  3.15it/s]
Pan Fold-6: 100%|██████████| 139/139 [00:44<00:00,  3.15it/s]
Pan Fold-7: 100%|██████████| 139/139 [00:44<00:00,  3.15it/s]
Pan Fold-8: 100%|██████████| 139/139 [00:44<00:00,  3.12it/s]
Pan Fold-9: 100%|██████████| 139/139 [00:44<00:00,  3.13it/s]



=== Maalej ===


Maalej Fold-0: 100%|██████████| 370/370 [01:52<00:00,  3.29it/s]
Maalej Fold-1: 100%|██████████| 370/370 [01:53<00:00,  3.27it/s]
Maalej Fold-2: 100%|██████████| 370/370 [01:53<00:00,  3.27it/s]
Maalej Fold-3: 100%|██████████| 370/370 [01:54<00:00,  3.24it/s]
Maalej Fold-4: 100%|██████████| 370/370 [01:54<00:00,  3.23it/s]
Maalej Fold-5: 100%|██████████| 370/370 [01:53<00:00,  3.25it/s]
Maalej Fold-6: 100%|██████████| 370/370 [01:53<00:00,  3.27it/s]
Maalej Fold-7: 100%|██████████| 370/370 [01:53<00:00,  3.26it/s]
Maalej Fold-8: 100%|██████████| 370/370 [01:52<00:00,  3.29it/s]
Maalej Fold-9: 100%|██████████| 370/370 [01:54<00:00,  3.23it/s]



=== Scalabrino ===


Scalabrino Fold-0: 100%|██████████| 300/300 [01:36<00:00,  3.10it/s]
Scalabrino Fold-1: 100%|██████████| 300/300 [01:40<00:00,  2.98it/s]
Scalabrino Fold-2: 100%|██████████| 300/300 [01:36<00:00,  3.10it/s]
Scalabrino Fold-3: 100%|██████████| 300/300 [01:40<00:00,  2.99it/s]
Scalabrino Fold-4: 100%|██████████| 300/300 [01:38<00:00,  3.04it/s]
Scalabrino Fold-5: 100%|██████████| 300/300 [01:38<00:00,  3.06it/s]
Scalabrino Fold-6: 100%|██████████| 300/300 [01:36<00:00,  3.10it/s]
Scalabrino Fold-7: 100%|██████████| 300/300 [01:40<00:00,  3.00it/s]
Scalabrino Fold-8: 100%|██████████| 300/300 [01:38<00:00,  3.05it/s]
Scalabrino Fold-9: 100%|██████████| 300/300 [01:36<00:00,  3.11it/s]



📁 Saved per-fold macro/micro results.

📊 Computing final combined weighted metrics...
✅ Pan: macro-F1=0.738, weighted-F1=0.796
✅ Maalej: macro-F1=0.466, weighted-F1=0.655
✅ Scalabrino: macro-F1=0.545, weighted-F1=0.433

🎯 Done!


In [7]:
import os, time
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report
from langchain_ollama import OllamaLLM

# ─── CONFIG ───
DATASET_DIRS = {
    "pan": "stratified_splits_80_10_10/pan",
    "maalej": "stratified_splits_80_10_10/maalej",
    "scalabrino": "stratified_splits_80_10_10/scalabrino",
}

OLLAMA_MODEL = "llama3:8b"
# Sanitize model name to make it a valid folder name
MODEL_NAME_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"zero_shot_folds_{MODEL_NAME_CLEAN}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)

llm = OllamaLLM(model=OLLAMA_MODEL)

# ─── PROMPT TEMPLATES ───
LLAMA3_TEMPLATES = {
    "Pan": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Information Giving: factual descriptions, app details, or user opinions.\n"
        "Problem Discovery: issues, bugs, crashes, or errors.\n"
        "Feature Request: suggestions or desired improvements.\n"
        "Information Seeking: questions or asking for help.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Maalej": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Feature Request: suggestions or desires for new features, functionality, or improvements.\n"
        "Rating: numeric scores or general feedback like 'great app' or 'bad service'.\n"
        "User Experience: opinions about usability, UI, speed, or ease of use.\n"
        "Problem Discovery: reports of bugs, crashes, or errors.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Scalab": (
        "You are a strict classification assistant.\n"
        "Classify the following app review into ONE and ONLY ONE of these categories:\n\n"
        "BUG\nFEATURE\nPERFORMANCE\nENERGY\nSECURITY\nUSABILITY\nOTHER\n\n"
        "- If unclear, choose OTHER.\n"
        "- Output the label name exactly as written above.\n\n"
        "Review: {req_text}\n"
        "Label:"
    )
}

# ─── EVALUATION ───
all_results = []

def run_fold_zero_shot(df, labels, template, dataset_name, fold_id):
    y_true = df["class"].tolist()
    req_ids = df["id"].tolist()
    texts = df["review"].tolist() if "review" in df.columns else df["text"].tolist()

    preds = []
    for txt in tqdm(texts, desc=f"{dataset_name} Fold-{fold_id}"):
        prompt = template.format(req_text=txt)
        pred = llm.invoke(prompt).strip()
        pred = next((lab for lab in labels if lab.lower() in pred.lower()), labels[0])
        preds.append(pred)

    pd.DataFrame({
        "id": req_ids,
        "text": texts,
        "gold": y_true,
        "pred": preds
    }).to_csv(os.path.join(PRED_DIR, f"{dataset_name}_fold{fold_id}_preds.csv"), index=False)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_true, preds, average="macro", zero_division=0)
    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(y_true, preds, average="micro", zero_division=0)

    return {
        "dataset": dataset_name, "fold": fold_id,
        "macro_precision": macro_p, "macro_recall": macro_r, "macro_f1": macro_f1,
        "micro_precision": micro_p, "micro_recall": micro_r, "micro_f1": micro_f1
    }

# ─── MAIN FOLD LOOP ───
for dataset_name, folder in DATASET_DIRS.items():
    print(f"\n=== {dataset_name} ===")
    labels = sorted(pd.read_csv(os.path.join(folder, "test_fold_0.csv"))["class"].unique())
    template = LLAMA3_TEMPLATES[dataset_name if dataset_name != "Scalabrino" else "Scalab"]

    for i in range(10):
        path = os.path.join(folder, f"test_fold_{i}.csv")
        df = pd.read_csv(path)
        metrics = run_fold_zero_shot(df, labels, template, dataset_name, i)
        all_results.append(metrics)

summary_df = pd.DataFrame(all_results)
summary_df.to_csv(os.path.join(OUT_DIR, "zero_shot_folded_summary.csv"), index=False)
print("\n📁 Saved per-fold macro/micro results.")
# ─── FINAL FULL SET EVALUATION ───
print("\n📊 Computing final combined weighted metrics...")

def safe_metric(report, key, metric):
    return report.get(key, {}).get(metric, 0.0)

for dataset_name in DATASET_DIRS:
    fold_preds = []
    for i in range(10):
        fold_path = os.path.join(PRED_DIR, f"{dataset_name}_fold{i}_preds.csv")
        fold_df = pd.read_csv(fold_path)
        fold_preds.append(fold_df)

    all_df = pd.concat(fold_preds, ignore_index=True)
    y_true = all_df["gold"].str.lower()
    y_pred = all_df["pred"].str.lower()

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    final_metrics = {
        "dataset": dataset_name,
        "macro_precision":   safe_metric(report, "macro avg", "precision"),
        "macro_recall":      safe_metric(report, "macro avg", "recall"),
        "macro_f1":          safe_metric(report, "macro avg", "f1-score"),
        "micro_precision":   safe_metric(report, "micro avg", "precision"),
        "micro_recall":      safe_metric(report, "micro avg", "recall"),
        "micro_f1":          safe_metric(report, "micro avg", "f1-score"),
        "weighted_precision": safe_metric(report, "weighted avg", "precision"),
        "weighted_recall":    safe_metric(report, "weighted avg", "recall"),
        "weighted_f1":        safe_metric(report, "weighted avg", "f1-score")
    }

    pd.DataFrame([final_metrics]).to_csv(
        os.path.join(OUT_DIR, f"{dataset_name}_final_combined_metrics.csv"),
        index=False
    )

    print(f"✅ {dataset_name}: macro-F1={final_metrics['macro_f1']:.3f}, weighted-F1={final_metrics['weighted_f1']:.3f}")

print("\n🎯 Done!")



=== Pan ===


Pan Fold-0: 100%|██████████| 139/139 [00:28<00:00,  4.83it/s]
Pan Fold-1: 100%|██████████| 139/139 [00:25<00:00,  5.48it/s]
Pan Fold-2: 100%|██████████| 139/139 [00:25<00:00,  5.39it/s]
Pan Fold-3: 100%|██████████| 139/139 [00:25<00:00,  5.35it/s]
Pan Fold-4: 100%|██████████| 139/139 [00:25<00:00,  5.46it/s]
Pan Fold-5: 100%|██████████| 139/139 [00:25<00:00,  5.49it/s]
Pan Fold-6: 100%|██████████| 139/139 [00:25<00:00,  5.48it/s]
Pan Fold-7: 100%|██████████| 139/139 [00:25<00:00,  5.43it/s]
Pan Fold-8: 100%|██████████| 139/139 [00:25<00:00,  5.46it/s]
Pan Fold-9: 100%|██████████| 139/139 [00:25<00:00,  5.48it/s]



=== Maalej ===


Maalej Fold-0: 100%|██████████| 370/370 [01:07<00:00,  5.46it/s]
Maalej Fold-1: 100%|██████████| 370/370 [01:07<00:00,  5.49it/s]
Maalej Fold-2: 100%|██████████| 370/370 [01:07<00:00,  5.48it/s]
Maalej Fold-3: 100%|██████████| 370/370 [01:07<00:00,  5.49it/s]
Maalej Fold-4: 100%|██████████| 370/370 [01:07<00:00,  5.45it/s]
Maalej Fold-5: 100%|██████████| 370/370 [01:07<00:00,  5.44it/s]
Maalej Fold-6: 100%|██████████| 370/370 [01:07<00:00,  5.44it/s]
Maalej Fold-7: 100%|██████████| 370/370 [01:08<00:00,  5.43it/s]
Maalej Fold-8: 100%|██████████| 370/370 [01:06<00:00,  5.53it/s]
Maalej Fold-9: 100%|██████████| 370/370 [01:08<00:00,  5.39it/s]



=== Scalabrino ===


Scalabrino Fold-0: 100%|██████████| 300/300 [00:52<00:00,  5.68it/s]
Scalabrino Fold-1: 100%|██████████| 300/300 [00:53<00:00,  5.62it/s]
Scalabrino Fold-2: 100%|██████████| 300/300 [00:53<00:00,  5.59it/s]
Scalabrino Fold-3: 100%|██████████| 300/300 [00:53<00:00,  5.61it/s]
Scalabrino Fold-4: 100%|██████████| 300/300 [00:53<00:00,  5.65it/s]
Scalabrino Fold-5: 100%|██████████| 300/300 [00:53<00:00,  5.64it/s]
Scalabrino Fold-6: 100%|██████████| 300/300 [00:53<00:00,  5.61it/s]
Scalabrino Fold-7: 100%|██████████| 300/300 [00:53<00:00,  5.60it/s]
Scalabrino Fold-8: 100%|██████████| 300/300 [00:53<00:00,  5.61it/s]
Scalabrino Fold-9: 100%|██████████| 300/300 [00:53<00:00,  5.56it/s]



📁 Saved per-fold macro/micro results.

📊 Computing final combined weighted metrics...
✅ Pan: macro-F1=0.673, weighted-F1=0.762
✅ Maalej: macro-F1=0.340, weighted-F1=0.233
✅ Scalabrino: macro-F1=0.555, weighted-F1=0.539

🎯 Done!


In [8]:
import os, time
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report
from langchain_ollama import OllamaLLM

# ─── CONFIG ───
DATASET_DIRS = {
    "Pan": "stratified_splits_80_10_10/pan",
    "Maalej": "stratified_splits_80_10_10/maalej",
    "Scalabrino": "stratified_splits_80_10_10/scalabrino",
}

OLLAMA_MODEL = "mistral:7b"
# Sanitize model name to make it a valid folder name
MODEL_NAME_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"zero_shot_folds_{MODEL_NAME_CLEAN}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)

llm = OllamaLLM(model=OLLAMA_MODEL)

# ─── PROMPT TEMPLATES ───
LLAMA3_TEMPLATES = {
    "Pan": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Information Giving: factual descriptions, app details, or user opinions.\n"
        "Problem Discovery: issues, bugs, crashes, or errors.\n"
        "Feature Request: suggestions or desired improvements.\n"
        "Information Seeking: questions or asking for help.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Maalej": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Feature Request: suggestions or desires for new features, functionality, or improvements.\n"
        "Rating: numeric scores or general feedback like 'great app' or 'bad service'.\n"
        "User Experience: opinions about usability, UI, speed, or ease of use.\n"
        "Problem Discovery: reports of bugs, crashes, or errors.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Scalab": (
        "You are a strict classification assistant.\n"
        "Classify the following app review into ONE and ONLY ONE of these categories:\n\n"
        "BUG\nFEATURE\nPERFORMANCE\nENERGY\nSECURITY\nUSABILITY\nOTHER\n\n"
        "- If unclear, choose OTHER.\n"
        "- Output the label name exactly as written above.\n\n"
        "Review: {req_text}\n"
        "Label:"
    )
}

# ─── EVALUATION ───
all_results = []

def run_fold_zero_shot(df, labels, template, dataset_name, fold_id):
    y_true = df["class"].tolist()
    req_ids = df["id"].tolist()
    texts = df["review"].tolist() if "review" in df.columns else df["text"].tolist()

    preds = []
    for txt in tqdm(texts, desc=f"{dataset_name} Fold-{fold_id}"):
        prompt = template.format(req_text=txt)
        pred = llm.invoke(prompt).strip()
        pred = next((lab for lab in labels if lab.lower() in pred.lower()), labels[0])
        preds.append(pred)

    pd.DataFrame({
        "id": req_ids,
        "text": texts,
        "gold": y_true,
        "pred": preds
    }).to_csv(os.path.join(PRED_DIR, f"{dataset_name}_fold{fold_id}_preds.csv"), index=False)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_true, preds, average="macro", zero_division=0)
    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(y_true, preds, average="micro", zero_division=0)

    return {
        "dataset": dataset_name, "fold": fold_id,
        "macro_precision": macro_p, "macro_recall": macro_r, "macro_f1": macro_f1,
        "micro_precision": micro_p, "micro_recall": micro_r, "micro_f1": micro_f1
    }

# ─── MAIN FOLD LOOP ───
for dataset_name, folder in DATASET_DIRS.items():
    print(f"\n=== {dataset_name} ===")
    labels = sorted(pd.read_csv(os.path.join(folder, "test_fold_0.csv"))["class"].unique())
    template = LLAMA3_TEMPLATES[dataset_name if dataset_name != "Scalabrino" else "Scalab"]

    for i in range(10):
        path = os.path.join(folder, f"test_fold_{i}.csv")
        df = pd.read_csv(path)
        metrics = run_fold_zero_shot(df, labels, template, dataset_name, i)
        all_results.append(metrics)

summary_df = pd.DataFrame(all_results)
summary_df.to_csv(os.path.join(OUT_DIR, "zero_shot_folded_summary.csv"), index=False)
print("\n📁 Saved per-fold macro/micro results.")
# ─── FINAL FULL SET EVALUATION ───
print("\n📊 Computing final combined weighted metrics...")

def safe_metric(report, key, metric):
    return report.get(key, {}).get(metric, 0.0)

for dataset_name in DATASET_DIRS:
    fold_preds = []
    for i in range(10):
        fold_path = os.path.join(PRED_DIR, f"{dataset_name}_fold{i}_preds.csv")
        fold_df = pd.read_csv(fold_path)
        fold_preds.append(fold_df)

    all_df = pd.concat(fold_preds, ignore_index=True)
    y_true = all_df["gold"].str.lower()
    y_pred = all_df["pred"].str.lower()

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    final_metrics = {
        "dataset": dataset_name,
        "macro_precision":   safe_metric(report, "macro avg", "precision"),
        "macro_recall":      safe_metric(report, "macro avg", "recall"),
        "macro_f1":          safe_metric(report, "macro avg", "f1-score"),
        "micro_precision":   safe_metric(report, "micro avg", "precision"),
        "micro_recall":      safe_metric(report, "micro avg", "recall"),
        "micro_f1":          safe_metric(report, "micro avg", "f1-score"),
        "weighted_precision": safe_metric(report, "weighted avg", "precision"),
        "weighted_recall":    safe_metric(report, "weighted avg", "recall"),
        "weighted_f1":        safe_metric(report, "weighted avg", "f1-score")
    }

    pd.DataFrame([final_metrics]).to_csv(
        os.path.join(OUT_DIR, f"{dataset_name}_final_combined_metrics.csv"),
        index=False
    )

    print(f"✅ {dataset_name}: macro-F1={final_metrics['macro_f1']:.3f}, weighted-F1={final_metrics['weighted_f1']:.3f}")

print("\n🎯 Done!")



=== Pan ===


Pan Fold-0: 100%|██████████| 139/139 [00:14<00:00,  9.81it/s]
Pan Fold-1: 100%|██████████| 139/139 [00:11<00:00, 11.83it/s]
Pan Fold-2: 100%|██████████| 139/139 [00:11<00:00, 12.13it/s]
Pan Fold-3: 100%|██████████| 139/139 [00:11<00:00, 11.87it/s]
Pan Fold-4: 100%|██████████| 139/139 [00:11<00:00, 12.12it/s]
Pan Fold-5: 100%|██████████| 139/139 [00:11<00:00, 12.24it/s]
Pan Fold-6: 100%|██████████| 139/139 [00:11<00:00, 12.02it/s]
Pan Fold-7: 100%|██████████| 139/139 [00:11<00:00, 12.42it/s]
Pan Fold-8: 100%|██████████| 139/139 [00:11<00:00, 12.14it/s]
Pan Fold-9: 100%|██████████| 139/139 [00:11<00:00, 11.97it/s]



=== Maalej ===


Maalej Fold-0: 100%|██████████| 370/370 [00:28<00:00, 13.07it/s]
Maalej Fold-1: 100%|██████████| 370/370 [00:28<00:00, 12.95it/s]
Maalej Fold-2: 100%|██████████| 370/370 [00:28<00:00, 12.87it/s]
Maalej Fold-3: 100%|██████████| 370/370 [00:28<00:00, 13.18it/s]
Maalej Fold-4: 100%|██████████| 370/370 [00:28<00:00, 13.05it/s]
Maalej Fold-5: 100%|██████████| 370/370 [00:28<00:00, 12.81it/s]
Maalej Fold-6: 100%|██████████| 370/370 [00:28<00:00, 13.10it/s]
Maalej Fold-7: 100%|██████████| 370/370 [00:28<00:00, 13.13it/s]
Maalej Fold-8: 100%|██████████| 370/370 [00:28<00:00, 13.04it/s]
Maalej Fold-9: 100%|██████████| 370/370 [00:28<00:00, 13.10it/s]



=== Scalabrino ===


Scalabrino Fold-0: 100%|██████████| 300/300 [00:26<00:00, 11.47it/s]
Scalabrino Fold-1: 100%|██████████| 300/300 [00:26<00:00, 11.18it/s]
Scalabrino Fold-2: 100%|██████████| 300/300 [00:27<00:00, 10.82it/s]
Scalabrino Fold-3: 100%|██████████| 300/300 [00:29<00:00, 10.20it/s]
Scalabrino Fold-4: 100%|██████████| 300/300 [00:27<00:00, 10.83it/s]
Scalabrino Fold-5: 100%|██████████| 300/300 [00:27<00:00, 10.99it/s]
Scalabrino Fold-6: 100%|██████████| 300/300 [00:27<00:00, 10.73it/s]
Scalabrino Fold-7: 100%|██████████| 300/300 [00:28<00:00, 10.56it/s]
Scalabrino Fold-8: 100%|██████████| 300/300 [00:28<00:00, 10.67it/s]
Scalabrino Fold-9: 100%|██████████| 300/300 [00:26<00:00, 11.53it/s]



📁 Saved per-fold macro/micro results.

📊 Computing final combined weighted metrics...
✅ Pan: macro-F1=0.721, weighted-F1=0.792
✅ Maalej: macro-F1=0.450, weighted-F1=0.515
✅ Scalabrino: macro-F1=0.401, weighted-F1=0.275

🎯 Done!


In [9]:
import os, time
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report
from langchain_ollama import OllamaLLM

# ─── CONFIG ───
DATASET_DIRS = {
    "Pan": "stratified_splits_80_10_10/pan",
    "Maalej": "stratified_splits_80_10_10/maalej",
    "Scalabrino": "stratified_splits_80_10_10/scalabrino",
}

OLLAMA_MODEL = "gemma3:4b"
# Sanitize model name to make it a valid folder name
MODEL_NAME_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"zero_shot_folds_{MODEL_NAME_CLEAN}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)

llm = OllamaLLM(model=OLLAMA_MODEL)

# ─── PROMPT TEMPLATES ───
LLAMA3_TEMPLATES = {
    "Pan": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Information Giving: factual descriptions, app details, or user opinions.\n"
        "Problem Discovery: issues, bugs, crashes, or errors.\n"
        "Feature Request: suggestions or desired improvements.\n"
        "Information Seeking: questions or asking for help.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Maalej": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Feature Request: suggestions or desires for new features, functionality, or improvements.\n"
        "Rating: numeric scores or general feedback like 'great app' or 'bad service'.\n"
        "User Experience: opinions about usability, UI, speed, or ease of use.\n"
        "Problem Discovery: reports of bugs, crashes, or errors.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Scalab": (
        "You are a strict classification assistant.\n"
        "Classify the following app review into ONE and ONLY ONE of these categories:\n\n"
        "BUG\nFEATURE\nPERFORMANCE\nENERGY\nSECURITY\nUSABILITY\nOTHER\n\n"
        "- If unclear, choose OTHER.\n"
        "- Output the label name exactly as written above.\n\n"
        "Review: {req_text}\n"
        "Label:"
    )
}

# ─── EVALUATION ───
all_results = []

def run_fold_zero_shot(df, labels, template, dataset_name, fold_id):
    y_true = df["class"].tolist()
    req_ids = df["id"].tolist()
    texts = df["review"].tolist() if "review" in df.columns else df["text"].tolist()

    preds = []
    for txt in tqdm(texts, desc=f"{dataset_name} Fold-{fold_id}"):
        prompt = template.format(req_text=txt)
        pred = llm.invoke(prompt).strip()
        pred = next((lab for lab in labels if lab.lower() in pred.lower()), labels[0])
        preds.append(pred)

    pd.DataFrame({
        "id": req_ids,
        "text": texts,
        "gold": y_true,
        "pred": preds
    }).to_csv(os.path.join(PRED_DIR, f"{dataset_name}_fold{fold_id}_preds.csv"), index=False)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_true, preds, average="macro", zero_division=0)
    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(y_true, preds, average="micro", zero_division=0)

    return {
        "dataset": dataset_name, "fold": fold_id,
        "macro_precision": macro_p, "macro_recall": macro_r, "macro_f1": macro_f1,
        "micro_precision": micro_p, "micro_recall": micro_r, "micro_f1": micro_f1
    }

# ─── MAIN FOLD LOOP ───
for dataset_name, folder in DATASET_DIRS.items():
    print(f"\n=== {dataset_name} ===")
    labels = sorted(pd.read_csv(os.path.join(folder, "test_fold_0.csv"))["class"].unique())
    template = LLAMA3_TEMPLATES[dataset_name if dataset_name != "Scalabrino" else "Scalab"]

    for i in range(10):
        path = os.path.join(folder, f"test_fold_{i}.csv")
        df = pd.read_csv(path)
        metrics = run_fold_zero_shot(df, labels, template, dataset_name, i)
        all_results.append(metrics)

summary_df = pd.DataFrame(all_results)
summary_df.to_csv(os.path.join(OUT_DIR, "zero_shot_folded_summary.csv"), index=False)
print("\n📁 Saved per-fold macro/micro results.")
# ─── FINAL FULL SET EVALUATION ───
print("\n📊 Computing final combined weighted metrics...")

def safe_metric(report, key, metric):
    return report.get(key, {}).get(metric, 0.0)

for dataset_name in DATASET_DIRS:
    fold_preds = []
    for i in range(10):
        fold_path = os.path.join(PRED_DIR, f"{dataset_name}_fold{i}_preds.csv")
        fold_df = pd.read_csv(fold_path)
        fold_preds.append(fold_df)

    all_df = pd.concat(fold_preds, ignore_index=True)
    y_true = all_df["gold"].str.lower()
    y_pred = all_df["pred"].str.lower()

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    final_metrics = {
        "dataset": dataset_name,
        "macro_precision":   safe_metric(report, "macro avg", "precision"),
        "macro_recall":      safe_metric(report, "macro avg", "recall"),
        "macro_f1":          safe_metric(report, "macro avg", "f1-score"),
        "micro_precision":   safe_metric(report, "micro avg", "precision"),
        "micro_recall":      safe_metric(report, "micro avg", "recall"),
        "micro_f1":          safe_metric(report, "micro avg", "f1-score"),
        "weighted_precision": safe_metric(report, "weighted avg", "precision"),
        "weighted_recall":    safe_metric(report, "weighted avg", "recall"),
        "weighted_f1":        safe_metric(report, "weighted avg", "f1-score")
    }

    pd.DataFrame([final_metrics]).to_csv(
        os.path.join(OUT_DIR, f"{dataset_name}_final_combined_metrics.csv"),
        index=False
    )

    print(f"✅ {dataset_name}: macro-F1={final_metrics['macro_f1']:.3f}, weighted-F1={final_metrics['weighted_f1']:.3f}")

print("\n🎯 Done!")



=== Pan ===


Pan Fold-0: 100%|██████████| 139/139 [00:38<00:00,  3.60it/s]
Pan Fold-1: 100%|██████████| 139/139 [00:32<00:00,  4.27it/s]
Pan Fold-2: 100%|██████████| 139/139 [00:32<00:00,  4.24it/s]
Pan Fold-3: 100%|██████████| 139/139 [00:33<00:00,  4.16it/s]
Pan Fold-4: 100%|██████████| 139/139 [00:32<00:00,  4.30it/s]
Pan Fold-5: 100%|██████████| 139/139 [00:33<00:00,  4.20it/s]
Pan Fold-6: 100%|██████████| 139/139 [00:32<00:00,  4.29it/s]
Pan Fold-7: 100%|██████████| 139/139 [00:33<00:00,  4.19it/s]
Pan Fold-8: 100%|██████████| 139/139 [00:29<00:00,  4.67it/s]
Pan Fold-9: 100%|██████████| 139/139 [00:33<00:00,  4.20it/s]



=== Maalej ===


Maalej Fold-0: 100%|██████████| 370/370 [01:25<00:00,  4.34it/s]
Maalej Fold-1: 100%|██████████| 370/370 [01:25<00:00,  4.31it/s]
Maalej Fold-2: 100%|██████████| 370/370 [01:25<00:00,  4.33it/s]
Maalej Fold-3: 100%|██████████| 370/370 [01:26<00:00,  4.27it/s]
Maalej Fold-4: 100%|██████████| 370/370 [01:24<00:00,  4.39it/s]
Maalej Fold-5: 100%|██████████| 370/370 [01:24<00:00,  4.36it/s]
Maalej Fold-6: 100%|██████████| 370/370 [01:25<00:00,  4.34it/s]
Maalej Fold-7: 100%|██████████| 370/370 [01:24<00:00,  4.37it/s]
Maalej Fold-8: 100%|██████████| 370/370 [01:27<00:00,  4.21it/s]
Maalej Fold-9: 100%|██████████| 370/370 [01:25<00:00,  4.31it/s]



=== Scalabrino ===


Scalabrino Fold-0: 100%|██████████| 300/300 [01:10<00:00,  4.24it/s]
Scalabrino Fold-1: 100%|██████████| 300/300 [01:11<00:00,  4.21it/s]
Scalabrino Fold-2: 100%|██████████| 300/300 [01:11<00:00,  4.22it/s]
Scalabrino Fold-3: 100%|██████████| 300/300 [01:10<00:00,  4.27it/s]
Scalabrino Fold-4: 100%|██████████| 300/300 [01:09<00:00,  4.29it/s]
Scalabrino Fold-5: 100%|██████████| 300/300 [01:11<00:00,  4.22it/s]
Scalabrino Fold-6: 100%|██████████| 300/300 [01:09<00:00,  4.29it/s]
Scalabrino Fold-7: 100%|██████████| 300/300 [01:10<00:00,  4.27it/s]
Scalabrino Fold-8: 100%|██████████| 300/300 [01:10<00:00,  4.24it/s]
Scalabrino Fold-9: 100%|██████████| 300/300 [01:09<00:00,  4.35it/s]



📁 Saved per-fold macro/micro results.

📊 Computing final combined weighted metrics...
✅ Pan: macro-F1=0.660, weighted-F1=0.746
✅ Maalej: macro-F1=0.445, weighted-F1=0.638
✅ Scalabrino: macro-F1=0.516, weighted-F1=0.585

🎯 Done!


In [10]:
import os, time
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, classification_report
from langchain_ollama import OllamaLLM

# ─── CONFIG ───
DATASET_DIRS = {
    "Pan": "stratified_splits_80_10_10/pan",
    "Maalej": "stratified_splits_80_10_10/maalej",
    "Scalabrino": "stratified_splits_80_10_10/scalabrino",
}

OLLAMA_MODEL = "wizardlm2:7b"
# Sanitize model name to make it a valid folder name
MODEL_NAME_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"zero_shot_folds_{MODEL_NAME_CLEAN}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)

llm = OllamaLLM(model=OLLAMA_MODEL)

# ─── PROMPT TEMPLATES ───
LLAMA3_TEMPLATES = {
    "Pan": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Information Giving: factual descriptions, app details, or user opinions.\n"
        "Problem Discovery: issues, bugs, crashes, or errors.\n"
        "Feature Request: suggestions or desired improvements.\n"
        "Information Seeking: questions or asking for help.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Maalej": (
        "You are a requirement classification assistant.\n"
        "Your task is to classify a user review into ONE of the following four categories:\n\n"
        "Feature Request: suggestions or desires for new features, functionality, or improvements.\n"
        "Rating: numeric scores or general feedback like 'great app' or 'bad service'.\n"
        "User Experience: opinions about usability, UI, speed, or ease of use.\n"
        "Problem Discovery: reports of bugs, crashes, or errors.\n\n"
        "Return ONLY the label name, no explanation.\n\n"
        "Review: {req_text}\n"
        "Label:"
    ),
    "Scalab": (
        "You are a strict classification assistant.\n"
        "Classify the following app review into ONE and ONLY ONE of these categories:\n\n"
        "BUG\nFEATURE\nPERFORMANCE\nENERGY\nSECURITY\nUSABILITY\nOTHER\n\n"
        "- If unclear, choose OTHER.\n"
        "- Output the label name exactly as written above.\n\n"
        "Review: {req_text}\n"
        "Label:"
    )
}

# ─── EVALUATION ───
all_results = []

def run_fold_zero_shot(df, labels, template, dataset_name, fold_id):
    y_true = df["class"].tolist()
    req_ids = df["id"].tolist()
    texts = df["review"].tolist() if "review" in df.columns else df["text"].tolist()

    preds = []
    for txt in tqdm(texts, desc=f"{dataset_name} Fold-{fold_id}"):
        prompt = template.format(req_text=txt)
        pred = llm.invoke(prompt).strip()
        pred = next((lab for lab in labels if lab.lower() in pred.lower()), labels[0])
        preds.append(pred)

    pd.DataFrame({
        "id": req_ids,
        "text": texts,
        "gold": y_true,
        "pred": preds
    }).to_csv(os.path.join(PRED_DIR, f"{dataset_name}_fold{fold_id}_preds.csv"), index=False)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_true, preds, average="macro", zero_division=0)
    micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(y_true, preds, average="micro", zero_division=0)

    return {
        "dataset": dataset_name, "fold": fold_id,
        "macro_precision": macro_p, "macro_recall": macro_r, "macro_f1": macro_f1,
        "micro_precision": micro_p, "micro_recall": micro_r, "micro_f1": micro_f1
    }

# ─── MAIN FOLD LOOP ───
for dataset_name, folder in DATASET_DIRS.items():
    print(f"\n=== {dataset_name} ===")
    labels = sorted(pd.read_csv(os.path.join(folder, "test_fold_0.csv"))["class"].unique())
    template = LLAMA3_TEMPLATES[dataset_name if dataset_name != "Scalabrino" else "Scalab"]

    for i in range(10):
        path = os.path.join(folder, f"test_fold_{i}.csv")
        df = pd.read_csv(path)
        metrics = run_fold_zero_shot(df, labels, template, dataset_name, i)
        all_results.append(metrics)

summary_df = pd.DataFrame(all_results)
summary_df.to_csv(os.path.join(OUT_DIR, "zero_shot_folded_summary.csv"), index=False)
print("\n📁 Saved per-fold macro/micro results.")
# ─── FINAL FULL SET EVALUATION ───
print("\n📊 Computing final combined weighted metrics...")

def safe_metric(report, key, metric):
    return report.get(key, {}).get(metric, 0.0)

for dataset_name in DATASET_DIRS:
    fold_preds = []
    for i in range(10):
        fold_path = os.path.join(PRED_DIR, f"{dataset_name}_fold{i}_preds.csv")
        fold_df = pd.read_csv(fold_path)
        fold_preds.append(fold_df)

    all_df = pd.concat(fold_preds, ignore_index=True)
    y_true = all_df["gold"].str.lower()
    y_pred = all_df["pred"].str.lower()

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    final_metrics = {
        "dataset": dataset_name,
        "macro_precision":   safe_metric(report, "macro avg", "precision"),
        "macro_recall":      safe_metric(report, "macro avg", "recall"),
        "macro_f1":          safe_metric(report, "macro avg", "f1-score"),
        "micro_precision":   safe_metric(report, "micro avg", "precision"),
        "micro_recall":      safe_metric(report, "micro avg", "recall"),
        "micro_f1":          safe_metric(report, "micro avg", "f1-score"),
        "weighted_precision": safe_metric(report, "weighted avg", "precision"),
        "weighted_recall":    safe_metric(report, "weighted avg", "recall"),
        "weighted_f1":        safe_metric(report, "weighted avg", "f1-score")
    }

    pd.DataFrame([final_metrics]).to_csv(
        os.path.join(OUT_DIR, f"{dataset_name}_final_combined_metrics.csv"),
        index=False
    )

    print(f"✅ {dataset_name}: macro-F1={final_metrics['macro_f1']:.3f}, weighted-F1={final_metrics['weighted_f1']:.3f}")

print("\n🎯 Done!")



=== Pan ===


Pan Fold-0: 100%|██████████| 139/139 [00:14<00:00,  9.37it/s]
Pan Fold-1: 100%|██████████| 139/139 [00:13<00:00, 10.64it/s]
Pan Fold-2: 100%|██████████| 139/139 [00:12<00:00, 11.53it/s]
Pan Fold-3: 100%|██████████| 139/139 [00:12<00:00, 11.19it/s]
Pan Fold-4: 100%|██████████| 139/139 [00:11<00:00, 12.01it/s]
Pan Fold-5: 100%|██████████| 139/139 [00:11<00:00, 11.89it/s]
Pan Fold-6: 100%|██████████| 139/139 [00:13<00:00,  9.94it/s]
Pan Fold-7: 100%|██████████| 139/139 [00:13<00:00, 10.47it/s]
Pan Fold-8: 100%|██████████| 139/139 [00:12<00:00, 11.47it/s]
Pan Fold-9: 100%|██████████| 139/139 [00:11<00:00, 12.00it/s]



=== Maalej ===


Maalej Fold-0: 100%|██████████| 370/370 [00:43<00:00,  8.60it/s]
Maalej Fold-1: 100%|██████████| 370/370 [00:39<00:00,  9.39it/s]
Maalej Fold-2: 100%|██████████| 370/370 [00:43<00:00,  8.58it/s]
Maalej Fold-3: 100%|██████████| 370/370 [00:36<00:00, 10.24it/s]
Maalej Fold-4: 100%|██████████| 370/370 [00:40<00:00,  9.12it/s]
Maalej Fold-5: 100%|██████████| 370/370 [00:41<00:00,  8.82it/s]
Maalej Fold-6: 100%|██████████| 370/370 [00:40<00:00,  9.15it/s]
Maalej Fold-7: 100%|██████████| 370/370 [00:40<00:00,  9.03it/s]
Maalej Fold-8: 100%|██████████| 370/370 [00:39<00:00,  9.42it/s]
Maalej Fold-9: 100%|██████████| 370/370 [00:40<00:00,  9.06it/s]



=== Scalabrino ===


Scalabrino Fold-0: 100%|██████████| 300/300 [02:13<00:00,  2.25it/s]
Scalabrino Fold-1: 100%|██████████| 300/300 [02:07<00:00,  2.35it/s]
Scalabrino Fold-2: 100%|██████████| 300/300 [02:06<00:00,  2.38it/s]
Scalabrino Fold-3: 100%|██████████| 300/300 [02:04<00:00,  2.41it/s]
Scalabrino Fold-4: 100%|██████████| 300/300 [02:11<00:00,  2.28it/s]
Scalabrino Fold-5: 100%|██████████| 300/300 [02:06<00:00,  2.38it/s]
Scalabrino Fold-6: 100%|██████████| 300/300 [02:07<00:00,  2.36it/s]
Scalabrino Fold-7: 100%|██████████| 300/300 [02:13<00:00,  2.25it/s]
Scalabrino Fold-8: 100%|██████████| 300/300 [02:11<00:00,  2.28it/s]
Scalabrino Fold-9: 100%|██████████| 300/300 [02:11<00:00,  2.29it/s]



📁 Saved per-fold macro/micro results.

📊 Computing final combined weighted metrics...
✅ Pan: macro-F1=0.662, weighted-F1=0.749
✅ Maalej: macro-F1=0.404, weighted-F1=0.452
✅ Scalabrino: macro-F1=0.475, weighted-F1=0.288

🎯 Done!


In [12]:
# few_shot_folds_ollama.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "pan"
OLLAMA_MODEL = "llama3:70b"
K_VALUES = [1, 3, 5, 7]
SEED = 42

FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
MODEL_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)
random.seed(SEED)

LABEL_LIST = [
    "Information Giving",
    "Problem Discovery",
    "Feature Request",
    "Information Seeking",
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Information Giving: factual descriptions, app details, or user opinions.\n"
    "- Problem Discovery: issues, bugs, crashes, or errors.\n"
    "- Feature Request: suggestions or desired improvements.\n"
    "- Information Seeking: questions or asking for help.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"
)

llm = OllamaLLM(model=OLLAMA_MODEL)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN LOOP ===============
all_metrics = []
summary_rows = []

for fold in range(10):
    test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
    train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
    test_df = pd.read_csv(test_path).reset_index(drop=True)
    train_df = pd.read_csv(train_path).reset_index(drop=True)

    labels = sorted(train_df["class"].unique())

    for k in K_VALUES:
        fewshot_df = k_examples_per_class(train_df, k)
        example_block = build_examples(fewshot_df)
        examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

        y_true, preds = [], []

        for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                           desc=f"{DATASET_NAME.upper()} | Fold-{fold} | {k}-shot", unit="req"):

            prompt = PROMPT_TEMPLATE.format(
                examples_section=examples_section,
                req_text=row.review
            )

            resp = call_llm(prompt)
            pred = extract_label(resp, labels)
            y_true.append(row["class"])
            preds.append(pred)

        # ─── METRICS ──────────────────────────────
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="macro", zero_division=0)
        micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="micro", zero_division=0)
        weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="weighted", zero_division=0)

        # ─── SAVE PREDICTIONS ─────────────────────
        pd.DataFrame({
            "text": test_df.review,
            "gold": y_true,
            "pred": preds
        }).to_csv(
            os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
            index=False, encoding="utf-8"
        )

        # ─── RECORD METRICS ───────────────────────
        summary_rows.append({
            "dataset": DATASET_NAME,
            "model": OLLAMA_MODEL,
            "fold": fold,
            "k_shot": k,
            "macro_f1": macro_f1,
            "micro_f1": micro_f1,
            "weighted_f1": weighted_f1
        })

        for i, lab in enumerate(labels):
            all_metrics.append({
                "dataset": DATASET_NAME,
                "model": OLLAMA_MODEL,
                "fold": fold,
                "k_shot": k,
                "class_label": lab,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1,
            })

# ─── SAVE RESULTS ───────────────────────────────
pd.DataFrame(summary_rows).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_folded_summary.csv"),
    index=False, encoding="utf-8"
)

pd.DataFrame(all_metrics).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_all_metrics.csv"),
    index=False, encoding="utf-8"
)

print("\n✅ Few-shot evaluation complete.")


PAN | Fold-0 | 1-shot: 100%|██████████| 139/139 [00:43<00:00,  3.19req/s]
PAN | Fold-0 | 3-shot: 100%|██████████| 139/139 [00:44<00:00,  3.14req/s]
PAN | Fold-0 | 5-shot: 100%|██████████| 139/139 [00:43<00:00,  3.17req/s]
PAN | Fold-0 | 7-shot: 100%|██████████| 139/139 [00:44<00:00,  3.12req/s]
PAN | Fold-1 | 1-shot: 100%|██████████| 139/139 [00:43<00:00,  3.18req/s]
PAN | Fold-1 | 3-shot: 100%|██████████| 139/139 [00:44<00:00,  3.10req/s]
PAN | Fold-1 | 5-shot: 100%|██████████| 139/139 [00:45<00:00,  3.05req/s]
PAN | Fold-1 | 7-shot: 100%|██████████| 139/139 [00:45<00:00,  3.05req/s]
PAN | Fold-2 | 1-shot: 100%|██████████| 139/139 [00:45<00:00,  3.09req/s]
PAN | Fold-2 | 3-shot: 100%|██████████| 139/139 [00:44<00:00,  3.11req/s]
PAN | Fold-2 | 5-shot: 100%|██████████| 139/139 [00:45<00:00,  3.04req/s]
PAN | Fold-2 | 7-shot: 100%|██████████| 139/139 [00:46<00:00,  2.98req/s]
PAN | Fold-3 | 1-shot: 100%|██████████| 139/139 [00:45<00:00,  3.08req/s]
PAN | Fold-3 | 3-shot: 100%|██████████


✅ Few-shot evaluation complete.





In [14]:
# few_shot_folds_ollama.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "pan"
OLLAMA_MODEL = "llama3:8b"
K_VALUES = [1, 3, 5, 7]
SEED = 42

FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
MODEL_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)
random.seed(SEED)

LABEL_LIST = [
    "Information Giving",
    "Problem Discovery",
    "Feature Request",
    "Information Seeking",
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Information Giving: factual descriptions, app details, or user opinions.\n"
    "- Problem Discovery: issues, bugs, crashes, or errors.\n"
    "- Feature Request: suggestions or desired improvements.\n"
    "- Information Seeking: questions or asking for help.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"
)

llm = OllamaLLM(model=OLLAMA_MODEL)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN LOOP ===============
all_metrics = []
summary_rows = []

for fold in range(10):
    test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
    train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
    test_df = pd.read_csv(test_path).reset_index(drop=True)
    train_df = pd.read_csv(train_path).reset_index(drop=True)

    labels = sorted(train_df["class"].unique())

    for k in K_VALUES:
        fewshot_df = k_examples_per_class(train_df, k)
        example_block = build_examples(fewshot_df)
        examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

        y_true, preds = [], []

        for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                           desc=f"{DATASET_NAME.upper()} | Fold-{fold} | {k}-shot", unit="req"):

            prompt = PROMPT_TEMPLATE.format(
                examples_section=examples_section,
                req_text=row.review
            )

            resp = call_llm(prompt)
            pred = extract_label(resp, labels)
            y_true.append(row["class"])
            preds.append(pred)

        # ─── METRICS ──────────────────────────────
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="macro", zero_division=0)
        micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="micro", zero_division=0)
        weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="weighted", zero_division=0)

        # ─── SAVE PREDICTIONS ─────────────────────
        pd.DataFrame({
            "text": test_df.review,
            "gold": y_true,
            "pred": preds
        }).to_csv(
            os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
            index=False, encoding="utf-8"
        )

        # ─── RECORD METRICS ───────────────────────
        summary_rows.append({
            "dataset": DATASET_NAME,
            "model": OLLAMA_MODEL,
            "fold": fold,
            "k_shot": k,
            "macro_f1": macro_f1,
            "micro_f1": micro_f1,
            "weighted_f1": weighted_f1
        })

        for i, lab in enumerate(labels):
            all_metrics.append({
                "dataset": DATASET_NAME,
                "model": OLLAMA_MODEL,
                "fold": fold,
                "k_shot": k,
                "class_label": lab,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1,
            })

# ─── SAVE RESULTS ───────────────────────────────
pd.DataFrame(summary_rows).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_folded_summary.csv"),
    index=False, encoding="utf-8"
)

pd.DataFrame(all_metrics).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_all_metrics.csv"),
    index=False, encoding="utf-8"
)

print("\n✅ Few-shot evaluation complete.")


PAN | Fold-0 | 1-shot: 100%|██████████| 139/139 [00:28<00:00,  4.80req/s]
PAN | Fold-0 | 3-shot: 100%|██████████| 139/139 [00:26<00:00,  5.27req/s]
PAN | Fold-0 | 5-shot: 100%|██████████| 139/139 [00:26<00:00,  5.18req/s]
PAN | Fold-0 | 7-shot: 100%|██████████| 139/139 [00:27<00:00,  5.15req/s]
PAN | Fold-1 | 1-shot: 100%|██████████| 139/139 [00:26<00:00,  5.29req/s]
PAN | Fold-1 | 3-shot: 100%|██████████| 139/139 [00:26<00:00,  5.19req/s]
PAN | Fold-1 | 5-shot: 100%|██████████| 139/139 [00:26<00:00,  5.20req/s]
PAN | Fold-1 | 7-shot: 100%|██████████| 139/139 [00:27<00:00,  5.13req/s]
PAN | Fold-2 | 1-shot: 100%|██████████| 139/139 [00:26<00:00,  5.27req/s]
PAN | Fold-2 | 3-shot: 100%|██████████| 139/139 [00:26<00:00,  5.27req/s]
PAN | Fold-2 | 5-shot: 100%|██████████| 139/139 [00:26<00:00,  5.26req/s]
PAN | Fold-2 | 7-shot: 100%|██████████| 139/139 [00:27<00:00,  5.13req/s]
PAN | Fold-3 | 1-shot: 100%|██████████| 139/139 [00:26<00:00,  5.24req/s]
PAN | Fold-3 | 3-shot: 100%|██████████


✅ Few-shot evaluation complete.


In [15]:
# few_shot_folds_ollama.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "pan"
OLLAMA_MODEL = "mistral:7b"
K_VALUES = [1, 3, 5, 7]
SEED = 42

FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
MODEL_CLEAN = OLLAMA_MODEL.replace(":", "_").replace("/", "_")
OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
PRED_DIR = os.path.join(OUT_DIR, "preds")
os.makedirs(PRED_DIR, exist_ok=True)
random.seed(SEED)

LABEL_LIST = [
    "Information Giving",
    "Problem Discovery",
    "Feature Request",
    "Information Seeking",
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Information Giving: factual descriptions, app details, or user opinions.\n"
    "- Problem Discovery: issues, bugs, crashes, or errors.\n"
    "- Feature Request: suggestions or desired improvements.\n"
    "- Information Seeking: questions or asking for help.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"
)

llm = OllamaLLM(model=OLLAMA_MODEL)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN LOOP ===============
all_metrics = []
summary_rows = []

for fold in range(10):
    test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
    train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
    test_df = pd.read_csv(test_path).reset_index(drop=True)
    train_df = pd.read_csv(train_path).reset_index(drop=True)

    labels = sorted(train_df["class"].unique())

    for k in K_VALUES:
        fewshot_df = k_examples_per_class(train_df, k)
        example_block = build_examples(fewshot_df)
        examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

        y_true, preds = [], []

        for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                           desc=f"{DATASET_NAME.upper()} | Fold-{fold} | {k}-shot", unit="req"):

            prompt = PROMPT_TEMPLATE.format(
                examples_section=examples_section,
                req_text=row.review
            )

            resp = call_llm(prompt)
            pred = extract_label(resp, labels)
            y_true.append(row["class"])
            preds.append(pred)

        # ─── METRICS ──────────────────────────────
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="macro", zero_division=0)
        micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="micro", zero_division=0)
        weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
            y_true, preds, labels=labels, average="weighted", zero_division=0)

        # ─── SAVE PREDICTIONS ─────────────────────
        pd.DataFrame({
            "text": test_df.review,
            "gold": y_true,
            "pred": preds
        }).to_csv(
            os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
            index=False, encoding="utf-8"
        )

        # ─── RECORD METRICS ───────────────────────
        summary_rows.append({
            "dataset": DATASET_NAME,
            "model": OLLAMA_MODEL,
            "fold": fold,
            "k_shot": k,
            "macro_f1": macro_f1,
            "micro_f1": micro_f1,
            "weighted_f1": weighted_f1
        })

        for i, lab in enumerate(labels):
            all_metrics.append({
                "dataset": DATASET_NAME,
                "model": OLLAMA_MODEL,
                "fold": fold,
                "k_shot": k,
                "class_label": lab,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1,
            })

# ─── SAVE RESULTS ───────────────────────────────
pd.DataFrame(summary_rows).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_folded_summary.csv"),
    index=False, encoding="utf-8"
)

pd.DataFrame(all_metrics).to_csv(
    os.path.join(OUT_DIR, "pan_fewshot_all_metrics.csv"),
    index=False, encoding="utf-8"
)

print("\n✅ Few-shot evaluation complete.")


PAN | Fold-0 | 1-shot: 100%|██████████| 139/139 [00:14<00:00,  9.64req/s]
PAN | Fold-0 | 3-shot: 100%|██████████| 139/139 [00:12<00:00, 11.40req/s]
PAN | Fold-0 | 5-shot: 100%|██████████| 139/139 [00:12<00:00, 11.04req/s]
PAN | Fold-0 | 7-shot: 100%|██████████| 139/139 [00:12<00:00, 10.86req/s]
PAN | Fold-1 | 1-shot: 100%|██████████| 139/139 [00:12<00:00, 11.34req/s]
PAN | Fold-1 | 3-shot: 100%|██████████| 139/139 [00:12<00:00, 11.38req/s]
PAN | Fold-1 | 5-shot: 100%|██████████| 139/139 [00:12<00:00, 10.96req/s]
PAN | Fold-1 | 7-shot: 100%|██████████| 139/139 [00:12<00:00, 10.72req/s]
PAN | Fold-2 | 1-shot: 100%|██████████| 139/139 [00:11<00:00, 11.61req/s]
PAN | Fold-2 | 3-shot: 100%|██████████| 139/139 [00:12<00:00, 11.13req/s]
PAN | Fold-2 | 5-shot: 100%|██████████| 139/139 [00:12<00:00, 11.19req/s]
PAN | Fold-2 | 7-shot: 100%|██████████| 139/139 [00:13<00:00, 10.64req/s]
PAN | Fold-3 | 1-shot: 100%|██████████| 139/139 [00:12<00:00, 11.46req/s]
PAN | Fold-3 | 3-shot: 100%|██████████


✅ Few-shot evaluation complete.





In [16]:
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "pan"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "Information Giving",
    "Problem Discovery",
    "Feature Request",
    "Information Seeking",
]

OLLAMA_MODELS = [
  
    "gemma3:4b",
    "wizardlm2:7b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Information Giving: factual descriptions, app details, or user opinions.\n"
    "- Problem Discovery: issues, bugs, crashes, or errors.\n"
    "- Feature Request: suggestions or desired improvements.\n"
    "- Information Seeking: questions or asking for help.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"
)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


gemma3:4b | Fold-0 | 1-shot: 100%|██████████| 139/139 [00:38<00:00,  3.60req/s]
gemma3:4b | Fold-0 | 3-shot: 100%|██████████| 139/139 [00:34<00:00,  4.00req/s]
gemma3:4b | Fold-0 | 5-shot: 100%|██████████| 139/139 [00:34<00:00,  4.03req/s]
gemma3:4b | Fold-0 | 7-shot: 100%|██████████| 139/139 [00:35<00:00,  3.95req/s]
gemma3:4b | Fold-1 | 1-shot: 100%|██████████| 139/139 [00:34<00:00,  4.00req/s]
gemma3:4b | Fold-1 | 3-shot: 100%|██████████| 139/139 [00:33<00:00,  4.10req/s]
gemma3:4b | Fold-1 | 5-shot: 100%|██████████| 139/139 [00:36<00:00,  3.86req/s]
gemma3:4b | Fold-1 | 7-shot: 100%|██████████| 139/139 [00:35<00:00,  3.91req/s]
gemma3:4b | Fold-2 | 1-shot: 100%|██████████| 139/139 [00:35<00:00,  3.96req/s]
gemma3:4b | Fold-2 | 3-shot: 100%|██████████| 139/139 [00:34<00:00,  4.04req/s]
gemma3:4b | Fold-2 | 5-shot: 100%|██████████| 139/139 [00:35<00:00,  3.94req/s]
gemma3:4b | Fold-2 | 7-shot: 100%|██████████| 139/139 [00:34<00:00,  4.03req/s]
gemma3:4b | Fold-3 | 1-shot: 100%|██████


✅ gemma3:4b few-shot evaluation complete.


wizardlm2:7b | Fold-0 | 1-shot: 100%|██████████| 139/139 [00:13<00:00, 10.06req/s]
wizardlm2:7b | Fold-0 | 3-shot: 100%|██████████| 139/139 [00:11<00:00, 11.68req/s]
wizardlm2:7b | Fold-0 | 5-shot: 100%|██████████| 139/139 [00:11<00:00, 11.61req/s]
wizardlm2:7b | Fold-0 | 7-shot: 100%|██████████| 139/139 [00:11<00:00, 11.65req/s]
wizardlm2:7b | Fold-1 | 1-shot: 100%|██████████| 139/139 [00:11<00:00, 12.42req/s]
wizardlm2:7b | Fold-1 | 3-shot: 100%|██████████| 139/139 [00:11<00:00, 11.97req/s]
wizardlm2:7b | Fold-1 | 5-shot: 100%|██████████| 139/139 [00:12<00:00, 11.23req/s]
wizardlm2:7b | Fold-1 | 7-shot: 100%|██████████| 139/139 [00:12<00:00, 11.11req/s]
wizardlm2:7b | Fold-2 | 1-shot: 100%|██████████| 139/139 [00:11<00:00, 12.19req/s]
wizardlm2:7b | Fold-2 | 3-shot: 100%|██████████| 139/139 [00:11<00:00, 11.63req/s]
wizardlm2:7b | Fold-2 | 5-shot: 100%|██████████| 139/139 [00:12<00:00, 11.25req/s]
wizardlm2:7b | Fold-2 | 7-shot: 100%|██████████| 139/139 [00:12<00:00, 11.38req/s]
wiza


✅ wizardlm2:7b few-shot evaluation complete.





In [9]:
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "maalej"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "feature request",
    "rating",
    "user experience",
    "problem discovery",
]

OLLAMA_MODELS = [
    "llama3:70b",
    "llama3:8b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Feature Request: suggestions or desires for new features, functionality, or improvements. "
    "Often uses words like 'add', 'should have', 'need', 'wish', 'please include'.\n"
    "- Rating: numeric scores (e.g. 5 stars) or general feedback like 'great app' or 'bad service' without "
    "any specific details.\n"
    "- User Experience: opinions about usability, speed, UI, navigation, or general interaction quality. "
    "Might mention design, slow loading, or ease of use.\n"
    "- Problem Discovery: reports of bugs, crashes, errors, or issues. Often contains words like "
    "'doesn't work', 'crash', 'bug', 'problem', 'error'.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"

)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


llama3:70b | Fold-0 | 1-shot: 100%|██████████| 370/370 [01:59<00:00,  3.11req/s]
llama3:70b | Fold-0 | 3-shot:   5%|▍         | 17/370 [00:05<02:01,  2.91req/s]


KeyboardInterrupt: 

In [None]:
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "maalej"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "feature request",
    "rating",
    "user experience",
    "problem discovery",
]

OLLAMA_MODELS = [
   
    "mistral:7b",
    "gemma3:4b",
    "wizardlm2:7b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Feature Request: suggestions or desires for new features, functionality, or improvements. "
    "Often uses words like 'add', 'should have', 'need', 'wish', 'please include'.\n"
    "- Rating: numeric scores (e.g. 5 stars) or general feedback like 'great app' or 'bad service' without "
    "any specific details.\n"
    "- User Experience: opinions about usability, speed, UI, navigation, or general interaction quality. "
    "Might mention design, slow loading, or ease of use.\n"
    "- Problem Discovery: reports of bugs, crashes, errors, or issues. Often contains words like "
    "'doesn't work', 'crash', 'bug', 'problem', 'error'.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"

)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


In [10]:
#scalab_few_shots
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "scalabrino"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "BUG",
    "FEATURE",
    "PERFORMANCE",
    "ENERGY",
    "OTHER",
    "SECURITY",
    "USABILITY",
]

OLLAMA_MODELS = [
   
    "llama3:70b",
    "llama3:8b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Feature Request: suggestions or desires for new features, functionality, or improvements. "
    "Often uses words like 'add', 'should have', 'need', 'wish', 'please include'.\n"
    "- Rating: numeric scores (e.g. 5 stars) or general feedback like 'great app' or 'bad service' without "
    "any specific details.\n"
    "- User Experience: opinions about usability, speed, UI, navigation, or general interaction quality. "
    "Might mention design, slow loading, or ease of use.\n"
    "- Problem Discovery: reports of bugs, crashes, errors, or issues. Often contains words like "
    "'doesn't work', 'crash', 'bug', 'problem', 'error'.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"

)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


llama3:70b | Fold-0 | 1-shot: 100%|██████████| 300/300 [01:40<00:00,  2.97req/s]
llama3:70b | Fold-0 | 3-shot: 100%|██████████| 300/300 [01:42<00:00,  2.92req/s]
llama3:70b | Fold-0 | 5-shot: 100%|██████████| 300/300 [01:44<00:00,  2.86req/s]
llama3:70b | Fold-0 | 7-shot: 100%|██████████| 300/300 [01:45<00:00,  2.85req/s]
llama3:70b | Fold-1 | 1-shot: 100%|██████████| 300/300 [01:40<00:00,  2.98req/s]
llama3:70b | Fold-1 | 3-shot: 100%|██████████| 300/300 [01:43<00:00,  2.89req/s]
llama3:70b | Fold-1 | 5-shot: 100%|██████████| 300/300 [01:43<00:00,  2.90req/s]
llama3:70b | Fold-1 | 7-shot: 100%|██████████| 300/300 [01:45<00:00,  2.84req/s]
llama3:70b | Fold-2 | 1-shot: 100%|██████████| 300/300 [01:41<00:00,  2.96req/s]
llama3:70b | Fold-2 | 3-shot: 100%|██████████| 300/300 [01:42<00:00,  2.91req/s]
llama3:70b | Fold-2 | 5-shot: 100%|██████████| 300/300 [01:43<00:00,  2.90req/s]
llama3:70b | Fold-2 | 7-shot: 100%|██████████| 300/300 [01:45<00:00,  2.84req/s]
llama3:70b | Fold-3 | 1-shot


✅ llama3:70b few-shot evaluation complete.


llama3:8b | Fold-0 | 1-shot: 100%|██████████| 300/300 [01:00<00:00,  4.95req/s]
llama3:8b | Fold-0 | 3-shot: 100%|██████████| 300/300 [00:57<00:00,  5.18req/s]
llama3:8b | Fold-0 | 5-shot: 100%|██████████| 300/300 [00:58<00:00,  5.12req/s]
llama3:8b | Fold-0 | 7-shot: 100%|██████████| 300/300 [00:59<00:00,  5.08req/s]
llama3:8b | Fold-1 | 1-shot: 100%|██████████| 300/300 [00:59<00:00,  5.07req/s]
llama3:8b | Fold-1 | 3-shot: 100%|██████████| 300/300 [00:58<00:00,  5.13req/s]
llama3:8b | Fold-1 | 5-shot: 100%|██████████| 300/300 [00:58<00:00,  5.17req/s]
llama3:8b | Fold-2 | 3-shot: 100%|██████████| 300/300 [00:58<00:00,  5.13req/s]
llama3:8b | Fold-2 | 5-shot: 100%|██████████| 300/300 [00:58<00:00,  5.11req/s]
llama3:8b | Fold-2 | 7-shot: 100%|██████████| 300/300 [00:59<00:00,  5.08req/s]
llama3:8b | Fold-3 | 1-shot: 100%|██████████| 300/300 [00:58<00:00,  5.11req/s]
llama3:8b | Fold-3 | 3-shot: 100%|██████████| 300/300 [00:58<00:00,  5.11req/s]
llama3:8b | Fold-3 | 5-shot: 100%|██████


✅ llama3:8b few-shot evaluation complete.





In [1]:
#scalab_few_shots
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "scalabrino"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "BUG",
    "FEATURE",
    "PERFORMANCE",
    "ENERGY",
    "OTHER",
    "SECURITY",
    "USABILITY",
]

OLLAMA_MODELS = [
   
    "mistral:7b",
    "gemma3:4b",
    "wizardlm2:7b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Feature Request: suggestions or desires for new features, functionality, or improvements. "
    "Often uses words like 'add', 'should have', 'need', 'wish', 'please include'.\n"
    "- Rating: numeric scores (e.g. 5 stars) or general feedback like 'great app' or 'bad service' without "
    "any specific details.\n"
    "- User Experience: opinions about usability, speed, UI, navigation, or general interaction quality. "
    "Might mention design, slow loading, or ease of use.\n"
    "- Problem Discovery: reports of bugs, crashes, errors, or issues. Often contains words like "
    "'doesn't work', 'crash', 'bug', 'problem', 'error'.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"

)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


mistral:7b | Fold-0 | 1-shot: 100%|██████████| 300/300 [00:31<00:00,  9.57req/s]
mistral:7b | Fold-0 | 3-shot: 100%|██████████| 300/300 [00:32<00:00,  9.25req/s]
mistral:7b | Fold-0 | 5-shot: 100%|██████████| 300/300 [00:30<00:00,  9.80req/s]
mistral:7b | Fold-0 | 7-shot: 100%|██████████| 300/300 [00:31<00:00,  9.38req/s]
mistral:7b | Fold-1 | 1-shot: 100%|██████████| 300/300 [00:32<00:00,  9.23req/s]
mistral:7b | Fold-1 | 3-shot: 100%|██████████| 300/300 [00:32<00:00,  9.25req/s]
mistral:7b | Fold-1 | 5-shot: 100%|██████████| 300/300 [00:33<00:00,  9.00req/s]
mistral:7b | Fold-1 | 7-shot: 100%|██████████| 300/300 [00:34<00:00,  8.68req/s]
mistral:7b | Fold-2 | 1-shot: 100%|██████████| 300/300 [00:29<00:00, 10.31req/s]
mistral:7b | Fold-2 | 3-shot: 100%|██████████| 300/300 [00:28<00:00, 10.42req/s]
mistral:7b | Fold-2 | 5-shot: 100%|██████████| 300/300 [00:31<00:00,  9.50req/s]
mistral:7b | Fold-2 | 7-shot: 100%|██████████| 300/300 [00:32<00:00,  9.35req/s]
mistral:7b | Fold-3 | 1-shot


✅ mistral:7b few-shot evaluation complete.


gemma3:4b | Fold-0 | 1-shot: 100%|██████████| 300/300 [01:16<00:00,  3.92req/s]
gemma3:4b | Fold-0 | 3-shot: 100%|██████████| 300/300 [01:13<00:00,  4.08req/s]
gemma3:4b | Fold-0 | 5-shot: 100%|██████████| 300/300 [01:14<00:00,  4.02req/s]
gemma3:4b | Fold-0 | 7-shot: 100%|██████████| 300/300 [01:15<00:00,  4.00req/s]
gemma3:4b | Fold-1 | 1-shot: 100%|██████████| 300/300 [01:13<00:00,  4.10req/s]
gemma3:4b | Fold-1 | 3-shot: 100%|██████████| 300/300 [01:13<00:00,  4.11req/s]
gemma3:4b | Fold-1 | 5-shot: 100%|██████████| 300/300 [01:14<00:00,  4.04req/s]
gemma3:4b | Fold-1 | 7-shot: 100%|██████████| 300/300 [01:15<00:00,  3.96req/s]
gemma3:4b | Fold-2 | 1-shot: 100%|██████████| 300/300 [01:13<00:00,  4.11req/s]
gemma3:4b | Fold-2 | 3-shot: 100%|██████████| 300/300 [01:12<00:00,  4.11req/s]
gemma3:4b | Fold-2 | 5-shot: 100%|██████████| 300/300 [01:14<00:00,  4.04req/s]
gemma3:4b | Fold-2 | 7-shot: 100%|██████████| 300/300 [01:14<00:00,  4.01req/s]
gemma3:4b | Fold-3 | 1-shot: 100%|██████

KeyboardInterrupt: 

In [3]:
#scalab_few_shots
# run_few_shot_all_models.py
import os
import time
import random
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from langchain_ollama import OllamaLLM

# =============== CONFIG ===============
DATASET_NAME = "scalabrino"
FOLD_DIR = f"stratified_splits_80_10_10/{DATASET_NAME}"
K_VALUES = [1, 3, 5, 7]
SEED = 42
random.seed(SEED)

LABEL_LIST = [
    "BUG",
    "FEATURE",
    "PERFORMANCE",
    "ENERGY",
    "OTHER",
    "SECURITY",
    "USABILITY",
]

OLLAMA_MODELS = [
   

    "gemma3:4b",
    "wizardlm2:7b"
]

PROMPT_TEMPLATE = (
    "You are a requirement classification assistant.\n"
    "Your task is to classify a user review into ONE of the following four categories:\n\n"
    "- Feature Request: suggestions or desires for new features, functionality, or improvements. "
    "Often uses words like 'add', 'should have', 'need', 'wish', 'please include'.\n"
    "- Rating: numeric scores (e.g. 5 stars) or general feedback like 'great app' or 'bad service' without "
    "any specific details.\n"
    "- User Experience: opinions about usability, speed, UI, navigation, or general interaction quality. "
    "Might mention design, slow loading, or ease of use.\n"
    "- Problem Discovery: reports of bugs, crashes, errors, or issues. Often contains words like "
    "'doesn't work', 'crash', 'bug', 'problem', 'error'.\n\n"
    "{examples_section}"
    "Return ONLY the label name, no explanation.\n\n"
    "Review: {req_text}\n"
    "Label:"

)

# =============== HELPERS ===============
def build_examples(df):
    return "\n".join(f"{row.review} , {row['class']}" for _, row in df.iterrows())

def call_llm(llm, prompt: str) -> str:
    return llm.invoke(prompt).strip()

def extract_label(response: str, labels: list[str]) -> str:
    rl = response.lower()
    for lab in labels:
        if lab.lower() in rl:
            return lab
    return labels[0]

def k_examples_per_class(df: pd.DataFrame, k: int) -> pd.DataFrame:
    return (
        df.assign(len=df.review.str.len())
          .sort_values("len")
          .groupby("class", group_keys=False)
          .head(k)
          .drop(columns="len")
          .reset_index(drop=True)
    )

# =============== MAIN ===============
for model_name in OLLAMA_MODELS:
    MODEL_CLEAN = model_name.replace(":", "_").replace("/", "_")
    OUT_DIR = f"few_shot_folds_{MODEL_CLEAN}_{DATASET_NAME}"
    PRED_DIR = os.path.join(OUT_DIR, "preds")
    os.makedirs(PRED_DIR, exist_ok=True)

    llm = OllamaLLM(model=model_name)

    all_metrics = []
    summary_rows = []

    for fold in range(10):
        test_path = os.path.join(FOLD_DIR, f"test_fold_{fold}.csv")
        train_path = os.path.join(FOLD_DIR, f"train_fold_{fold}.csv")
        test_df = pd.read_csv(test_path).reset_index(drop=True)
        train_df = pd.read_csv(train_path).reset_index(drop=True)

        labels = sorted(train_df["class"].unique())

        for k in K_VALUES:
            fewshot_df = k_examples_per_class(train_df, k)
            example_block = build_examples(fewshot_df)
            examples_section = f"Here are {k} examples (text , class):\n{example_block}\n\n"

            y_true, preds = [], []

            for _, row in tqdm(test_df.iterrows(), total=len(test_df),
                               desc=f"{model_name} | Fold-{fold} | {k}-shot", unit="req"):

                prompt = PROMPT_TEMPLATE.format(
                    examples_section=examples_section,
                    req_text=row.review
                )

                resp = call_llm(llm, prompt)
                pred = extract_label(resp, labels)
                y_true.append(row["class"])
                preds.append(pred)

            # ─── METRICS ──────────────────────────────
            macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="macro", zero_division=0)
            micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="micro", zero_division=0)
            weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
                y_true, preds, labels=labels, average="weighted", zero_division=0)

            # ─── SAVE PREDICTIONS ─────────────────────
            pd.DataFrame({
                "text": test_df.review,
                "gold": y_true,
                "pred": preds
            }).to_csv(
                os.path.join(PRED_DIR, f"{DATASET_NAME}_fold{fold}_{k}shot_preds.csv"),
                index=False, encoding="utf-8"
            )

            # ─── RECORD METRICS ───────────────────────
            summary_rows.append({
                "dataset": DATASET_NAME,
                "model": model_name,
                "fold": fold,
                "k_shot": k,
                "macro_f1": macro_f1,
                "micro_f1": micro_f1,
                "weighted_f1": weighted_f1
            })

            for i, lab in enumerate(labels):
                all_metrics.append({
                    "dataset": DATASET_NAME,
                    "model": model_name,
                    "fold": fold,
                    "k_shot": k,
                    "class_label": lab,
                    "macro_f1": macro_f1,
                    "micro_f1": micro_f1,
                    "weighted_f1": weighted_f1,
                })

    # ─── SAVE RESULTS ───────────────────────────────
    pd.DataFrame(summary_rows).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_folded_summary.csv"),
        index=False, encoding="utf-8"
    )

    pd.DataFrame(all_metrics).to_csv(
        os.path.join(OUT_DIR, f"{DATASET_NAME}_fewshot_all_metrics.csv"),
        index=False, encoding="utf-8"
    )

    print(f"\n✅ {model_name} few-shot evaluation complete.")


gemma3:4b | Fold-0 | 1-shot: 100%|██████████| 300/300 [01:15<00:00,  3.96req/s]
gemma3:4b | Fold-0 | 3-shot: 100%|██████████| 300/300 [01:13<00:00,  4.09req/s]
gemma3:4b | Fold-0 | 5-shot: 100%|██████████| 300/300 [01:15<00:00,  3.98req/s]
gemma3:4b | Fold-0 | 7-shot: 100%|██████████| 300/300 [01:14<00:00,  4.04req/s]
gemma3:4b | Fold-1 | 1-shot: 100%|██████████| 300/300 [01:12<00:00,  4.13req/s]
gemma3:4b | Fold-1 | 3-shot: 100%|██████████| 300/300 [01:14<00:00,  4.05req/s]
gemma3:4b | Fold-1 | 5-shot: 100%|██████████| 300/300 [01:13<00:00,  4.06req/s]
gemma3:4b | Fold-1 | 7-shot: 100%|██████████| 300/300 [01:15<00:00,  3.95req/s]
gemma3:4b | Fold-2 | 1-shot: 100%|██████████| 300/300 [01:12<00:00,  4.13req/s]
gemma3:4b | Fold-2 | 3-shot: 100%|██████████| 300/300 [01:13<00:00,  4.10req/s]
gemma3:4b | Fold-2 | 5-shot: 100%|██████████| 300/300 [01:14<00:00,  4.05req/s]
gemma3:4b | Fold-2 | 7-shot: 100%|██████████| 300/300 [01:14<00:00,  4.03req/s]
gemma3:4b | Fold-3 | 1-shot: 100%|██████


✅ gemma3:4b few-shot evaluation complete.


wizardlm2:7b | Fold-0 | 1-shot: 100%|██████████| 300/300 [00:37<00:00,  7.95req/s]
wizardlm2:7b | Fold-0 | 3-shot: 100%|██████████| 300/300 [00:31<00:00,  9.48req/s]
wizardlm2:7b | Fold-0 | 5-shot: 100%|██████████| 300/300 [00:34<00:00,  8.70req/s]
wizardlm2:7b | Fold-0 | 7-shot: 100%|██████████| 300/300 [00:33<00:00,  9.05req/s]
wizardlm2:7b | Fold-1 | 1-shot: 100%|██████████| 300/300 [00:32<00:00,  9.27req/s]
wizardlm2:7b | Fold-1 | 3-shot: 100%|██████████| 300/300 [00:35<00:00,  8.41req/s]
wizardlm2:7b | Fold-1 | 5-shot: 100%|██████████| 300/300 [00:31<00:00,  9.54req/s]
wizardlm2:7b | Fold-1 | 7-shot: 100%|██████████| 300/300 [00:36<00:00,  8.11req/s]
wizardlm2:7b | Fold-2 | 1-shot: 100%|██████████| 300/300 [00:34<00:00,  8.78req/s]
wizardlm2:7b | Fold-2 | 3-shot: 100%|██████████| 300/300 [00:31<00:00,  9.64req/s]
wizardlm2:7b | Fold-2 | 5-shot: 100%|██████████| 300/300 [00:33<00:00,  9.02req/s]
wizardlm2:7b | Fold-2 | 7-shot: 100%|██████████| 300/300 [00:36<00:00,  8.17req/s]
wiza


✅ wizardlm2:7b few-shot evaluation complete.





In [1]:
#f1 score analysis
import os
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from collections import defaultdict

# ─── CONFIG ─────────────────────────────
ZS_PREFIX = "zero_shot_folds_"
FS_PREFIX = "few_shot_folds_"
DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
K_SHOTS = [1, 3, 5, 7]
FOLDS = range(10)
OUT_DIR = "f1_analysis"
os.makedirs(OUT_DIR, exist_ok=True)

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model_name, dataset, fold, shot=None):
    if shot is None:
        # Zero-shot: all datasets in same folder
        path = os.path.join(f"{ZS_PREFIX}{model_name}", "preds", f"{dataset}_fold{fold}_preds.csv")
        print(f"ZERO-SHOT PATH: {path}")
    else:
        # Few-shot: separate folder per model-dataset combination
        path = os.path.join(f"{FS_PREFIX}{model_name}_{dataset}", "preds", f"{dataset}_fold{fold}_{shot}shot_preds.csv")
        print(f"FEW-SHOT PATH: {path}")
    
    # Check if directory exists
    dir_path = os.path.dirname(path)
    if not os.path.exists(dir_path):
        print(f"DIRECTORY MISSING: {dir_path}")
        return None
    
    if not os.path.exists(path):
        print(f"FILE MISSING: {path}")
        # List what files ARE in the directory
        try:
            files_in_dir = os.listdir(dir_path)
            print(f"Files in {dir_path}: {files_in_dir[:10]}")  # Show first 10 files
        except:
            print(f"Cannot list files in {dir_path}")
        return None
    
    try:
        df = pd.read_csv(path)
        print(f"SUCCESS: Loaded {len(df)} rows, columns: {list(df.columns)}")
        
        # Check if required columns exist
        if 'gold' not in df.columns:
            print(f"ERROR: 'gold' column missing from {path}")
            print(f"Available columns: {list(df.columns)}")
            return None
        if 'pred' not in df.columns:
            print(f"ERROR: 'pred' column missing from {path}")
            print(f"Available columns: {list(df.columns)}")
            return None
            
        return df[["text", "gold", "pred"]]
    except Exception as e:
        print(f"ERROR reading {path}: {e}")
        return None

# ─── CALCULATE F1 SCORES ──────────────────
print("Calculating F1 scores for all configurations...")
print(f"Expected zero-shot path format: {ZS_PREFIX}MODEL_NAME/preds/DATASET_foldN_preds.csv")
print(f"Expected few-shot path format: {FS_PREFIX}MODEL_NAME_DATASET/preds/DATASET_foldN_Nshot_preds.csv")

all_results = []

for model in MODELS:
    print(f"\n{'='*50}")
    print(f"Processing model: {model}")
    print(f"{'='*50}")
    
    # Process zero-shot first to debug
    print(f"\nProcessing ZERO-SHOT for {model}:")
    for dataset in DATASETS:
        print(f"\n  Dataset: {dataset}")
        dataset_f1_scores = []
        dataset_sample_counts = []
        
        for fold in FOLDS:
            df = load_preds(model, dataset, fold, shot=None)
            if df is None:
                continue
            
            # Validate data
            if len(df) == 0:
                print(f"    WARNING: Empty dataframe for {model} {dataset} fold {fold}")
                continue
                
            # Check for missing columns or data
            missing_gold = df["gold"].isna().sum()
            missing_pred = df["pred"].isna().sum()
            if missing_gold > 0 or missing_pred > 0:
                print(f"    WARNING: Missing data - gold: {missing_gold}, pred: {missing_pred}")
            
            # Clean predictions and gold labels
            gold = df["gold"].str.lower().str.strip()
            pred = df["pred"].str.lower().str.strip()
            
            # Calculate F1 score (macro average for multi-class)
            try:
                f1 = f1_score(gold, pred, average='macro', zero_division=0)
                dataset_f1_scores.append(f1)
                dataset_sample_counts.append(len(df))
                print(f"    Fold {fold}: F1={f1:.4f}, samples={len(df)}")
            except Exception as e:
                print(f"    ERROR calculating F1 for {model} {dataset} fold {fold}: {e}")
                continue
        
        if dataset_f1_scores:
            # Calculate weighted average F1 for this dataset
            weights = np.array(dataset_sample_counts)
            avg_f1 = np.average(dataset_f1_scores, weights=weights)
            
            print(f"  {dataset} ZERO-SHOT: Avg F1={avg_f1:.4f} (from {len(dataset_f1_scores)} folds)")
            
            all_results.append({
                'model': model,
                'shot_config': '0-shot',
                'dataset': dataset,
                'avg_f1': avg_f1,
                'total_samples': sum(dataset_sample_counts),
                'num_folds': len(dataset_f1_scores)
            })
        else:
            print(f"  {dataset} ZERO-SHOT: NO DATA FOUND!")
    
    # Process few-shot
    print(f"\nProcessing FEW-SHOT for {model}:")
    for shot in K_SHOTS:
        print(f"\n  {shot}-shot:")
        for dataset in DATASETS:
            dataset_f1_scores = []
            dataset_sample_counts = []
            
            for fold in FOLDS:
                df = load_preds(model, dataset, fold, shot=shot)
                if df is None:
                    continue
                
                # Clean predictions and gold labels
                gold = df["gold"].str.lower().str.strip()
                pred = df["pred"].str.lower().str.strip()
                
                # Calculate F1 score
                try:
                    f1 = f1_score(gold, pred, average='macro', zero_division=0)
                    dataset_f1_scores.append(f1)
                    dataset_sample_counts.append(len(df))
                    print(f"    {dataset} Fold {fold}: F1={f1:.4f}, samples={len(df)}")
                except Exception as e:
                    print(f"    ERROR calculating F1 for {model} {dataset} fold {fold} {shot}-shot: {e}")
                    continue
            
            if dataset_f1_scores:
                # Since all folds have same sample size, use simple arithmetic mean
                avg_f1 = np.mean(dataset_f1_scores)
                
                print(f"    {dataset}: Avg F1={avg_f1:.4f} (from {len(dataset_f1_scores)} folds)")
                
                all_results.append({
                    'model': model,
                    'shot_config': f'{shot}-shot',
                    'dataset': dataset,
                    'avg_f1': avg_f1,
                    'total_samples': sum(dataset_sample_counts),
                    'num_folds': len(dataset_f1_scores)
                })
            else:
                print(f"    {dataset}: NO DATA FOUND!")

# Convert to DataFrame
results_df = pd.DataFrame(all_results)

print(f"\nDEBUG: Results DataFrame shape: {results_df.shape}")
print(f"DEBUG: Available combinations:")
print(results_df[['model', 'shot_config', 'dataset', 'avg_f1']].head(20))

# ─── CREATE EXCEL-STYLE F1 TABLE ──────────────────
print(f"\nCreating Excel-style table...")
excel_style_data = []

for model in MODELS:
    for dataset in DATASETS:
        row = {
            'Model': model,
            'Dataset': dataset,
            'Prompt Template': 'PT 7- definition'
        }
        
        # Add F1 scores for each shot configuration
        for shot_config in ['0-shot'] + [f'{k}-shot' for k in K_SHOTS]:
            col_name = shot_config.replace('-shot', ' shot')
            if col_name == '0 shot':
                col_name = 'Zero shot'
            
            # Find data for this specific combination
            model_data = results_df[
                (results_df['model'] == model) & 
                (results_df['dataset'] == dataset) & 
                (results_df['shot_config'] == shot_config)
            ]
            
            if len(model_data) > 0:
                f1_value = model_data.iloc[0]['avg_f1']
                row[col_name] = round(f1_value, 3)
                print(f"DEBUG: {model} {dataset} {shot_config} = {f1_value:.3f}")
            else:
                row[col_name] = np.nan  # Use NaN instead of 0.000
                print(f"DEBUG: {model} {dataset} {shot_config} = NOT FOUND")
        
        excel_style_data.append(row)

excel_style_df = pd.DataFrame(excel_style_data)

# Reorder columns to match your image
column_order = ['Model', 'Dataset', 'Prompt Template', 'Zero shot', '1 shot', '3 shot', '5 shot', '7 shot']
excel_style_df = excel_style_df.reindex(columns=column_order)

# Replace NaN with empty string for better Excel display
excel_style_df_display = excel_style_df.fillna('')

# Define file paths
excel_style_csv = os.path.join(OUT_DIR, "f1_scores_excel_format.csv")
excel_style_excel = os.path.join(OUT_DIR, "f1_scores_excel_format.xlsx")

# Save Excel-style table
excel_style_df_display.to_csv(excel_style_csv, index=False)

# Create Excel file with formatting
with pd.ExcelWriter(excel_style_excel, engine='openpyxl') as writer:
    excel_style_df_display.to_excel(writer, sheet_name='F1 Scores', index=False)
    
    workbook = writer.book
    worksheet = writer.sheets['F1 Scores']
    
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    
    # Styles
    header_font = Font(bold=True, color="000000")
    header_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    model_font = Font(bold=True)
    model_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    
    # Format header row
    for cell in worksheet[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format data rows and merge model cells
    current_row = 2
    for model in MODELS:
        model_rows = excel_style_df_display[excel_style_df_display['Model'] == model]
        num_datasets = len(model_rows)
        
        if num_datasets > 0:
            # Merge model name cells
            if num_datasets > 1:
                merge_range = f"A{current_row}:A{current_row + num_datasets - 1}"
                worksheet.merge_cells(merge_range)
            
            # Format merged model cell
            model_cell = worksheet[f"A{current_row}"]
            model_cell.value = model
            model_cell.font = model_font
            model_cell.fill = model_fill
            model_cell.alignment = center_alignment
            model_cell.border = thin_border
            
            # Format all cells in this model's section
            for i in range(num_datasets):
                row_num = current_row + i
                for col in range(1, 9):  # Columns A-H
                    cell = worksheet.cell(row=row_num, column=col)
                    cell.alignment = center_alignment
                    cell.border = thin_border
                    
                    # Format F1 score cells to 3 decimal places
                    if col >= 4 and cell.value is not None and cell.value != '' and isinstance(cell.value, (int, float)):
                        cell.number_format = '0.000'
            
            current_row += num_datasets
    
    # Auto-adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 20)
        worksheet.column_dimensions[column_letter].width = adjusted_width

# ─── CREATE SUMMARY TABLES ──────────────────
# Calculate overall weighted averages
overall_results = []

for model in MODELS:
    for shot_config in ['0-shot'] + [f'{k}-shot' for k in K_SHOTS]:
        model_shot_data = results_df[
            (results_df['model'] == model) & 
            (results_df['shot_config'] == shot_config)
        ]
        
        if len(model_shot_data) > 0:
            weights = model_shot_data['total_samples'].values
            f1_scores = model_shot_data['avg_f1'].values
            
            if weights.sum() > 0:
                overall_weighted_f1 = np.average(f1_scores, weights=weights)
                
                overall_results.append({
                    'model': model,
                    'shot_config': shot_config,
                    'weighted_avg_f1': overall_weighted_f1,
                    'total_samples': weights.sum(),
                    'num_datasets': len(model_shot_data)
                })

overall_df = pd.DataFrame(overall_results)

# Create summary pivot table
if len(overall_df) > 0:
    summary_pivot = overall_df.pivot_table(
        index='model',
        columns='shot_config', 
        values='weighted_avg_f1',
        fill_value=np.nan
    ).round(4)
    
    # Reorder columns
    shot_order = ['0-shot', '1-shot', '3-shot', '5-shot', '7-shot']
    summary_pivot = summary_pivot.reindex(columns=[col for col in shot_order if col in summary_pivot.columns])
    
    summary_csv = os.path.join(OUT_DIR, "weighted_f1_summary.csv")
    summary_pivot.to_csv(summary_csv)
    
    print(f"\nFiles created:")
    print(f"Excel-style F1 table: {excel_style_csv}")
    print(f"Excel-style formatted: {excel_style_excel}")
    print(f"Summary F1 scores: {summary_csv}")
    
    print(f"\nF1 Scores by Model and Dataset:")
    print(excel_style_df_display.to_string(index=False))
    
    print(f"\nOverall Weighted F1 Summary:")
    print(summary_pivot.to_string())
    
    # Find best performing configurations
    print(f"\nTop performing configurations:")
    for model in MODELS:
        if model in summary_pivot.index:
            model_scores = summary_pivot.loc[model].dropna()
            if len(model_scores) > 0:
                best_shot = model_scores.idxmax()
                best_score = model_scores.max()
                print(f"{model}: {best_shot} (F1={best_score:.4f})")
            else:
                print(f"{model}: No data available")
else:
    print("No data found for summary statistics!")

print(f"\nDEBUG COMPLETE: Check the debug output above to see why zero-shot might be missing.")

Calculating F1 scores for all configurations...
Expected zero-shot path format: zero_shot_folds_MODEL_NAME/preds/DATASET_foldN_preds.csv
Expected few-shot path format: few_shot_folds_MODEL_NAME_DATASET/preds/DATASET_foldN_Nshot_preds.csv

Processing model: llama3_8b

Processing ZERO-SHOT for llama3_8b:

  Dataset: pan
ZERO-SHOT PATH: zero_shot_folds_llama3_8b/preds/pan_fold0_preds.csv
SUCCESS: Loaded 139 rows, columns: ['id', 'text', 'gold', 'pred']
    Fold 0: F1=0.7248, samples=139
ZERO-SHOT PATH: zero_shot_folds_llama3_8b/preds/pan_fold1_preds.csv
SUCCESS: Loaded 139 rows, columns: ['id', 'text', 'gold', 'pred']
    Fold 1: F1=0.6443, samples=139
ZERO-SHOT PATH: zero_shot_folds_llama3_8b/preds/pan_fold2_preds.csv
SUCCESS: Loaded 139 rows, columns: ['id', 'text', 'gold', 'pred']
    Fold 2: F1=0.6368, samples=139
ZERO-SHOT PATH: zero_shot_folds_llama3_8b/preds/pan_fold3_preds.csv
SUCCESS: Loaded 139 rows, columns: ['id', 'text', 'gold', 'pred']
    Fold 3: F1=0.6978, samples=139
ZERO

In [2]:
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np
# Add this at the top of your script
np.set_printoptions(suppress=True)

# ─── CONFIG ─────────────────────────────
ZS_PREFIX = "zero_shot_folds_"
FS_PREFIX = "few_shot_folds_"
DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
K_SHOTS = [1, 3, 5, 7]
FOLDS = range(10)
OUT_CSV = "mcnemar_results/mcnemar_combined_all_models.csv"
os.makedirs("mcnemar_results", exist_ok=True)

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model_name, dataset, fold, shot=None):
    if shot is None:
        path = os.path.join(f"{ZS_PREFIX}{model_name}", "preds", f"{dataset}_fold{fold}_preds.csv")
    else:
        path = os.path.join(f"{FS_PREFIX}{model_name}_{dataset}", "preds", f"{dataset}_fold{fold}_{shot}shot_preds.csv")
    
    if not os.path.exists(path):
        print(f"Missing file: {path}")
        return None
    
    df = pd.read_csv(path)
    return df[["text", "gold", "pred"]]

# ─── RUN MCNEMAR TEST ──────────────────
all_results = []

print("🔍 Running McNemar tests across all models...")
print("=" * 60)

for model in MODELS:
    print(f"\n📊 Processing model: {model}")
    
    # Generate all comparisons
    comparisons = [(None, k) for k in K_SHOTS]  # ZS vs FS
    comparisons += [(k1, k2) for i, k1 in enumerate(K_SHOTS) for k2 in K_SHOTS[i+1:]]  # FS vs FS
    
    for s1, s2 in comparisons:
        p1_all, p2_all = [], []
        dataset_samples = {"pan": 0, "maalej": 0, "scalabrino": 0}
        
        for dataset in DATASETS:
            dataset_count = 0
            for fold in FOLDS:
                df1 = load_preds(model, dataset, fold, s1)
                df2 = load_preds(model, dataset, fold, s2)
                
                if df1 is None or df2 is None:
                    continue
                
                # Use inner join to handle mismatched samples
                merged = pd.merge(df1, df2, on="text", suffixes=("_1", "_2"), how="inner")
                
                # Check for significant data loss
                original_size = min(len(df1), len(df2))
                if len(merged) < original_size * 0.8:
                    print(f"⚠️  Warning: {dataset} fold {fold} lost {original_size - len(merged)} samples")
                
                gold = merged["gold_1"].str.lower().str.strip()
                pred1 = merged["pred_1"].str.lower().str.strip()
                pred2 = merged["pred_2"].str.lower().str.strip()
                
                for y, p1, p2 in zip(gold, pred1, pred2):
                    p1_all.append(p1 == y)
                    p2_all.append(p2 == y)
                
                dataset_count += len(merged)
            
            dataset_samples[dataset] = dataset_count
        
        if len(p1_all) == 0:
            print(f"⚠️  Skipping {s1} vs {s2}: no data available")
            continue
        
        # McNemar contingency table
        both_correct = sum(p1 and p2 for p1, p2 in zip(p1_all, p2_all))
        only_s1 = sum(p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        only_s2 = sum(p2 and not p1 for p1, p2 in zip(p1_all, p2_all))
        both_wrong = sum(not p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        
        # Verify contingency table
        total_check = both_correct + only_s1 + only_s2 + both_wrong
        assert total_check == len(p1_all), f"Contingency table error: {total_check} != {len(p1_all)}"
        
        table = [[both_correct, only_s1], [only_s2, both_wrong]]
        
        # Check for sufficient discordant pairs
        discordant_pairs = only_s1 + only_s2
        if discordant_pairs < 25:
            print(f"⚠️  Warning: Only {discordant_pairs} discordant pairs for {s1} vs {s2}")
        
        # Calculate accuracies
        acc1 = sum(p1_all) / len(p1_all)
        acc2 = sum(p2_all) / len(p2_all)
        acc_diff = acc2 - acc1  # positive means s2 is better
        
        # Run McNemar test
        result = mcnemar(table, exact=False, correction=True)
        
        # Create readable labels
        label1 = "0-shot" if s1 is None else f"{s1}-shot"
        label2 = f"{s2}-shot"
        
        # Determine statistical significance and practical direction
        is_significant = result.pvalue < 0.05
        
        if is_significant:
            if only_s1 > only_s2:
                significance = f"{label1} significantly better"
            else:
                significance = f"{label2} significantly better"
        else:
            significance = "No significant difference"
        
        # Store results
        all_results.append({
            "model": model,
            "comparison": f"{label1} vs {label2}",
            "method1_accuracy": round(acc1, 4),
            "method2_accuracy": round(acc2, 4),
            "accuracy_difference": round(acc_diff, 4),
            "total_samples": len(p1_all),
            "pan_samples": dataset_samples["pan"],
            "maalej_samples": dataset_samples["maalej"],
            "scalabrino_samples": dataset_samples["scalabrino"],
            f"{label1.replace('-', '_')}_better": only_s1,
            f"{label2.replace('-', '_')}_better": only_s2,
            "discordant_pairs": discordant_pairs,
            "chi2_stat": round(result.statistic, 4),
            "p_value": round(result.pvalue, 6),
            "significant_at_05": is_significant,
            "interpretation": significance
        })
        
        # Print progress
        sig_marker = "✅" if is_significant else "❌"
        print(f"  {sig_marker} {label1} vs {label2}: "
              f"acc_diff={acc_diff:+.3f}, χ²={result.statistic:.3f}, "
              f"p={result.pvalue:.3f} ({significance})")

# ─── SAVE RESULTS ──────────────────────
results_df = pd.DataFrame(all_results)
results_df.to_csv(OUT_CSV, index=False)

print("\n" + "=" * 60)
print(f"✅ Results saved to: {OUT_CSV}")

# ─── SUMMARY ANALYSIS ──────────────────
print(f"\n📈 SUMMARY ANALYSIS:")
print(f"   Total comparisons: {len(results_df)}")
print(f"   Significant differences (p < 0.05): {sum(results_df['significant_at_05'])}")
print(f"   Average sample size: {results_df['total_samples'].mean():.0f}")

# Show significant results by model
print(f"\n🎯 SIGNIFICANT FINDINGS BY MODEL:")
for model in MODELS:
    model_results = results_df[
        (results_df['model'] == model) & 
        (results_df['significant_at_05'] == True)
    ]
    
    if len(model_results) > 0:
        print(f"\n   {model}:")
        for _, row in model_results.iterrows():
            print(f"     • {row['comparison']}: {row['interpretation']} "
                  f"(Δacc={row['accuracy_difference']:+.3f}, p={row['p_value']:.4g})")
    else:
        print(f"\n   {model}: No significant differences found")

# Find best shot configuration per model
print(f"\n🏆 BEST SHOT CONFIGURATION PER MODEL:")
for model in MODELS:
    model_data = results_df[results_df['model'] == model]
    
    # Get all zero-shot vs few-shot comparisons for this model
    zs_comparisons = model_data[model_data['comparison'].str.contains('0-shot vs')]
    
    if len(zs_comparisons) > 0:
        # Find which few-shot configs significantly beat zero-shot
        significant_wins = zs_comparisons[
            (zs_comparisons['significant_at_05'] == True) &
            (zs_comparisons['accuracy_difference'] > 0)
        ]
        
        if len(significant_wins) > 0:
            best = significant_wins.loc[significant_wins['accuracy_difference'].idxmax()]
            print(f"   {model}: {best['comparison'].split(' vs ')[1]} "
                  f"(+{best['accuracy_difference']:.3f} vs 0-shot)")
        else:
            print(f"   {model}: No few-shot config significantly beats 0-shot")
    else:
        print(f"   {model}: No 0-shot comparisons available")

print(f"\n" + "=" * 60)


# ─── CUSTOM FILTER: 5-shot vs 0-shot and 5-shot vs 1-shot ─────────────
print(f"\n🔍 Focused Comparison: 5-shot vs 0-shot and 5-shot vs 1-shot")
focus_pairs = ["0-shot vs 5-shot", "1-shot vs 5-shot"]

for model in MODELS:
    print(f"\n🧠 Model: {model}")
    for pair in focus_pairs:
        row = results_df[
            (results_df['model'] == model) & 
            (results_df['comparison'] == pair)
        ]
        if not row.empty:
            r = row.iloc[0]
            sig = "✅ Significant" if r["significant_at_05"] else "❌ Not significant"
            print(f"  • {pair}: {r['interpretation']} "
                  f"(Δacc={r['accuracy_difference']:+.3f}, p={r['p_value']:.4g}, {sig})")
        else:
            print(f"  • {pair}: ❌ No data found")
focus_df = results_df[results_df['comparison'].isin(focus_pairs)].copy()
focus_df.to_csv("mcnemar_results/focus_BestVsBase_BestVsWorst_shot_comparisons.csv", index=False)
print("📁 Saved: mcnemar_results/focus_BestVsBase_BestVsWorst_shot_comparisons.csv")


🔍 Running McNemar tests across all models...

📊 Processing model: llama3_8b
  ✅ 0-shot vs 1-shot: acc_diff=-0.035, χ²=63.856, p=0.000 (0-shot significantly better)
  ❌ 0-shot vs 3-shot: acc_diff=+0.004, χ²=0.736, p=0.391 (No significant difference)
  ✅ 0-shot vs 5-shot: acc_diff=+0.038, χ²=67.624, p=0.000 (5-shot significantly better)
  ✅ 0-shot vs 7-shot: acc_diff=+0.058, χ²=144.420, p=0.000 (7-shot significantly better)
  ✅ 1-shot vs 3-shot: acc_diff=+0.038, χ²=124.368, p=0.000 (3-shot significantly better)
  ✅ 1-shot vs 5-shot: acc_diff=+0.073, χ²=340.458, p=0.000 (5-shot significantly better)
  ✅ 1-shot vs 7-shot: acc_diff=+0.093, χ²=478.284, p=0.000 (7-shot significantly better)
  ✅ 3-shot vs 5-shot: acc_diff=+0.035, χ²=122.209, p=0.000 (5-shot significantly better)
  ✅ 3-shot vs 7-shot: acc_diff=+0.055, χ²=237.064, p=0.000 (7-shot significantly better)
  ✅ 5-shot vs 7-shot: acc_diff=+0.020, χ²=40.502, p=0.000 (7-shot significantly better)

📊 Processing model: llama3_70b
  ✅ 0-sho

In [4]:
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

# ─── CONFIG ─────────────────────────────
ZS_PREFIX = "zero_shot_folds_"
FS_PREFIX = "few_shot_folds_"
DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
K_SHOTS = [1, 3, 5, 7]
FOLDS = range(10)
OUT_CSV = "mcnemar_results/mcnemar_matrix_pivot.csv"
OUT_EXCEL = "mcnemar_results/mcnemar_matrix_pivot.xlsx"
os.makedirs("mcnemar_results", exist_ok=True)

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model_name, dataset, fold, shot=None):
    if shot is None:
        path = os.path.join(f"{ZS_PREFIX}{model_name}", "preds", f"{dataset}_fold{fold}_preds.csv")
    else:
        path = os.path.join(f"{FS_PREFIX}{model_name}_{dataset}", "preds", f"{dataset}_fold{fold}_{shot}shot_preds.csv")
    
    if not os.path.exists(path):
        print(f"Missing file: {path}")
        return None
    
    df = pd.read_csv(path)
    return df[["text", "gold", "pred"]]

# ─── RUN MCNEMAR TEST ──────────────────
all_results = []

print("Running McNemar tests for matrix pivot...")

for model in MODELS:
    print(f"Processing model: {model}")
    
    # Generate all comparisons
    comparisons = [(None, k) for k in K_SHOTS]  # ZS vs FS
    comparisons += [(k1, k2) for i, k1 in enumerate(K_SHOTS) for k2 in K_SHOTS[i+1:]]  # FS vs FS
    
    for s1, s2 in comparisons:
        p1_all, p2_all = [], []
        
        for dataset in DATASETS:
            for fold in FOLDS:
                df1 = load_preds(model, dataset, fold, s1)
                df2 = load_preds(model, dataset, fold, s2)
                
                if df1 is None or df2 is None:
                    continue
                
                # Use inner join to handle mismatched samples
                merged = pd.merge(df1, df2, on="text", suffixes=("_1", "_2"), how="inner")
                
                gold = merged["gold_1"].str.lower().str.strip()
                pred1 = merged["pred_1"].str.lower().str.strip()
                pred2 = merged["pred_2"].str.lower().str.strip()
                
                for y, p1, p2 in zip(gold, pred1, pred2):
                    p1_all.append(p1 == y)
                    p2_all.append(p2 == y)
        
        if len(p1_all) == 0:
            continue
        
        # McNemar contingency table
        both_correct = sum(p1 and p2 for p1, p2 in zip(p1_all, p2_all))
        only_s1 = sum(p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        only_s2 = sum(p2 and not p1 for p1, p2 in zip(p1_all, p2_all))
        both_wrong = sum(not p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        
        table = [[both_correct, only_s1], [only_s2, both_wrong]]
        
        # Run McNemar test
        result = mcnemar(table, exact=False, correction=True)
        
        # Create readable labels
        method1 = "0-shot" if s1 is None else f"{s1}-shot"
        method2 = f"{s2}-shot"
        
        all_results.append({
            "Model": model,
            "Method_1": method1,
            "Method_2": method2,
            "Chi_Square": result.statistic,
            "P_Value": result.pvalue
        })

# ─── CREATE MATRIX TABLE ──────────────────
matrix_rows = []

for model in MODELS:
    model_data = [r for r in all_results if r['Model'] == model]
    
    # Create rows for each shot type
    shot_types = ["0 shot", "1 shot", "3 shot", "5 shot"]
    
    for i, shot_row in enumerate(shot_types):
        row_data = []
        
        # Add model name only in first row
        if i == 0:
            row_data.append(model)
        else:
            row_data.append("")
        
        # Add shot type
        row_data.append(shot_row)
        
        # Add comparison values for columns: 1 shot, 3 shot, 5 shot, 7 shot
        column_shots = ["1", "3", "5", "7"]
        
        for col_shot in column_shots:
            # Determine method names for lookup
            if shot_row == "0 shot":
                method1 = "0-shot"
            else:
                method1 = f"{shot_row.replace(' shot', '')}-shot"
            
            method2 = f"{col_shot}-shot"
            
            # Check if this is a valid comparison (upper triangular)
            if shot_row == "0 shot":
                # 0-shot vs all others is valid
                pass
            else:
                shot_num = int(shot_row.replace(" shot", ""))
                col_num = int(col_shot)
                if shot_num >= col_num:
                    row_data.append("")
                    continue
            
            # Find the comparison
            comparison = next((r for r in model_data 
                             if r['Method_1'] == method1 and r['Method_2'] == method2), None)
            
            if comparison:
                chi_val = comparison['Chi_Square']
                p_val = comparison['P_Value']
                value = f"({chi_val:.3f}/{p_val:.3f})"
                row_data.append(value)
            else:
                row_data.append("")
        
        matrix_rows.append(row_data)

# Create DataFrame
columns = ["Model", "Shot Type", "1 shot", "3 shot", "5 shot", "7 shot"]
matrix_df = pd.DataFrame(matrix_rows, columns=columns)

# Save CSV
matrix_df.to_csv(OUT_CSV, index=False)

# Save as Excel with formatting
with pd.ExcelWriter(OUT_EXCEL, engine='openpyxl') as writer:
    matrix_df.to_excel(writer, sheet_name='McNemar Matrix', index=False)
    
    # Get workbook and worksheet
    workbook = writer.book
    worksheet = writer.sheets['McNemar Matrix']
    
    # Import styling
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    
    # Define styles
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    model_font = Font(bold=True)
    model_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    
    # Format header row
    for cell in worksheet[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format data and merge model cells
    current_row = 2
    for model in MODELS:
        # Merge model name cells (4 rows)
        merge_range = f"A{current_row}:A{current_row + 3}"
        worksheet.merge_cells(merge_range)
        
        # Format merged model cell
        model_cell = worksheet[f"A{current_row}"]
        model_cell.value = model
        model_cell.font = model_font
        model_cell.fill = model_fill
        model_cell.alignment = center_alignment
        model_cell.border = thin_border
        
        # Format the 4 rows for this model
        for i in range(4):
            row_num = current_row + i
            for col in range(1, 7):  # Columns A-F
                cell = worksheet.cell(row=row_num, column=col)
                cell.alignment = center_alignment
                cell.border = thin_border
        
        current_row += 4
    
    # Auto-adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 25)
        worksheet.column_dimensions[column_letter].width = adjusted_width

print(f"\nFiles saved:")
print(f"CSV: {OUT_CSV}")
print(f"Excel: {OUT_EXCEL}")
print(f"Format: Matrix showing (chi-square/p-value) for each comparison")
# ─── MATRIX PIVOT TABLE: Focused on 5-shot comparisons ───────────────
# ─── FOCUSED PIVOT: Only 0→5 and 1→5 ───────────────────────────────
print("\n📊 Creating focused McNemar pivot (0-shot vs 5-shot, 1-shot vs 5-shot)...")

focused_rows = []

for model in MODELS:
    model_data = [r for r in all_results if r["Model"] == model]

    for shot_row in ["0 shot", "1 shot"]:
        row_data = []

        # Add model name only once
        if shot_row == "0 shot":
            row_data.append(model)
        else:
            row_data.append("")

        # Add shot type
        row_data.append(shot_row)

        # Only one column = 5 shot
        method1 = "0-shot" if shot_row == "0 shot" else "1-shot"
        method2 = "5-shot"

        comparison = next((r for r in model_data 
                           if r["Method_1"] == method1 and r["Method_2"] == method2), None)

        if comparison:
            chi_val = comparison["Chi_Square"]
            p_val = comparison["P_Value"]
            value = f"({chi_val:.3f}/{p_val:.3f})"
        else:
            value = ""

        row_data.append(value)

        focused_rows.append(row_data)

# Create DataFrame
focused_df = pd.DataFrame(focused_rows, columns=["Model", "Shot Type", "5 shot"])

# Save
focused_csv = "mcnemar_results/mcnemar_focused_0_1_vs_5_pivot.csv"
focused_xlsx = "mcnemar_results/mcnemar_focused_0_1_vs_5_pivot.xlsx"

focused_df.to_csv(focused_csv, index=False)

with pd.ExcelWriter(focused_xlsx, engine="openpyxl") as writer:
    focused_df.to_excel(writer, sheet_name="Focused McNemar", index=False)

print(f"✅ Focused pivot saved:\n - {focused_csv}\n - {focused_xlsx}")


Running McNemar tests for matrix pivot...
Processing model: llama3_8b
Processing model: llama3_70b
Processing model: mistral_7b
Processing model: gemma3_4b
Processing model: wizardlm2_7b

Files saved:
CSV: mcnemar_results/mcnemar_matrix_pivot.csv
Excel: mcnemar_results/mcnemar_matrix_pivot.xlsx
Format: Matrix showing (chi-square/p-value) for each comparison

📊 Creating focused McNemar pivot (0-shot vs 5-shot, 1-shot vs 5-shot)...
✅ Focused pivot saved:
 - mcnemar_results/mcnemar_focused_0_1_vs_5_pivot.csv
 - mcnemar_results/mcnemar_focused_0_1_vs_5_pivot.xlsx


In [6]:
#finding winner
import os
import pandas as pd

# ─── CONFIG ─────────────────────────────
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
os.makedirs("mcnemar_results", exist_ok=True)

# Read the existing combined results
combined_df = pd.read_csv("mcnemar_results/mcnemar_combined_all_models.csv")

print("Creating winner analysis tables...")

# Convert boolean column to proper boolean if it's stored as string
combined_df['significant_at_05'] = combined_df['significant_at_05'].astype(str).str.lower().map({'true': True, 'false': False})

# ─── CREATE WINNERS PIVOT TABLE ──────────────────
def get_winner(row):
    """Determine winner from a comparison row"""
    if not row['significant_at_05']:
        return "No diff"
    
    interpretation = str(row['interpretation']).strip()
    
    # Simple mapping
    winner_map = {
        "0-shot significantly better": "0 shot",
        "1-shot significantly better": "1 shot", 
        "3-shot significantly better": "3 shot",
        "5-shot significantly better": "5 shot",
        "7-shot significantly better": "7 shot"
    }
    
    return winner_map.get(interpretation, f"Unknown: {interpretation}")

# Create lookup dictionary for fast access
comparison_lookup = {}
for _, row in combined_df.iterrows():
    key = (row['model'], row['comparison'])
    comparison_lookup[key] = get_winner(row)

# Build matrix rows
matrix_rows = []
for model in MODELS:
    shot_types = ["0 shot", "1 shot", "3 shot", "5 shot"]
    
    for i, shot_row in enumerate(shot_types):
        row_data = []
        
        # Model name (only in first row)
        row_data.append(model if i == 0 else "")
        
        # Shot type
        row_data.append(shot_row)
        
        # Comparisons: vs 1 shot, vs 3 shot, vs 5 shot, vs 7 shot
        for col_shot in ["1", "3", "5", "7"]:
            # Build comparison string
            if shot_row == "0 shot":
                comparison_str = f"0-shot vs {col_shot}-shot"
            else:
                shot_num = shot_row.replace(" shot", "")
                
                # Skip invalid comparisons (lower triangle)
                if int(shot_num) >= int(col_shot):
                    row_data.append("")
                    continue
                    
                comparison_str = f"{shot_num}-shot vs {col_shot}-shot"
            
            # Look up result
            key = (model, comparison_str)
            winner = comparison_lookup.get(key, "Not found")
            row_data.append(winner)
        
        matrix_rows.append(row_data)

# Create DataFrame
columns = ["Model", "Shot Type", "vs 1 shot", "vs 3 shot", "vs 5 shot", "vs 7 shot"]
winners_df = pd.DataFrame(matrix_rows, columns=columns)

# ─── CREATE WIN COUNTS SUMMARY ──────────────────
summary_data = []

for model in MODELS:
    model_data = combined_df[combined_df['model'] == model]
    
    row = {"Model": model}
    
    # Count wins for each shot configuration
    for shot_config in ["0 shot", "1 shot", "3 shot", "5 shot", "7 shot"]:
        target_interpretation = f"{shot_config.replace(' ', '-')} significantly better"
        
        wins = len(model_data[
            (model_data['significant_at_05'] == True) & 
            (model_data['interpretation'] == target_interpretation)
        ])
        
        row[shot_config] = wins
    
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)

# ─── SAVE FILES ──────────────────
# Save CSV files
winners_csv = "mcnemar_results/mcnemar_winners_pivot.csv"
summary_csv = "mcnemar_results/shot_wins_summary.csv"

winners_df.to_csv(winners_csv, index=False)
summary_df.to_csv(summary_csv, index=False)

# Save Excel files
winners_excel = "mcnemar_results/mcnemar_winners_pivot.xlsx"
summary_excel = "mcnemar_results/shot_wins_summary.xlsx"

with pd.ExcelWriter(winners_excel, engine='openpyxl') as writer:
    winners_df.to_excel(writer, sheet_name='Winners Matrix', index=False)
    
    workbook = writer.book
    worksheet = writer.sheets['Winners Matrix']
    
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    
    # Styles
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    model_font = Font(bold=True)
    model_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    
    # Format header
    for cell in worksheet[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format and merge model cells
    current_row = 2
    for model in MODELS:
        # Merge model cells (4 rows)
        merge_range = f"A{current_row}:A{current_row + 3}"
        worksheet.merge_cells(merge_range)
        
        model_cell = worksheet[f"A{current_row}"]
        model_cell.value = model
        model_cell.font = model_font
        model_cell.fill = model_fill
        model_cell.alignment = center_alignment
        model_cell.border = thin_border
        
        # Format all cells in this model's section
        for i in range(4):
            row_num = current_row + i
            for col in range(1, 7):
                cell = worksheet.cell(row=row_num, column=col)
                cell.alignment = center_alignment
                cell.border = thin_border
        
        current_row += 4
    
    # Auto-adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 20)
        worksheet.column_dimensions[column_letter].width = adjusted_width

# Summary Excel
with pd.ExcelWriter(summary_excel, engine='openpyxl') as writer:
    summary_df.to_excel(writer, sheet_name='Win Counts', index=False)
    
    workbook = writer.book
    worksheet = writer.sheets['Win Counts']
    
    # Format header
    for cell in worksheet[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format data
    for row in worksheet.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = center_alignment
            cell.border = thin_border
    
    # Auto-adjust columns
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 15)
        worksheet.column_dimensions[column_letter].width = adjusted_width

print(f"\nFiles created:")
print(f"Winners Pivot: {winners_csv}")
print(f"Winners Excel: {winners_excel}")
print(f"Summary CSV: {summary_csv}")
print(f"Summary Excel: {summary_excel}")

print(f"\nWin Counts Summary:")
print(summary_df.to_string(index=False))

print(f"\nFirst few rows of Winners Pivot:")
print(winners_df.head(8).to_string(index=False))

# Debug: Show any "Unknown" entries
unknown_entries = winners_df[winners_df.apply(lambda row: any("Unknown:" in str(cell) for cell in row), axis=1)]
if len(unknown_entries) > 0:
    print(f"\nDEBUG - Unknown interpretations found:")
    print(unknown_entries)
else:
    print(f"\nAll interpretations matched successfully!")

# ─── FOCUSED WINNER MATRIX: 0 vs 5 and 1 vs 5 ─────────────────────
# ─── FOCUSED WINNER PIVOT: 0 vs 5 and 1 vs 5 ─────────────────────────────
print("\n🏆 Creating Focused Winner Pivot (0 vs 5, 1 vs 5)...")

focused_matrix_rows = []

for model in MODELS:
    shot_types = ["0 shot", "1 shot"]  # only rows we care about
    
    for i, shot_row in enumerate(shot_types):
        row_data = []

        # Add model name only for first row
        if i == 0:
            row_data.append(model)
        else:
            row_data.append("")

        # Shot type
        row_data.append(shot_row)

        # Only column = 5 shot
        if shot_row == "0 shot":
            comparison_str = "0-shot vs 5-shot"
        else:
            comparison_str = "1-shot vs 5-shot"

        key = (model, comparison_str)
        winner = comparison_lookup.get(key, "Not found")

        row_data.append(winner)
        focused_matrix_rows.append(row_data)

# Build DataFrame
focused_winners_df = pd.DataFrame(focused_matrix_rows,
                                  columns=["Model", "Shot Type", "5 shot"])

# Save to CSV/Excel
focused_winners_csv = "mcnemar_results/mcnemar_focused_winners_pivot.csv"
focused_winners_xlsx = "mcnemar_results/mcnemar_focused_winners_pivot.xlsx"

focused_winners_df.to_csv(focused_winners_csv, index=False)

with pd.ExcelWriter(focused_winners_xlsx, engine="openpyxl") as writer:
    focused_winners_df.to_excel(writer, sheet_name="Focused Winners Pivot", index=False)

print(f"✅ Focused winner pivot saved:\n - {focused_winners_csv}\n - {focused_winners_xlsx}")

print("\nFocused Winner Pivot Preview:")
print(focused_winners_df.head().to_string(index=False))


Creating winner analysis tables...

Files created:
Winners Pivot: mcnemar_results/mcnemar_winners_pivot.csv
Winners Excel: mcnemar_results/mcnemar_winners_pivot.xlsx
Summary CSV: mcnemar_results/shot_wins_summary.csv
Summary Excel: mcnemar_results/shot_wins_summary.xlsx

Win Counts Summary:
       Model  0 shot  1 shot  3 shot  5 shot  7 shot
   llama3_8b       1       0       1       3       4
  llama3_70b       4       2       2       1       0
  mistral_7b       1       0       2       3       4
   gemma3_4b       4       0       1       3       2
wizardlm2_7b       1       0       1       3       3

First few rows of Winners Pivot:
     Model Shot Type vs 1 shot vs 3 shot vs 5 shot vs 7 shot
 llama3_8b    0 shot    0 shot   No diff    5 shot    7 shot
              1 shot              3 shot    5 shot    7 shot
              3 shot                        5 shot    7 shot
              5 shot                                  7 shot
llama3_70b    0 shot    0 shot    0 shot    0 shot 

In [1]:
#friedman
import os
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare
from scipy import stats
from itertools import combinations
import scikit_posthocs as sp

# ─── CONFIG ─────────────────────────────
ZS_PREFIX = "zero_shot_folds_"
FS_PREFIX = "few_shot_folds_"
DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
K_SHOTS = [1, 3, 5, 7]
FOLDS = range(10)
OUT_DIR = "friedman_analysis"
os.makedirs(OUT_DIR, exist_ok=True)

from sklearn.metrics import f1_score

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model_name, dataset, fold, shot=None):
    if shot is None:
        path = os.path.join(f"{ZS_PREFIX}{model_name}", "preds", f"{dataset}_fold{fold}_preds.csv")
    else:
        path = os.path.join(f"{FS_PREFIX}{model_name}_{dataset}", "preds", f"{dataset}_fold{fold}_{shot}shot_preds.csv")
    
    if not os.path.exists(path):
        return None
    
    try:
        df = pd.read_csv(path)
        if 'gold' not in df.columns or 'pred' not in df.columns:
            return None
        return df[["text", "gold", "pred"]]
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None

# ─── COLLECT ALL F1 SCORES BY FOLD ──────────────────
print("Collecting F1 scores for Friedman test...")

# Structure: [model][dataset][fold] = {shot_config: f1_score}
fold_scores = {}

for model in MODELS:
    print(f"Processing model: {model}")
    fold_scores[model] = {}
    
    for dataset in DATASETS:
        fold_scores[model][dataset] = {}
        
        for fold in FOLDS:
            fold_scores[model][dataset][fold] = {}
            
            # Zero-shot
            df = load_preds(model, dataset, fold, shot=None)
            if df is not None:
                gold = df["gold"].str.lower().str.strip()
                pred = df["pred"].str.lower().str.strip()
                try:
                    f1 = f1_score(gold, pred, average='macro', zero_division=0)
                    fold_scores[model][dataset][fold]['0-shot'] = f1
                except:
                    pass
            
            # Few-shot
            for shot in K_SHOTS:
                df = load_preds(model, dataset, fold, shot=shot)
                if df is not None:
                    gold = df["gold"].str.lower().str.strip()
                    pred = df["pred"].str.lower().str.strip()
                    try:
                        f1 = f1_score(gold, pred, average='macro', zero_division=0)
                        fold_scores[model][dataset][fold][f'{shot}-shot'] = f1
                    except:
                        pass

# ─── FRIEDMAN TEST ANALYSIS ──────────────────
shot_configs = ['0-shot', '1-shot', '3-shot', '5-shot', '7-shot']
friedman_results = []

print("\nRunning Friedman tests...")

for model in MODELS:
    for dataset in DATASETS:
        print(f"\nAnalyzing {model} - {dataset}:")
        
        # Collect F1 scores for each shot configuration across folds
        config_scores = {config: [] for config in shot_configs}
        
        for fold in FOLDS:
            for config in shot_configs:
                if (fold in fold_scores[model][dataset] and 
                    config in fold_scores[model][dataset][fold]):
                    score = fold_scores[model][dataset][fold][config]
                    config_scores[config].append(score)
        
        # Only proceed if we have complete data for all configurations
        min_folds = min(len(scores) for scores in config_scores.values())
        if min_folds < 3:  # Need at least 3 folds for meaningful test
            print(f"  Insufficient data (only {min_folds} complete folds)")
            continue
        
        # Truncate all to same length (in case some configurations have more folds)
        for config in shot_configs:
            config_scores[config] = config_scores[config][:min_folds]
        
        # Run Friedman test
        score_arrays = [config_scores[config] for config in shot_configs]
        
        try:
            statistic, p_value = friedmanchisquare(*score_arrays)
            
            # Calculate mean ranks (for interpretation)
            df_ranks = pd.DataFrame(config_scores)
            ranks = df_ranks.rank(axis=1, method='average')
            mean_ranks = ranks.mean()
            
            # Determine best configuration
            best_config = mean_ranks.idxmax()
            best_mean_f1 = np.mean(config_scores[best_config])
            
            friedman_results.append({
                'Model': model,
                'Dataset': dataset,
                'Friedman_Statistic': statistic,
                'P_Value': p_value,
                'Significant': p_value < 0.05,
                'Num_Folds': min_folds,
                'Best_Config': best_config,
                'Best_Mean_F1': best_mean_f1,
                '0_shot_rank': mean_ranks['0-shot'],
                '1_shot_rank': mean_ranks['1-shot'],
                '3_shot_rank': mean_ranks['3-shot'],
                '5_shot_rank': mean_ranks['5-shot'],
                '7_shot_rank': mean_ranks['7-shot'],
                '0_shot_mean_f1': np.mean(config_scores['0-shot']),
                '1_shot_mean_f1': np.mean(config_scores['1-shot']),
                '3_shot_mean_f1': np.mean(config_scores['3-shot']),
                '5_shot_mean_f1': np.mean(config_scores['5-shot']),
                '7_shot_mean_f1': np.mean(config_scores['7-shot'])
            })
            
            print(f"  Friedman χ² = {statistic:.4f}, p = {p_value:.6f}")
            print(f"  Significant: {'Yes' if p_value < 0.05 else 'No'}")
            print(f"  Best config: {best_config} (rank = {mean_ranks[best_config]:.2f})")
            
        except Exception as e:
            print(f"  Error in Friedman test: {e}")

# ─── POST-HOC ANALYSIS (NEMENYI TEST) ──────────────────
print("\nRunning post-hoc Nemenyi tests for significant results...")

posthoc_results = []

for result in friedman_results:
    if result['Significant']:
        model = result['Model']
        dataset = result['Dataset']
        
        print(f"\nPost-hoc analysis for {model} - {dataset}:")
        
        # Reconstruct data for post-hoc test
        config_scores = {config: [] for config in shot_configs}
        
        for fold in FOLDS:
            if fold in fold_scores[model][dataset]:
                for config in shot_configs:
                    if config in fold_scores[model][dataset][fold]:
                        score = fold_scores[model][dataset][fold][config]
                        config_scores[config].append(score)
        
        # Create matrix for post-hoc test
        min_folds = min(len(scores) for scores in config_scores.values())
        data_matrix = []
        for i in range(min_folds):
            row = [config_scores[config][i] for config in shot_configs]
            data_matrix.append(row)
        
        df_posthoc = pd.DataFrame(data_matrix, columns=shot_configs)
        
        try:
            # Nemenyi test
            nemenyi_result = sp.posthoc_nemenyi_friedman(df_posthoc)
            
            # Store pairwise comparisons
            for i, config1 in enumerate(shot_configs):
                for j, config2 in enumerate(shot_configs):
                    if i < j:  # Only upper triangle
                        p_val = nemenyi_result.iloc[i, j]
                        posthoc_results.append({
                            'Model': model,
                            'Dataset': dataset,
                            'Config1': config1,
                            'Config2': config2,
                            'P_Value': p_val,
                            'Significant': p_val < 0.05,
                            'Config1_Rank': result[f'{config1.replace("-", "_")}_rank'],
                            'Config2_Rank': result[f'{config2.replace("-", "_")}_rank'],
                            'Config1_F1': result[f'{config1.replace("-", "_")}_mean_f1'],
                            'Config2_F1': result[f'{config2.replace("-", "_")}_mean_f1']
                        })
            
            print(f"  Nemenyi post-hoc test completed")
            print(f"  Significant pairwise differences:")
            significant_pairs = [(i, j) for i, config1 in enumerate(shot_configs) 
                               for j, config2 in enumerate(shot_configs) 
                               if i < j and nemenyi_result.iloc[i, j] < 0.05]
            
            for i, j in significant_pairs:
                config1, config2 = shot_configs[i], shot_configs[j]
                p_val = nemenyi_result.iloc[i, j]
                print(f"    {config1} vs {config2}: p = {p_val:.4f}")
            
            if not significant_pairs:
                print(f"    No significant pairwise differences found")
                
        except Exception as e:
            print(f"  Error in post-hoc test: {e}")

# ─── CREATE SUMMARY TABLES ──────────────────
friedman_df = pd.DataFrame(friedman_results)
posthoc_df = pd.DataFrame(posthoc_results)

# Save main results
friedman_csv = os.path.join(OUT_DIR, "friedman_test_results.csv")
friedman_df.to_csv(friedman_csv, index=False)

if len(posthoc_df) > 0:
    posthoc_csv = os.path.join(OUT_DIR, "nemenyi_posthoc_results.csv")
    posthoc_df.to_csv(posthoc_csv, index=False)

# ─── CREATE RANKS SUMMARY TABLE ──────────────────
ranks_data = []
for _, row in friedman_df.iterrows():
    rank_row = {
        'Model': row['Model'],
        'Dataset': row['Dataset'],
        'Significant': 'Yes' if row['Significant'] else 'No',
        '0-shot': f"{row['0_shot_rank']:.2f}",
        '1-shot': f"{row['1_shot_rank']:.2f}",
        '3-shot': f"{row['3_shot_rank']:.2f}",
        '5-shot': f"{row['5_shot_rank']:.2f}",
        '7-shot': f"{row['7_shot_rank']:.2f}",
        'Best_Config': row['Best_Config']
    }
    ranks_data.append(rank_row)

ranks_df = pd.DataFrame(ranks_data)
ranks_csv = os.path.join(OUT_DIR, "friedman_ranks_summary.csv")
ranks_df.to_csv(ranks_csv, index=False)

# ─── CREATE EXCEL FILES WITH FORMATTING ──────────────────
excel_file = os.path.join(OUT_DIR, "friedman_analysis_complete.xlsx")

with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
    # Main Friedman results
    friedman_df.to_excel(writer, sheet_name='Friedman Test', index=False)
    
    # Ranks summary
    ranks_df.to_excel(writer, sheet_name='Ranks Summary', index=False)
    
    # Post-hoc results (if any)
    if len(posthoc_df) > 0:
        posthoc_df.to_excel(writer, sheet_name='Nemenyi Post-hoc', index=False)
    
    # Format sheets
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    
    for sheet_name in writer.sheets:
        worksheet = writer.sheets[sheet_name]
        
        # Format headers
        for cell in worksheet[1]:
            cell.font = header_font
            cell.fill = header_fill
            cell.alignment = center_alignment
            cell.border = thin_border
        
        # Format data cells
        for row in worksheet.iter_rows(min_row=2):
            for cell in row:
                cell.alignment = center_alignment
                cell.border = thin_border
        
        # Auto-adjust column widths
        for column in worksheet.columns:
            max_length = 0
            column_letter = column[0].column_letter
            for cell in column:
                try:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                except:
                    pass
            adjusted_width = min(max_length + 2, 25)
            worksheet.column_dimensions[column_letter].width = adjusted_width

# ─── PRINT RESULTS ──────────────────
print(f"\n{'='*60}")
print("FRIEDMAN TEST ANALYSIS COMPLETE")
print(f"{'='*60}")

print(f"\nFiles created:")
print(f"Main results: {friedman_csv}")
print(f"Ranks summary: {ranks_csv}")
if len(posthoc_df) > 0:
    print(f"Post-hoc tests: {posthoc_csv}")
print(f"Excel summary: {excel_file}")

print(f"\nOVERALL SUMMARY:")
total_tests = len(friedman_df)
significant_tests = sum(friedman_df['Significant'])
print(f"Total model-dataset combinations tested: {total_tests}")
print(f"Significant differences found: {significant_tests}")
print(f"Percentage with significant differences: {significant_tests/total_tests*100:.1f}%")

if significant_tests > 0:
    print(f"\nBEST CONFIGURATIONS BY FREQUENCY:")
    best_configs = friedman_df[friedman_df['Significant']]['Best_Config'].value_counts()
    for config, count in best_configs.items():
        print(f"  {config}: {count} cases ({count/significant_tests*100:.1f}%)")

    print(f"\nSIGNIFICANT CASES:")
    sig_cases = friedman_df[friedman_df['Significant']]
    for _, row in sig_cases.iterrows():
        print(f"  {row['Model']} - {row['Dataset']}: {row['Best_Config']} "
              f"(p={row['P_Value']:.4f})")

print(f"\nThe Friedman test ranks configurations from 1 (best) to 5 (worst) based on performance.")
print(f"Lower rank = better performance. Post-hoc tests show which specific pairs differ significantly.")

Collecting F1 scores for Friedman test...
Processing model: llama3_8b
Processing model: llama3_70b
Processing model: mistral_7b
Processing model: gemma3_4b
Processing model: wizardlm2_7b

Running Friedman tests...

Analyzing llama3_8b - pan:
  Friedman χ² = 10.2400, p = 0.036573
  Significant: Yes
  Best config: 5-shot (rank = 3.70)

Analyzing llama3_8b - maalej:
  Friedman χ² = 29.3600, p = 0.000007
  Significant: Yes
  Best config: 7-shot (rank = 4.60)

Analyzing llama3_8b - scalabrino:
  Friedman χ² = 30.1600, p = 0.000005
  Significant: Yes
  Best config: 0-shot (rank = 4.40)

Analyzing llama3_70b - pan:
  Friedman χ² = 4.1600, p = 0.384785
  Significant: No
  Best config: 7-shot (rank = 3.60)

Analyzing llama3_70b - maalej:
  Friedman χ² = 18.8000, p = 0.000860
  Significant: Yes
  Best config: 3-shot (rank = 4.20)

Analyzing llama3_70b - scalabrino:
  Friedman χ² = 34.0000, p = 0.000001
  Significant: Yes
  Best config: 7-shot (rank = 4.80)

Analyzing mistral_7b - pan:
  Friedman

In [9]:
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

# ─── CONFIG ─────────────────────────────
FS_PREFIX = "few_shot_folds_"
DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_8b", "llama3_70b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
SHOT_COUNT = 5  # Focus on 5-shot comparisons
FOLDS = range(10)
OUT_CSV = "mcnemar_results/mcnemar_5shot_model_comparison.csv"
OUT_EXCEL = "mcnemar_results/mcnemar_5shot_model_comparison.xlsx"
os.makedirs("mcnemar_results", exist_ok=True)

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model_name, dataset, fold, shot=5):
    path = os.path.join(f"{FS_PREFIX}{model_name}_{dataset}", "preds", f"{dataset}_fold{fold}_{shot}shot_preds.csv")
    
    if not os.path.exists(path):
        print(f"Missing file: {path}")
        return None
    
    df = pd.read_csv(path)
    return df[["text", "gold", "pred"]]

# ─── RUN MCNEMAR TEST ──────────────────
all_results = []

print("Running McNemar tests for 5-shot model vs model comparisons...")

# Generate all pairwise model comparisons
for i, model1 in enumerate(MODELS):
    for model2 in MODELS[i+1:]:  # Only upper triangular comparisons
        print(f"Comparing {model1} vs {model2}")
        
        p1_all, p2_all = [], []
        
        for dataset in DATASETS:
            for fold in FOLDS:
                df1 = load_preds(model1, dataset, fold, SHOT_COUNT)
                df2 = load_preds(model2, dataset, fold, SHOT_COUNT)
                
                if df1 is None or df2 is None:
                    continue
                
                # Use inner join to handle mismatched samples
                merged = pd.merge(df1, df2, on="text", suffixes=("_1", "_2"), how="inner")
                
                gold = merged["gold_1"].str.lower().str.strip()
                pred1 = merged["pred_1"].str.lower().str.strip()
                pred2 = merged["pred_2"].str.lower().str.strip()
                
                for y, p1, p2 in zip(gold, pred1, pred2):
                    p1_all.append(p1 == y)
                    p2_all.append(p2 == y)
        
        if len(p1_all) == 0:
            print(f"No data found for {model1} vs {model2}")
            continue
        
        # McNemar contingency table
        both_correct = sum(p1 and p2 for p1, p2 in zip(p1_all, p2_all))
        only_model1 = sum(p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        only_model2 = sum(p2 and not p1 for p1, p2 in zip(p1_all, p2_all))
        both_wrong = sum(not p1 and not p2 for p1, p2 in zip(p1_all, p2_all))
        
        table = [[both_correct, only_model1], [only_model2, both_wrong]]
        
        # Run McNemar test
        result = mcnemar(table, exact=False, correction=True)
        
        # Calculate accuracies
        acc1 = sum(p1_all) / len(p1_all)
        acc2 = sum(p2_all) / len(p2_all)
        
        all_results.append({
            "Model_1": model1,
            "Model_2": model2,
            "Model_1_Accuracy": acc1,
            "Model_2_Accuracy": acc2,
            "Accuracy_Diff": acc2 - acc1,
            "Both_Correct": both_correct,
            "Only_Model_1": only_model1,
            "Only_Model_2": only_model2,
            "Both_Wrong": both_wrong,
            "Total_Samples": len(p1_all),
            "Chi_Square": result.statistic,
            "P_Value": result.pvalue,
            "Significant": result.pvalue < 0.05
        })

# ─── CREATE RESULTS TABLE ──────────────────
results_df = pd.DataFrame(all_results)

# Round numerical columns for better display
results_df["Model_1_Accuracy"] = results_df["Model_1_Accuracy"].round(4)
results_df["Model_2_Accuracy"] = results_df["Model_2_Accuracy"].round(4)
results_df["Accuracy_Diff"] = results_df["Accuracy_Diff"].round(4)
results_df["Chi_Square"] = results_df["Chi_Square"].round(4)
results_df["P_Value"] = results_df["P_Value"].round(6)

# Sort by p-value for easier interpretation
results_df = results_df.sort_values("P_Value")

# Save CSV
results_df.to_csv(OUT_CSV, index=False)

# ─── CREATE MATRIX VIEW ──────────────────
# Create a matrix showing p-values for all pairwise comparisons
matrix_data = []

for model1 in MODELS:
    row = [model1]  # Start with model name
    
    for model2 in MODELS:
        if model1 == model2:
            row.append("-")  # Diagonal
        else:
            # Find the comparison (could be either direction)
            comparison = next((r for r in all_results 
                             if (r['Model_1'] == model1 and r['Model_2'] == model2) or
                                (r['Model_1'] == model2 and r['Model_2'] == model1)), None)
            
            if comparison:
                p_val = comparison['P_Value']
                significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
                row.append(f"{p_val:.4f}{significance}")
            else:
                row.append("")
    
    matrix_data.append(row)

# Create matrix DataFrame
matrix_columns = ["Model"] + MODELS
matrix_df = pd.DataFrame(matrix_data, columns=matrix_columns)

# ─── CREATE STRUCTURED MATRIX LIKE YOUR SCREENSHOT ──────────────────
# This creates the format you showed: Model rows with shot type sub-rows
structured_matrix_data = []

for model in MODELS:
    # Create a row for this model showing comparisons with all other models
    row = [model, "5 shot"]  # Model name and shot type
    
    # Add comparisons with each other model
    for other_model in MODELS:
        if model == other_model:
            row.append("-")  # Same model
        else:
            # Find comparison between this model and other_model
            comparison = next((r for r in all_results 
                             if (r['Model_1'] == model and r['Model_2'] == other_model) or
                                (r['Model_1'] == other_model and r['Model_2'] == model)), None)
            
            if comparison:
                chi_val = comparison['Chi_Square']
                p_val = comparison['P_Value']
                row.append(f"({chi_val:.3f}/{p_val:.3f})")
            else:
                row.append("")
    
    structured_matrix_data.append(row)

# Create structured matrix DataFrame (like your screenshot)
structured_columns = ["Model", "Shot Type"] + MODELS
structured_matrix_df = pd.DataFrame(structured_matrix_data, columns=structured_columns)

# Save additional structured matrix CSV
structured_csv = "mcnemar_results/mcnemar_5shot_structured_matrix.csv"
structured_matrix_df.to_csv(structured_csv, index=False)

# ─── DETERMINE WINNER MODEL ──────────────────
# Calculate overall performance metrics for each model
model_performance = {}

for model in MODELS:
    # Get all accuracy values for this model (both as Model_1 and Model_2)
    model1_results = results_df[results_df['Model_1'] == model]['Model_1_Accuracy'].tolist()
    model2_results = results_df[results_df['Model_2'] == model]['Model_2_Accuracy'].tolist()
    
    all_accuracies = model1_results + model2_results
    
    if all_accuracies:
        avg_accuracy = sum(all_accuracies) / len(all_accuracies)
    else:
        avg_accuracy = 0
    
    # Count wins (significantly better performance)
    wins = 0
    losses = 0
    
    for _, row in results_df.iterrows():
        if row['Significant']:
            if row['Model_1'] == model and row['Accuracy_Diff'] < 0:  # Model_1 (this model) is better
                wins += 1
            elif row['Model_2'] == model and row['Accuracy_Diff'] > 0:  # Model_2 (this model) is better
                wins += 1
            elif row['Model_1'] == model and row['Accuracy_Diff'] > 0:  # Model_1 (this model) is worse
                losses += 1
            elif row['Model_2'] == model and row['Accuracy_Diff'] < 0:  # Model_2 (this model) is worse
                losses += 1
    
    model_performance[model] = {
        'avg_accuracy': avg_accuracy,
        'wins': wins,
        'losses': losses,
        'win_rate': wins / (wins + losses) if (wins + losses) > 0 else 0
    }

# Find the winner
best_model = max(model_performance.keys(), 
                key=lambda x: (model_performance[x]['avg_accuracy'], 
                              model_performance[x]['win_rate']))

second_best = sorted(model_performance.keys(), 
                    key=lambda x: (model_performance[x]['avg_accuracy'], 
                                  model_performance[x]['win_rate']), 
                    reverse=True)[1]

# ─── SAVE EXCEL WITH MULTIPLE SHEETS ──────────────────
# Save as Excel with multiple sheets
with pd.ExcelWriter(OUT_EXCEL, engine='openpyxl') as writer:
    # Sheet 1: Detailed results
    results_df.to_excel(writer, sheet_name='Detailed Results', index=False)
    
    # Sheet 2: P-value matrix
    matrix_df.to_excel(writer, sheet_name='P-Value Matrix', index=False)
    
    # Sheet 3: Structured matrix (like your screenshot)
    structured_matrix_df.to_excel(writer, sheet_name='Structured Matrix', index=False)
    
    # Format the sheets
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    
    # Define styles
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    model_font = Font(bold=True)
    model_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")
    significant_fill = PatternFill(start_color="FFE6E6", end_color="FFE6E6", fill_type="solid")
    center_alignment = Alignment(horizontal="center", vertical="center")
    thin_border = Border(
        left=Side(style='thin'), right=Side(style='thin'),
        top=Side(style='thin'), bottom=Side(style='thin')
    )
    
    # Format detailed results sheet
    ws1 = writer.sheets['Detailed Results']
    for cell in ws1[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Highlight significant results
    for row in range(2, ws1.max_row + 1):
        p_val_cell = ws1[f'L{row}']  # P_Value column
        significant_cell = ws1[f'M{row}']  # Significant column
        
        if significant_cell.value:
            for col in range(1, ws1.max_column + 1):
                ws1.cell(row=row, column=col).fill = significant_fill
    
    # Format matrix sheet
    ws2 = writer.sheets['P-Value Matrix']
    for cell in ws2[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format structured matrix sheet (like your screenshot)
    ws3 = writer.sheets['Structured Matrix']
    
    # Format header row
    for cell in ws3[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_alignment
        cell.border = thin_border
    
    # Format data rows with model name styling
    for row in range(2, ws3.max_row + 1):
        model_cell = ws3[f'A{row}']
        model_cell.font = model_font
        model_cell.fill = model_fill
        model_cell.alignment = center_alignment
        model_cell.border = thin_border
        
        # Format other cells in the row
        for col in range(2, ws3.max_column + 1):
            cell = ws3.cell(row=row, column=col)
            cell.alignment = center_alignment
            cell.border = thin_border
    
    # Auto-adjust column widths for all sheets
    for worksheet in [ws1, ws2, ws3]:
        for column in worksheet.columns:
            max_length = 0
            column_letter = column[0].column_letter
            for cell in column:
                try:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                except:
                    pass
            adjusted_width = min(max_length + 2, 25)
            worksheet.column_dimensions[column_letter].width = adjusted_width

# ─── PRINT RESULTS ──────────────────
print(f"\nFiles saved:")
print(f"CSV: {OUT_CSV}")
print(f"Structured Matrix CSV: {structured_csv}")
print(f"Excel: {OUT_EXCEL}")

print(f"\n🏆 WINNER ANALYSIS - 5-Shot Performance:")
print("=" * 50)
print(f"🥇 BEST MODEL: {best_model}")
print(f"   Average Accuracy: {model_performance[best_model]['avg_accuracy']:.4f}")
print(f"   Significant Wins: {model_performance[best_model]['wins']}")
print(f"   Significant Losses: {model_performance[best_model]['losses']}")
print(f"   Win Rate: {model_performance[best_model]['win_rate']:.3f}")

print(f"\n🥈 SECOND BEST: {second_best}")
print(f"   Average Accuracy: {model_performance[second_best]['avg_accuracy']:.4f}")
print(f"   Significant Wins: {model_performance[second_best]['wins']}")
print(f"   Significant Losses: {model_performance[second_best]['losses']}")
print(f"   Win Rate: {model_performance[second_best]['win_rate']:.3f}")

print(f"\n📊 ALL MODEL RANKINGS:")
print("-" * 40)
sorted_models = sorted(model_performance.items(), 
                      key=lambda x: (x[1]['avg_accuracy'], x[1]['win_rate']), 
                      reverse=True)

for i, (model, stats) in enumerate(sorted_models, 1):
    print(f"{i}. {model:<15} | Acc: {stats['avg_accuracy']:.4f} | "
          f"W/L: {stats['wins']}/{stats['losses']} | "
          f"Win Rate: {stats['win_rate']:.3f}")

print(f"\nSummary of 5-shot model comparisons:")
print(f"Total comparisons: {len(results_df)}")
print(f"Significant differences (p < 0.05): {sum(results_df['Significant'])}")

# Print top significant differences
print("\nMost significant differences:")
significant_results = results_df[results_df['Significant']].head(5)
for _, row in significant_results.iterrows():
    winner = row['Model_1'] if row['Accuracy_Diff'] < 0 else row['Model_2']
    loser = row['Model_2'] if row['Accuracy_Diff'] < 0 else row['Model_1']
    print(f"🔥 {winner} > {loser}: p = {row['P_Value']:.6f}, "
          f"Acc diff = {abs(row['Accuracy_Diff']):.4f}")

print(f"\nStructured matrix format saved as: {structured_csv}")
print("This matches the format shown in your screenshot with:")
print("- Model names in first column")
print("- Shot type (5 shot) in second column") 
print("- (Chi-square/P-value) format in remaining columns")
print("- Each row represents one model's 5-shot performance vs all others")



Running McNemar tests for 5-shot model vs model comparisons...
Comparing llama3_8b vs llama3_70b
Comparing llama3_8b vs mistral_7b
Comparing llama3_8b vs gemma3_4b
Comparing llama3_8b vs wizardlm2_7b
Comparing llama3_70b vs mistral_7b
Comparing llama3_70b vs gemma3_4b
Comparing llama3_70b vs wizardlm2_7b
Comparing mistral_7b vs gemma3_4b
Comparing mistral_7b vs wizardlm2_7b
Comparing gemma3_4b vs wizardlm2_7b

Files saved:
CSV: mcnemar_results/mcnemar_5shot_model_comparison.csv
Structured Matrix CSV: mcnemar_results/mcnemar_5shot_structured_matrix.csv
Excel: mcnemar_results/mcnemar_5shot_model_comparison.xlsx

🏆 WINNER ANALYSIS - 5-Shot Performance:
🥇 BEST MODEL: gemma3_4b
   Average Accuracy: 0.5928
   Significant Wins: 4
   Significant Losses: 0
   Win Rate: 1.000

🥈 SECOND BEST: llama3_70b
   Average Accuracy: 0.5612
   Significant Wins: 3
   Significant Losses: 1
   Win Rate: 0.750

📊 ALL MODEL RANKINGS:
----------------------------------------
1. gemma3_4b       | Acc: 0.5928 | W/

In [10]:
import pandas as pd
import os
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side

# ─── CONFIG ─────────────────────────────
IN_CSV = "mcnemar_results/mcnemar_5shot_model_comparison.csv"
OUT_DIR = "mcnemar_results"
os.makedirs(OUT_DIR, exist_ok=True)

TARGET_MODEL = "llama3_70b"
COMPARE_MODELS = ["gemma3_4b", "wizardlm2_7b"]
MODELS = COMPARE_MODELS + [TARGET_MODEL]

# ─── LOAD FULL MCNEMAR RESULTS ─────────
df = pd.read_csv(IN_CSV)

# ─── STRUCTURED CHI2/PIVALUE TABLE ─────
structured_chi2 = []
for model in MODELS:
    row = [model, "5 shot"]
    for other in MODELS:
        if model == other:
            row.append("-")
        else:
            comp = df[((df["Model_1"] == model) & (df["Model_2"] == other)) |
                      ((df["Model_1"] == other) & (df["Model_2"] == model))]
            if not comp.empty:
                chi = comp["Chi_Square"].values[0]
                p = comp["P_Value"].values[0]
                row.append(f"({chi:.3f}/{p:.3f})")
            else:
                row.append("")
    structured_chi2.append(row)

columns = ["Model", "Shot Type"] + MODELS
df_chi2 = pd.DataFrame(structured_chi2, columns=columns)
chi2_csv = os.path.join(OUT_DIR, "focused_llama70b_vs_gemma_wizard_chi2.csv")
df_chi2.to_csv(chi2_csv, index=False)

# ─── STRUCTURED WINNER TABLE ───────────
structured_winner = []
for model in MODELS:
    row = [model, "5 shot"]
    for other in MODELS:
        if model == other:
            row.append("-")
        else:
            comp = df[((df["Model_1"] == model) & (df["Model_2"] == other)) |
                      ((df["Model_1"] == other) & (df["Model_2"] == model))]
            if not comp.empty:
                acc1 = comp["Model_1_Accuracy"].values[0]
                acc2 = comp["Model_2_Accuracy"].values[0]
                m1 = comp["Model_1"].values[0]
                m2 = comp["Model_2"].values[0]
                winner = m1 if acc1 > acc2 else m2
                row.append(winner)
            else:
                row.append("")
    structured_winner.append(row)

df_winner = pd.DataFrame(structured_winner, columns=columns)
winner_csv = os.path.join(OUT_DIR, "focused_llama70b_vs_gemma_wizard_winner.csv")
df_winner.to_csv(winner_csv, index=False)

# ─── SAVE AS EXCEL ─────────────────────
excel_out = os.path.join(OUT_DIR, "focused_llama70b_vs_gemma_wizard.xlsx")
with pd.ExcelWriter(excel_out, engine="openpyxl") as writer:
    df_chi2.to_excel(writer, sheet_name="Chi2_Pvalue", index=False)
    df_winner.to_excel(writer, sheet_name="Winner", index=False)

# ─── FORMAT EXCEL ──────────────────────
wb = load_workbook(excel_out)
for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]
    # Style header
    for cell in ws[1]:
        cell.font = Font(bold=True, color="FFFFFF")
        cell.fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")
        cell.alignment = Alignment(horizontal="center", vertical="center")
        cell.border = Border(
            left=Side(style='thin'), right=Side(style='thin'),
            top=Side(style='thin'), bottom=Side(style='thin')
        )
    # Style data
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(horizontal="center", vertical="center")
            cell.border = Border(
                left=Side(style='thin'), right=Side(style='thin'),
                top=Side(style='thin'), bottom=Side(style='thin')
            )
# Adjust column width
for ws in wb.worksheets:
    for col in ws.columns:
        max_len = max((len(str(cell.value)) for cell in col if cell.value), default=0)
        ws.column_dimensions[col[0].column_letter].width = min(max_len + 2, 30)

wb.save(excel_out)

# ─── DONE ──────────────────────────────
print(f"✅ Structured pivot table with chi2 saved to:\n→ {chi2_csv}")
print(f"✅ Structured winner table saved to:\n→ {winner_csv}")
print(f"✅ Excel file with both sheets saved to:\n→ {excel_out}")


✅ Structured pivot table with chi2 saved to:
→ mcnemar_results/focused_llama70b_vs_gemma_wizard_chi2.csv
✅ Structured winner table saved to:
→ mcnemar_results/focused_llama70b_vs_gemma_wizard_winner.csv
✅ Excel file with both sheets saved to:
→ mcnemar_results/focused_llama70b_vs_gemma_wizard.xlsx


In [3]:
#gpt
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar
from collections import defaultdict

# ─── CONFIG ─────────────────────────────
FS_PREFIX = "few_shot_folds_"
MODELS = ["llama3_70b", "llama3_8b", "mistral_7b", "gemma3_4b", "wizardlm2_7b"]
DATASETS = ["pan", "maalej", "scalabrino"]
FOLDS = range(10)
SHOT = 5

OUT_DIR = "mcnemar_results_modelvsmodel_gpt"
OUT_CSV = os.path.join(OUT_DIR, "mcnemar_5shot_all_model_pairs.csv")
os.makedirs(OUT_DIR, exist_ok=True)

# ─── LOAD PREDICTIONS ──────────────────
def load_preds(model, dataset, fold):
    path = os.path.join(f"{FS_PREFIX}{model}_{dataset}", "preds", f"{dataset}_fold{fold}_{SHOT}shot_preds.csv")
    if not os.path.exists(path):
        print(f"⚠️ Missing: {path}")
        return None
    df = pd.read_csv(path)
    return df[["text", "gold", "pred"]]

# ─── RUN MCNEMAR TEST ──────────────────
results = []
wins = defaultdict(int)

for i, model1 in enumerate(MODELS):
    for model2 in MODELS[i+1:]:
        print(f"\n🔍 Comparing {model1} vs {model2}")
        all_m1, all_m2 = [], []

        for dataset in DATASETS:
            for fold in FOLDS:
                df1 = load_preds(model1, dataset, fold)
                df2 = load_preds(model2, dataset, fold)
                if df1 is None or df2 is None:
                    continue
                merged = pd.merge(df1, df2, on="text", suffixes=("_1", "_2"))
                gold = merged["gold_1"].str.lower().str.strip()
                p1 = merged["pred_1"].str.lower().str.strip()
                p2 = merged["pred_2"].str.lower().str.strip()
                for y, pred1, pred2 in zip(gold, p1, p2):
                    all_m1.append(pred1 == y)
                    all_m2.append(pred2 == y)

        if not all_m1:
            print(f"⚠️ No data for {model1} vs {model2}")
            continue

        # Contingency table
        both_correct = sum(m1 and m2 for m1, m2 in zip(all_m1, all_m2))
        only_m1 = sum(m1 and not m2 for m1, m2 in zip(all_m1, all_m2))
        only_m2 = sum(m2 and not m1 for m1, m2 in zip(all_m1, all_m2))
        both_wrong = sum(not m1 and not m2 for m1, m2 in zip(all_m1, all_m2))
        table = [[both_correct, only_m1], [only_m2, both_wrong]]
        test_result = mcnemar(table, exact=False, correction=True)

        # Decide winner
        if only_m1 > only_m2:
            winner = model1
        elif only_m2 > only_m1:
            winner = model2
        else:
            winner = "Tie"

        if winner in MODELS:
            wins[winner] += 1

        results.append({
            "Model_1": model1,
            "Model_2": model2,
            "Only_Model_1_Correct": only_m1,
            "Only_Model_2_Correct": only_m2,
            "Chi2_Stat": test_result.statistic,
            "P_Value": test_result.pvalue,
            "Significant": test_result.pvalue < 0.05,
            "Winner": winner,
            "Total_Samples": len(all_m1)
        })

# ─── SAVE RESULTS ──────────────────────
df = pd.DataFrame(results)
df.to_csv(OUT_CSV, index=False)

# ─── RANKING SUMMARY ───────────────────
ranking = sorted(wins.items(), key=lambda x: x[1], reverse=True)
print("\n🏆 McNemar Win Count Summary (5-shot):")
for model, count in ranking:
    print(f"{model}: {count} wins")

if ranking:
    print(f"\n🎯 Final Best Model (5-shot, pairwise wins): **{ranking[0][0]}**")
else:
    print("❌ No valid comparisons available.")

print(f"\n📄 Detailed results saved to: {OUT_CSV}")



🔍 Comparing llama3_70b vs llama3_8b

🔍 Comparing llama3_70b vs mistral_7b

🔍 Comparing llama3_70b vs gemma3_4b

🔍 Comparing llama3_70b vs wizardlm2_7b

🔍 Comparing llama3_8b vs mistral_7b

🔍 Comparing llama3_8b vs gemma3_4b

🔍 Comparing llama3_8b vs wizardlm2_7b

🔍 Comparing mistral_7b vs gemma3_4b

🔍 Comparing mistral_7b vs wizardlm2_7b

🔍 Comparing gemma3_4b vs wizardlm2_7b

🏆 McNemar Win Count Summary (5-shot):
gemma3_4b: 4 wins
llama3_70b: 3 wins
wizardlm2_7b: 2 wins
mistral_7b: 1 wins

🎯 Final Best Model (5-shot, pairwise wins): **gemma3_4b**

📄 Detailed results saved to: mcnemar_results_modelvsmodel_gpt/mcnemar_5shot_all_model_pairs.csv


In [18]:
#Benchmark
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar
from itertools import combinations
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl import load_workbook

# ─── CONFIG ─────────────────────────────
INPUT_DIR = "all_preds_export"
OUT_DIR = "mcnemar_results_benchmark"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["bert", "albert", "sbert", "roberta"]
FOLDS = range(10)

# ─── FUNCTION: Load and Merge All Folds ─────────────
def load_model_preds(model, dataset):
    dfs = []
    for fold in FOLDS:
        fname = f"{dataset}_{model}_fold{fold}_test_preds.csv"
        path = os.path.join(INPUT_DIR, fname)
        if not os.path.exists(path):
            print(f"⚠️ Missing: {fname}")
            continue
        df = pd.read_csv(path)
        df = df.rename(columns={"y_true": "gold", "y_pred": "pred"})
        df["fold"] = fold
        df["dataset"] = dataset
        df["id"] = df.index + fold * 100000  # Ensure uniqueness across folds
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True) if dfs else None

# ─── STEP 1: Collect predictions for each model ──────
model_preds = {}
for model in MODELS:
    all_df = []
    for dataset in DATASETS:
        df = load_model_preds(model, dataset)
        if df is not None:
            all_df.append(df)
    if all_df:
        model_preds[model] = pd.concat(all_df, ignore_index=True)

# ─── STEP 2: McNemar Test (model vs model) ───────────
results = []
model_pairs = list(combinations(MODELS, 2))

for m1, m2 in model_pairs:
    df1 = model_preds.get(m1)
    df2 = model_preds.get(m2)
    if df1 is None or df2 is None:
        continue

    # Align on same row order (same test set assumption)
    merged = pd.merge(df1, df2, on=["dataset", "fold", "id", "gold"], suffixes=(f"_{m1}", f"_{m2}"))
    correct1 = merged[f"pred_{m1}"] == merged["gold"]
    correct2 = merged[f"pred_{m2}"] == merged["gold"]

    both_correct = ((correct1) & (correct2)).sum()
    only_m1 = ((correct1) & (~correct2)).sum()
    only_m2 = ((~correct1) & (correct2)).sum()
    both_wrong = ((~correct1) & (~correct2)).sum()

    table = [[both_correct, only_m1],
             [only_m2, both_wrong]]

    result = mcnemar(table, exact=False, correction=True)
    results.append({
        "Model_1": m1.upper(),
        "Model_2": m2.upper(),
        "Chi_Square": result.statistic,
        "P_Value": result.pvalue
    })

# ─── STEP 3: Output as Matrix ───────────────────────
matrix_models = [m.upper() for m in MODELS]
matrix_data = []

for row_model in matrix_models:
    row = [row_model]
    for col_model in matrix_models:
        if row_model == col_model:
            row.append("-")
        else:
            res = next((r for r in results if (r["Model_1"], r["Model_2"]) == (row_model, col_model) or 
                                                (r["Model_2"], r["Model_1"]) == (row_model, col_model)), None)
            if res:
                val = f"{res['Chi_Square']:.3f}/{res['P_Value']:.3f}"
            else:
                val = ""
            row.append(val)
    matrix_data.append(row)

# ─── STEP 4: Save CSV & Excel ───────────────────────
csv_path = os.path.join(OUT_DIR, "mcnemar_model_vs_model.csv")
xlsx_path = os.path.join(OUT_DIR, "mcnemar_model_vs_model.xlsx")

df_matrix = pd.DataFrame(matrix_data, columns=[""] + matrix_models)
df_matrix.to_csv(csv_path, index=False)

with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
    df_matrix.to_excel(writer, sheet_name="Model Comparison", index=False)
    ws = writer.sheets["Model Comparison"]

    # Header style
    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="366092")
    center_align = Alignment(horizontal="center", vertical="center")
    thin_border = Border(left=Side(style='thin'), right=Side(style='thin'),
                         top=Side(style='thin'), bottom=Side(style='thin'))

    # Format header
    for cell in ws[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_align
        cell.border = thin_border

    # Format data rows
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = center_align
            cell.border = thin_border

    # Auto-size columns
    for col in ws.columns:
        max_len = max(len(str(cell.value)) for cell in col)
        ws.column_dimensions[col[0].column_letter].width = min(max_len + 2, 25)

print("✅ Model-vs-Model McNemar test completed.")
print(f"📁 CSV:   {csv_path}")
print(f"📊 Excel: {xlsx_path}")

  
# ─── STEP 2B: Winner Matrix ───────────────
winner_matrix = []

for row_model in matrix_models:
    row = [row_model]
    for col_model in matrix_models:
        if row_model == col_model:
            row.append("-")
        else:
            res = next((r for r in detailed_results if 
                        (r["Model_1"], r["Model_2"]) == (row_model, col_model) or
                        (r["Model_2"], r["Model_1"]) == (row_model, col_model)), None)
            row.append(res["Winner"] if res else "")
    winner_matrix.append(row)


df_winner = pd.DataFrame(winner_matrix, columns=[""] + matrix_models)

# Save
winner_csv = os.path.join(OUT_DIR, "mcnemar_model_vs_model_winner.csv")
winner_xlsx = os.path.join(OUT_DIR, "mcnemar_model_vs_model_winner.xlsx")
df_winner.to_csv(winner_csv, index=False)
df_winner.to_excel(winner_xlsx, index=False)

print(f"✅ Winner matrix saved:\n - {winner_csv}\n - {winner_xlsx}")
detailed_results = []

for m1, m2 in model_pairs:
    df1 = model_preds.get(m1)
    df2 = model_preds.get(m2)
    if df1 is None or df2 is None:
        continue

    merged = pd.merge(df1, df2, on=["dataset", "fold", "id", "gold"], suffixes=(f"_{m1}", f"_{m2}"))
    correct1 = merged[f"pred_{m1}"] == merged["gold"]
    correct2 = merged[f"pred_{m2}"] == merged["gold"]

    both_correct = ((correct1) & (correct2)).sum()
    only_m1 = ((correct1) & (~correct2)).sum()
    only_m2 = ((~correct1) & (correct2)).sum()
    both_wrong = ((~correct1) & (~correct2)).sum()

    table = [[both_correct, only_m1],
             [only_m2, both_wrong]]

    result = mcnemar(table, exact=False, correction=True)

    acc1 = correct1.mean()
    acc2 = correct2.mean()
    acc_diff = acc1 - acc2

    if acc1 > acc2:
        winner = m1.upper()
    elif acc2 > acc1:
        winner = m2.upper()
    else:
        winner = "TIE"

    detailed_results.append({
        "Model_1": m1.upper(),
        "Model_2": m2.upper(),
        "Model_1_Accuracy": round(acc1, 4),
        "Model_2_Accuracy": round(acc2, 4),
        "Accuracy_Diff": round(acc_diff, 4),
        "Both_Correct": int(both_correct),
        "Only_Model1": int(only_m1),
        "Only_Model2": int(only_m2),
        "Both_Wrong": int(both_wrong),
        "Chi_Square": round(result.statistic, 3),
        "P_Value": round(result.pvalue, 3),
        "Winner": winner
    })

df_detailed = pd.DataFrame(detailed_results)

# Save detailed file
detailed_csv = os.path.join(OUT_DIR, "mcnemar_model_comparison_detailed.csv")
detailed_xlsx = os.path.join(OUT_DIR, "mcnemar_model_comparison_detailed.xlsx")

df_detailed.to_csv(detailed_csv, index=False)
df_detailed.to_excel(detailed_xlsx, index=False)

print(f"📄 Detailed results saved:\n - {detailed_csv}\n - {detailed_xlsx}")


✅ Model-vs-Model McNemar test completed.
📁 CSV:   mcnemar_results_benchmark/mcnemar_model_vs_model.csv
📊 Excel: mcnemar_results_benchmark/mcnemar_model_vs_model.xlsx
✅ Winner matrix saved:
 - mcnemar_results_benchmark/mcnemar_model_vs_model_winner.csv
 - mcnemar_results_benchmark/mcnemar_model_vs_model_winner.xlsx
📄 Detailed results saved:
 - mcnemar_results_benchmark/mcnemar_model_comparison_detailed.csv
 - mcnemar_results_benchmark/mcnemar_model_comparison_detailed.xlsx


In [24]:
#llama3:70b vs qlora
import os
import pandas as pd
from itertools import combinations
from statsmodels.stats.contingency_tables import mcnemar
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side

# ─── CONFIG ─────────────────────────────
OUT_DIR = "mcnemar_llama_vs_qlora"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["pan", "maalej", "scalabrino"]
MODELS = ["llama3_70b", "qlora"]
MODEL_DIRS = {
    "llama3_70b": "llama3_70b",
    "qlora": "qlora"
}
FOLDS = range(10)

# ─── FUNCTION: Load Predictions ────────
def load_model_preds(model, dataset):
    dfs = []
    model_dir = MODEL_DIRS[model]
    for fold in FOLDS:
        fname = f"{dataset}_fold{fold}_test_preds.csv"
        path = os.path.join(model_dir, fname)
        if not os.path.exists(path):
            print(f"⚠️ Missing: {fname}")
            continue
        df = pd.read_csv(path)
        df = df.rename(columns={"y_true": "gold", "y_pred": "pred"})
        df["fold"] = fold
        df["dataset"] = dataset
        df["id"] = df.index + fold * 100000  # Ensure uniqueness across folds
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True) if dfs else None

# ─── STEP 1: Collect predictions ───────
model_preds = {}
for model in MODELS:
    all_df = []
    for dataset in DATASETS:
        df = load_model_preds(model, dataset)
        if df is not None:
            all_df.append(df)
    if all_df:
        model_preds[model] = pd.concat(all_df, ignore_index=True)

# ─── STEP 2: McNemar Test ──────────────
results = []
detailed_results = []
model_pairs = list(combinations(MODELS, 2))

for m1, m2 in model_pairs:
    df1 = model_preds.get(m1)
    df2 = model_preds.get(m2)
    if df1 is None or df2 is None:
        continue

    merged = pd.merge(df1, df2, on=["dataset", "fold", "id", "gold"], suffixes=(f"_{m1}", f"_{m2}"))
    correct1 = merged[f"pred_{m1}"] == merged["gold"]
    correct2 = merged[f"pred_{m2}"] == merged["gold"]

    both_correct = ((correct1) & (correct2)).sum()
    only_m1 = ((correct1) & (~correct2)).sum()
    only_m2 = ((~correct1) & (correct2)).sum()
    both_wrong = ((~correct1) & (~correct2)).sum()

    table = [[both_correct, only_m1],
             [only_m2, both_wrong]]

    result = mcnemar(table, exact=False, correction=True)

    acc1 = correct1.mean()
    acc2 = correct2.mean()
    acc_diff = acc1 - acc2

    if acc1 > acc2:
        winner = m1.upper()
    elif acc2 > acc1:
        winner = m2.upper()
    else:
        winner = "TIE"

    results.append({
        "Model_1": m1.upper(),
        "Model_2": m2.upper(),
        "Chi_Square": result.statistic,
        "P_Value": result.pvalue
    })

    detailed_results.append({
        "Model_1": m1.upper(),
        "Model_2": m2.upper(),
        "Model_1_Accuracy": round(acc1, 4),
        "Model_2_Accuracy": round(acc2, 4),
        "Accuracy_Diff": round(acc_diff, 4),
        "Both_Correct": int(both_correct),
        "Only_Model1": int(only_m1),
        "Only_Model2": int(only_m2),
        "Both_Wrong": int(both_wrong),
        "Chi_Square": round(result.statistic, 3),
        "P_Value": round(result.pvalue, 3),
        "Winner": winner
    })

# ─── STEP 3: McNemar Matrix ─────────────
matrix_models = [m.upper() for m in MODELS]
matrix_data = []

for row_model in matrix_models:
    row = [row_model]
    for col_model in matrix_models:
        if row_model == col_model:
            row.append("-")
        else:
            res = next((r for r in results if 
                        (r["Model_1"], r["Model_2"]) == (row_model, col_model) or 
                        (r["Model_2"], r["Model_1"]) == (row_model, col_model)), None)
            val = f"{res['Chi_Square']:.3f}/{res['P_Value']:.3f}" if res else ""
            row.append(val)
    matrix_data.append(row)

df_matrix = pd.DataFrame(matrix_data, columns=[""] + matrix_models)

# Save matrix
matrix_csv = os.path.join(OUT_DIR, "mcnemar_model_vs_model.csv")
matrix_xlsx = os.path.join(OUT_DIR, "mcnemar_model_vs_model.xlsx")
df_matrix.to_csv(matrix_csv, index=False)

# Excel styling
with pd.ExcelWriter(matrix_xlsx, engine="openpyxl") as writer:
    df_matrix.to_excel(writer, sheet_name="Model Comparison", index=False)
    ws = writer.sheets["Model Comparison"]

    header_font = Font(bold=True, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="366092")
    center_align = Alignment(horizontal="center", vertical="center")
    thin_border = Border(left=Side(style='thin'), right=Side(style='thin'),
                         top=Side(style='thin'), bottom=Side(style='thin'))

    for cell in ws[1]:
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = center_align
        cell.border = thin_border

    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = center_align
            cell.border = thin_border

    for col in ws.columns:
        max_len = max(len(str(cell.value)) for cell in col)
        ws.column_dimensions[col[0].column_letter].width = min(max_len + 2, 25)

# ─── STEP 4: Winner Matrix ───────────────
winner_matrix = []
for row_model in matrix_models:
    row = [row_model]
    for col_model in matrix_models:
        if row_model == col_model:
            row.append("-")
        else:
            res = next((r for r in detailed_results if 
                        (r["Model_1"], r["Model_2"]) == (row_model, col_model) or 
                        (r["Model_2"], r["Model_1"]) == (row_model, col_model)), None)
            row.append(res["Winner"] if res else "")
    winner_matrix.append(row)

df_winner = pd.DataFrame(winner_matrix, columns=[""] + matrix_models)
winner_csv = os.path.join(OUT_DIR, "mcnemar_model_vs_model_winner.csv")
winner_xlsx = os.path.join(OUT_DIR, "mcnemar_model_vs_model_winner.xlsx")
df_winner.to_csv(winner_csv, index=False)
df_winner.to_excel(winner_xlsx, index=False)

# ─── STEP 5: Detailed Results ─────────────
df_detailed = pd.DataFrame(detailed_results)
detailed_csv = os.path.join(OUT_DIR, "mcnemar_model_comparison_detailed.csv")
detailed_xlsx = os.path.join(OUT_DIR, "mcnemar_model_comparison_detailed.xlsx")
df_detailed.to_csv(detailed_csv, index=False)
df_detailed.to_excel(detailed_xlsx, index=False)

# ─── DONE ────────────────────────────────
print("✅ McNemar test complete.")
print(f"📁 Matrix CSV:     {matrix_csv}")
print(f"📊 Matrix Excel:   {matrix_xlsx}")
print(f"🏆 Winner Matrix:  {winner_csv}")
print(f"📄 Detailed File:  {detailed_csv}")


✅ McNemar test complete.
📁 Matrix CSV:     mcnemar_llama_vs_qlora/mcnemar_model_vs_model.csv
📊 Matrix Excel:   mcnemar_llama_vs_qlora/mcnemar_model_vs_model.xlsx
🏆 Winner Matrix:  mcnemar_llama_vs_qlora/mcnemar_model_vs_model_winner.csv
📄 Detailed File:  mcnemar_llama_vs_qlora/mcnemar_model_comparison_detailed.csv


In [39]:
#gen vs nongen
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl import load_workbook

# ─── CONFIG ─────────────────────────────
FOLDS = range(10)
DATASETS = ["pan", "maalej", "scalabrino"]
BASE_DIR = ""
MODELS = ["qlora", "roberta"]
OUT_DIR = "mcnemar_qlora_vs_roberta"
os.makedirs(OUT_DIR, exist_ok=True)

# ─── LOAD & MERGE ───────────────────────
def load_preds(model, dataset):
    dfs = []
    for fold in FOLDS:
        file = f"{dataset}_fold{fold}_test_preds.csv"
        path = os.path.join(BASE_DIR, model, file)
        if not os.path.exists(path):
            print(f"⚠️ Missing: {path}")
            continue
        df = pd.read_csv(path)
        df = df[["id", "gold", "pred"]].copy()
        df["fold"] = fold
        df["dataset"] = dataset
        df["model"] = model
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True) if dfs else None

df_q = []
df_r = []

for d in DATASETS:
    q_df = load_preds("qlora", d)
    r_df = load_preds("roberta", d)
    if q_df is not None and r_df is not None:
        df_q.append(q_df)
        df_r.append(r_df)

df_qlora = pd.concat(df_q, ignore_index=True)
df_roberta = pd.concat(df_r, ignore_index=True)

# ─── ALIGN ROWS ─────────────────────────
merged = pd.merge(
    df_qlora,
    df_roberta,
    on=["dataset", "fold", "id", "gold"],
    suffixes=("_qlora", "_roberta")
)

# ─── RUN MCNEMAR TEST ───────────────────
correct_q = merged["pred_qlora"] == merged["gold"]
correct_r = merged["pred_roberta"] == merged["gold"]

both_correct = ((correct_q) & (correct_r)).sum()
only_q = ((correct_q) & (~correct_r)).sum()
only_r = ((~correct_q) & (correct_r)).sum()
both_wrong = ((~correct_q) & (~correct_r)).sum()

table = [[both_correct, only_q],
         [only_r, both_wrong]]

test = mcnemar(table, exact=False, correction=True)

# ─── STRUCTURE RESULTS ──────────────────
acc_q = correct_q.mean()
acc_r = correct_r.mean()


if test.pvalue < 0.05:
    if acc_q > acc_r:
        winner = "QLORA"
    elif acc_r > acc_q:
        winner = "ROBERTA"
    else:
        winner = "TIE"
else:
    winner = "TIE"


#winner = "QLORA" if acc_q > acc_r else "ROBERTA" if acc_r > acc_q else "TIE"

detailed = pd.DataFrame([{
    "Model_1": "QLORA",
    "Model_2": "ROBERTA",
    "Accuracy_QLORA": round(acc_q, 4),
    "Accuracy_ROBERTA": round(acc_r, 4),
    "Accuracy_Diff": round(acc_q - acc_r, 4),
    "Both_Correct": int(both_correct),
    "Only_QLORA_Correct": int(only_q),
    "Only_ROBERTA_Correct": int(only_r),
    "Both_Wrong": int(both_wrong),
    "Chi_Square": round(test.statistic, 3),
    "P_Value": round(test.pvalue, 3),
    "Winner": winner
}])

# ─── OUTPUTS ────────────────────────────
csv_detailed = os.path.join(OUT_DIR, "qlora_vs_roberta_detailed.csv")
xlsx_detailed = os.path.join(OUT_DIR, "qlora_vs_roberta_detailed.xlsx")

detailed.to_csv(csv_detailed, index=False)
detailed.to_excel(xlsx_detailed, index=False)

# ─── CHI / P MATRIX ─────────────────────
matrix = pd.DataFrame([
    ["", "QLORA", "ROBERTA"],
    ["QLORA", "-", f"{test.statistic:.3f}/{test.pvalue:.3f}"],
    ["ROBERTA", f"{test.statistic:.3f}/{test.pvalue:.3f}", "-"]
])
matrix.columns = matrix.iloc[0]
matrix = matrix[1:]
matrix.to_csv(os.path.join(OUT_DIR, "chi_p_matrix.csv"), index=False)

# ─── WINNER MATRIX ──────────────────────
win_matrix = pd.DataFrame([
    ["", "QLORA", "ROBERTA"],
    ["QLORA", "-", winner],
    ["ROBERTA", winner, "-"]
])

win_matrix.columns = win_matrix.iloc[0]
win_matrix = win_matrix[1:]
win_matrix.to_csv(os.path.join(OUT_DIR, "winner_matrix.csv"), index=False)

print("✅ QLoRA vs RoBERTa McNemar test complete.")


✅ QLoRA vs RoBERTa McNemar test complete.


In [36]:
import os
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np

# ─── CONFIG ─────────────────────────────
FOLDS = range(10)
DATASETS = ["pan", "maalej", "scalabrino"]
BASE_DIR = ""
MODELS = ["qlora", "roberta"]
OUT_DIR = "mcnemar_qlora_vs_roberta_with_claude"
os.makedirs(OUT_DIR, exist_ok=True)

# ─── LOAD & MERGE ───────────────────────
def load_preds(model, dataset):
    dfs = []
    for fold in FOLDS:
        file = f"{dataset}_fold{fold}_test_preds.csv"
        path = os.path.join(BASE_DIR, model, file)
        if not os.path.exists(path):
            print(f"⚠️ Missing: {path}")
            continue
        df = pd.read_csv(path)
        df = df[["id", "gold", "pred"]].copy()
        df["fold"] = fold
        df["dataset"] = dataset
        df["model"] = model
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True) if dfs else None

# Load data for both models
df_q = []
df_r = []
for d in DATASETS:
    q_df = load_preds("qlora", d)
    r_df = load_preds("roberta", d)
    if q_df is not None and r_df is not None:
        df_q.append(q_df)
        df_r.append(r_df)

df_qlora = pd.concat(df_q, ignore_index=True)
df_roberta = pd.concat(df_r, ignore_index=True)

# ─── ALIGN ROWS ─────────────────────────
merged = pd.merge(
    df_qlora,
    df_roberta,
    on=["dataset", "fold", "id", "gold"],
    suffixes=("_qlora", "_roberta")
)

print(f"Total aligned samples: {len(merged)}")

# ─── COMPUTE CORRECTNESS ────────────────
correct_q = merged["pred_qlora"] == merged["gold"]
correct_r = merged["pred_roberta"] == merged["gold"]

# ─── BUILD CONTINGENCY TABLE ────────────
both_correct = ((correct_q) & (correct_r)).sum()
only_q = ((correct_q) & (~correct_r)).sum()  # QLora correct, RoBERTa wrong
only_r = ((~correct_q) & (correct_r)).sum()  # QLora wrong, RoBERTa correct
both_wrong = ((~correct_q) & (~correct_r)).sum()

# Validate counts
total_samples = len(merged)
assert both_correct + only_q + only_r + both_wrong == total_samples, "Counts don't sum to total!"

print(f"Both correct: {both_correct}")
print(f"Only QLora correct: {only_q}")
print(f"Only RoBERTa correct: {only_r}")
print(f"Both wrong: {both_wrong}")
print(f"Discordant pairs: {only_q + only_r}")

# ─── MCNEMAR CONTINGENCY TABLE ──────────
# Structure: [[both_correct, only_model1_correct],
#             [only_model2_correct, both_wrong]]
table = np.array([[both_correct, only_q],
                  [only_r, both_wrong]])

print("\nMcNemar Contingency Table:")
print("                RoBERTa")
print("              Correct  Wrong")
print(f"QLora Correct    {both_correct:4d}   {only_q:4d}")
print(f"      Wrong      {only_r:4d}   {both_wrong:4d}")

# ─── RUN MCNEMAR TEST ───────────────────
n_discordant = only_q + only_r

# Use exact test for small discordant pairs, otherwise continuity correction
if n_discordant < 25:
    print(f"\nUsing exact test (discordant pairs = {n_discordant} < 25)")
    test = mcnemar(table, exact=True)
else:
    print(f"\nUsing chi-square test with continuity correction (discordant pairs = {n_discordant})")
    test = mcnemar(table, exact=False, correction=True)

# ─── COMPUTE METRICS ────────────────────
acc_q = correct_q.mean()
acc_r = correct_r.mean()
acc_diff = acc_q - acc_r

# Statistical significance
alpha = 0.05
is_significant = test.pvalue < alpha

if is_significant:
    if acc_diff > 0:
        winner = "QLORA (significant)"
    else:
        winner = "ROBERTA (significant)"
else:
    winner = f"No significant difference (p={test.pvalue:.3f})"

print(f"\nResults:")
print(f"QLora Accuracy: {acc_q:.4f}")
print(f"RoBERTa Accuracy: {acc_r:.4f}")
print(f"Difference: {acc_diff:+.4f}")
print(f"Chi-square statistic: {test.statistic:.3f}")
print(f"P-value: {test.pvalue:.3f}")
print(f"Significant at α=0.05: {is_significant}")
print(f"Winner: {winner}")

# ─── STRUCTURE DETAILED RESULTS ─────────
detailed = pd.DataFrame([{
    "Model_1": "QLORA",
    "Model_2": "ROBERTA",
    "Accuracy_QLORA": round(acc_q, 4),
    "Accuracy_ROBERTA": round(acc_r, 4),
    "Accuracy_Diff": round(acc_diff, 4),
    "Both_Correct": int(both_correct),
    "Only_QLORA_Correct": int(only_q),
    "Only_ROBERTA_Correct": int(only_r),
    "Both_Wrong": int(both_wrong),
    "Total_Samples": int(total_samples),
    "Discordant_Pairs": int(n_discordant),
    "Chi_Square": round(test.statistic, 3),
    "P_Value": round(test.pvalue, 3),
    "Significant": is_significant,
    "Winner": winner,
    "Test_Type": "Exact" if n_discordant < 25 else "Chi-square"
}])

# ─── OUTPUTS ────────────────────────────
csv_detailed = os.path.join(OUT_DIR, "qlora_vs_roberta_detailed.csv")
xlsx_detailed = os.path.join(OUT_DIR, "qlora_vs_roberta_detailed.xlsx")

detailed.to_csv(csv_detailed, index=False)
detailed.to_excel(xlsx_detailed, index=False)

# Save contingency table
contingency_df = pd.DataFrame(table, 
                            index=['QLora_Correct', 'QLora_Wrong'],
                            columns=['RoBERTa_Correct', 'RoBERTa_Wrong'])
contingency_df.to_csv(os.path.join(OUT_DIR, "mcnemar_contingency_table.csv"))

print(f"\n✅ McNemar test complete. Results saved to {OUT_DIR}/")
print(f"Key insight: {winner}")

Total aligned samples: 3000
Both correct: 2581
Only QLora correct: 102
Only RoBERTa correct: 104
Both wrong: 213
Discordant pairs: 206

McNemar Contingency Table:
                RoBERTa
              Correct  Wrong
QLora Correct    2581    102
      Wrong       104    213

Using chi-square test with continuity correction (discordant pairs = 206)

Results:
QLora Accuracy: 0.8943
RoBERTa Accuracy: 0.8950
Difference: -0.0007
Chi-square statistic: 0.005
P-value: 0.944
Significant at α=0.05: False
Winner: No significant difference (p=0.944)

✅ McNemar test complete. Results saved to mcnemar_qlora_vs_roberta_with_claude/
Key insight: No significant difference (p=0.944)
