# Comparison of Standardization Methods on GLUE Datasets

Evaluate and compare four methods side-by-side:
- **Keyword baseline** — rule-based synonym matching
- **Embedding baseline** — cosine similarity via `all-MiniLM-L6-v2`
- **Local LLM** — `Qwen/Qwen3-0.6B` running locally
- **API LLM** — `claude-opus-4.6` via OpenRouter

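Metrics: a **structural score** (Jaccard similarity between predicted field names and Unitxt ground-truth field names), an **annotation score** (value match for `*_type` / `type_of_*` / `classes`), and a **combined score** equal to the mean of the two.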

## 1. Imports & Setup

In [None]:
# Put the parent directory first on the import path so that the `src`
# package imported below can be resolved.
# NOTE(review): assumes the working directory is notebooks/ — confirm.
import sys, os
sys.path.insert(0, os.path.abspath(".."))  # notebooks/ → project root

import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import logging
# Only let records of level ERROR or above through the "unitxt" logger.
logging.getLogger("unitxt").setLevel(logging.ERROR)

# Project-local evaluation entry point and the four standardization methods
# compared in this notebook.
from src.eval import evaluate
from src.baselines import baseline_keyword_match, baseline_embedding_match
from src.standardize_api import load_standardized_dataset
from src.standardize_local import load_standardized_dataset_local

# Reached only when every import above succeeded.
print("Imports OK")

In [None]:
import os

# Either paste a key below, or leave the placeholder untouched and export
# OPENROUTER_API_KEY in the shell that launches Jupyter.
_PLACEHOLDER_KEY = "your_key_here"
_notebook_key = "your_key_here"  # replace before running

if _notebook_key != _PLACEHOLDER_KEY:
    # A key was pasted into the cell: it takes precedence, and re-running the
    # cell after editing it updates the environment.
    os.environ["OPENROUTER_API_KEY"] = _notebook_key
elif os.environ.get("OPENROUTER_API_KEY", _PLACEHOLDER_KEY) == _PLACEHOLDER_KEY:
    # No usable key anywhere: keep the placeholder so the variable exists,
    # but say so instead of failing silently later.
    os.environ["OPENROUTER_API_KEY"] = _PLACEHOLDER_KEY
    print("⚠ OPENROUTER_API_KEY is not set: replace the placeholder or export the variable.")
# Otherwise a real key is already exported; do not overwrite it with the
# placeholder.

## 2. Datasets & Methods

In [ ]:
# Every entry is a GLUE config; card_id and hf_config share the same name.
GLUE_DATASETS = [
    {"card_id": task, "hf_name": "glue", "hf_config": task}
    for task in ("sst2", "mrpc", "qnli", "mnli", "wnli")
]

# Method label → standardization callable handed to evaluate().
METHODS = dict(
    keyword=baseline_keyword_match,
    embedding=baseline_embedding_match,
    local_llm=load_standardized_dataset_local,
    api_llm=load_standardized_dataset,
)

n_datasets, n_methods = len(GLUE_DATASETS), len(METHODS)
print(f"{n_datasets} datasets  ×  {n_methods} methods  =  {n_datasets * n_methods} evaluations")

## 3. Run Evaluation

The cell below runs every method on every dataset in a single pass. A run that raises is recorded with empty scores instead of aborting the loop. Re-running the cell repeats the expensive LLM calls.

In [None]:
# One record per (dataset, method) pair:
# {dataset, method, score, struct_score, annot_score}
raw_results = []

for spec in GLUE_DATASETS:
    card_id = spec["card_id"]
    hf_name = spec["hf_name"]
    hf_config = spec["hf_config"]
    print(f"\n── {card_id} ──")

    for method_name, standardize_fn in METHODS.items():
        print(f"  {method_name}...", end=" ", flush=True)

        # Scores stay None unless the run succeeds and its summary line prints.
        record = {
            "dataset":      card_id,
            "method":       method_name,
            "score":        None,
            "struct_score": None,
            "annot_score":  None,
        }
        try:
            outcome = evaluate(
                hf_name=hf_name,
                hf_config=hf_config,
                card_id=card_id,
                standardize_fn=standardize_fn,
            )
            score = outcome["score"]
            struct_score = outcome["struct_score"]
            annot_score = outcome["annot_score"]
            print(f"✓  score={score:.3f}  struct={struct_score:.3f}  annot={annot_score:.3f}")
        except Exception as err:
            # Best-effort: report the failure and keep going with the next run.
            print(f"✗  {err}")
        else:
            record.update(
                score=score,
                struct_score=struct_score,
                annot_score=annot_score,
            )

        raw_results.append(record)

print("\n✅ Evaluation complete.")

## 4. Results Table

In [None]:
COL_ORDER = ["keyword", "embedding", "local_llm", "api_llm"]

df_results = pd.DataFrame(raw_results)


def make_pivot(score_col: str) -> pd.DataFrame:
    """Return a dataset × method table of ``score_col`` with an "Average" row.

    Columns follow ``COL_ORDER``, restricted to the methods that occur in
    ``df_results``. The per-column mean is appended as a final "Average" row
    and the whole table is rounded to three decimals.
    """
    table = df_results.pivot(index="dataset", columns="method", values=score_col)

    seen_methods = set(df_results["method"])
    ordered = [name for name in COL_ORDER if name in seen_methods]
    table = table.reindex(columns=ordered)

    table.loc["Average"] = table.mean()
    return table.round(3)


# (heading, score column) for each table, in display order.
SCORE_TABLES = [
    # Combined score: (struct + annot) / 2
    ("=== Combined score  (struct + annot) / 2 ===", "score"),
    # Structural score: Jaccard on field names
    ("\n=== Structural score  (Jaccard on field names) ===", "struct_score"),
    # Annotation score: value match for *_type, type_of_*, classes
    ("\n=== Annotation score  (value match for *_type / type_of_* / classes) ===", "annot_score"),
]

for heading, score_col in SCORE_TABLES:
    print(heading)
    display(make_pivot(score_col))

## 5. Visualization

In [None]:
import os

# Score column to plot. "score" is the combined (struct + annot) / 2 value.
# NOTE(review): the plot titles below still say "Jaccard"; switch this to
# "struct_score" if the pure field-name Jaccard is what should be shown.
PLOT_SCORE_COL = "score"

# make_pivot() returns the dataset × method table plus an "Average" row;
# split it into the per-dataset grid (heatmap) and the averages (bar chart).
pivot = make_pivot(PLOT_SCORE_COL)
pivot_data = pivot.drop(index="Average").astype(float)
avg = pivot.loc["Average"].astype(float)

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# ── Heatmap ──────────────────────────────────────────────
sns.heatmap(
    pivot_data,
    ax=axes[0],
    annot=True, fmt=".2f",
    cmap="YlGn", vmin=0, vmax=1,
    linewidths=0.5, linecolor="white",
)
axes[0].set_title("Jaccard score per dataset & method", fontsize=12)
axes[0].set_xlabel("")
axes[0].set_ylabel("")
axes[0].tick_params(axis="x", rotation=30)

# ── Average bar chart ─────────────────────────────────────
colors = sns.color_palette("Set2", len(avg))
bars = axes[1].bar(avg.index, avg.values, color=colors, edgecolor="black", width=0.5)
axes[1].set_title("Average Jaccard score per method", fontsize=12)
axes[1].set_ylim(0, 1.1)  # headroom above 1.0 for the value labels
axes[1].set_ylabel("Jaccard score")
axes[1].tick_params(axis="x", rotation=30)
for bar in bars:
    # Print each bar's value just above its top edge.
    axes[1].text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.02,
        f"{bar.get_height():.3f}",
        ha="center", va="bottom", fontsize=10,
    )

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs("../results", exist_ok=True)
plt.savefig("../results/comparison_plot.png", dpi=150, bbox_inches="tight")
plt.show()