In [None]:

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False


def find_eval_dir():
    """Locate the EA evaluation log directory by walking up from the CWD.

    Starting at the current working directory and continuing through each of
    its parents, ``backend/data/ea-logs/json`` is probed first and
    ``data/ea-logs/json`` second. The first probe that is an existing
    directory wins.

    Returns:
        Path: the first matching directory.

    Raises:
        FileNotFoundError: if none of the probed paths is a directory.
    """
    relative_layouts = ("backend/data/ea-logs/json", "data/ea-logs/json")
    start = Path.cwd()
    for ancestor in (start, *start.parents):
        for layout in relative_layouts:
            probe = ancestor / layout
            if probe.is_dir():
                return probe
    raise FileNotFoundError("Could not locate backend/data/ea-logs/json")


def mean_curve(histories):
    """Average several per-generation histories of possibly unequal length.

    Entries that are not a non-empty ``list``/``tuple`` are ignored. The
    remaining histories are right-padded with NaN to the longest length and
    averaged column-wise with ``np.nanmean``, so each generation's mean only
    covers the runs that actually reached that generation.

    Returns:
        An empty list when no usable history remains, otherwise a 1-D
        ``numpy.ndarray`` whose length equals the longest history.
    """
    usable = [
        series
        for series in histories
        if isinstance(series, (list, tuple)) and len(series) > 0
    ]
    if not usable:
        return []

    longest = max(map(len, usable))
    padded = np.full((len(usable), longest), np.nan)
    # Each ``row`` is a view into ``padded``, so slice assignment fills it.
    for row, series in zip(padded, usable):
        row[: len(series)] = series
    return np.nanmean(padded, axis=0)


def cliffs_delta(x, y):
    """Compute Cliff's delta effect size between two samples.

    Every element of ``x`` is compared with every element of ``y``; the
    result is ``(#(x_i > y_j) - #(x_i < y_j)) / (len(x) * len(y))``.

    Returns:
        ``np.nan`` when either sample is empty, otherwise the delta value.
    """
    xs = np.array(x)
    ys = np.array(y)
    n_pairs = len(xs) * len(ys)
    if n_pairs == 0:
        return np.nan

    # Count, for each x value, how many y values it beats / is beaten by.
    wins = sum(np.sum(value > ys) for value in xs)
    losses = sum(np.sum(value < ys) for value in xs)
    return (wins - losses) / n_pairs


# Resolve the log directory once; the analysis below reads every
# ``ea_run_*.json`` file found directly inside it.
EVAL_DIR = find_eval_dir()
print("Using EVAL_DIR:", EVAL_DIR)

# Build one flat row per log file that contains both an "ea_rl" and an
# "ea_only" run. Files that cannot be parsed or lack either variant are
# skipped with a message instead of aborting the whole load.
rows = []
for path in sorted(EVAL_DIR.glob("ea_run_*.json")):
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"Skipping {path.name}: unreadable JSON ({exc})")
        continue

    # Index runs by variant name; entries without a "variant" key are ignored
    # rather than raising KeyError. ``or []`` also tolerates ``"runs": null``.
    runs_by_variant = {
        r["variant"]: r
        for r in (data.get("runs") or [])
        if isinstance(r, dict) and "variant" in r
    }
    if not {"ea_rl", "ea_only"} <= runs_by_variant.keys():
        print(f"Skipping {path.name}: missing expected variants")
        continue

    # A missing or null "config" yields empty gens/population cells instead
    # of a KeyError, matching the ``.get`` style used for every other field.
    cfg = data.get("config") or {}
    rl_run = runs_by_variant["ea_rl"]
    ea_run = runs_by_variant["ea_only"]

    # ``or {}`` covers both a missing key and an explicit null value.
    rl_constraints = rl_run.get("best_constraints") or {}
    ea_constraints = ea_run.get("best_constraints") or {}

    rows.append({
        # Experiment-level metadata shared by both variants.
        "file": path.name,
        "run_id": data.get("run_id"),
        "floor_id": data.get("floor_id"),
        "grid_size": data.get("grid_size"),
        "rotate_k": data.get("rotate_k"),
        "gens": cfg.get("generations"),
        "population": cfg.get("population_size"),

        # EA+RL variant.
        "rl_best": rl_run.get("best_fitness_final"),
        "rl_best_initial": rl_run.get("best_fitness_initial"),
        "rl_gen_at_best": rl_run.get("gen_at_best"),
        "rl_duration_s": rl_run.get("duration_s"),
        "rl_is_real": rl_run.get("is_real"),
        "rl_realism_score": rl_run.get("realism_score"),
        "rl_history": rl_run.get("history", []),
        "rl_bandit_arm": rl_run.get("bandit_arm_index"),
        "rl_bandit_reward": rl_run.get("bandit_reward"),
        "rl_seeder": rl_run.get("seeder_name"),
        "rl_mask": rl_constraints.get("mask"),
        "rl_compactness": rl_constraints.get("compactness"),
        "rl_holes": rl_constraints.get("holes"),
        "rl_area": rl_constraints.get("area"),
        "rl_budget": rl_constraints.get("budget"),

        # EA-only variant (same fields, "ea_" prefix).
        "ea_best": ea_run.get("best_fitness_final"),
        "ea_best_initial": ea_run.get("best_fitness_initial"),
        "ea_gen_at_best": ea_run.get("gen_at_best"),
        "ea_duration_s": ea_run.get("duration_s"),
        "ea_is_real": ea_run.get("is_real"),
        "ea_realism_score": ea_run.get("realism_score"),
        "ea_history": ea_run.get("history", []),
        "ea_bandit_arm": ea_run.get("bandit_arm_index"),
        "ea_bandit_reward": ea_run.get("bandit_reward"),
        "ea_seeder": ea_run.get("seeder_name"),
        "ea_mask": ea_constraints.get("mask"),
        "ea_compactness": ea_constraints.get("compactness"),
        "ea_holes": ea_constraints.get("holes"),
        "ea_area": ea_constraints.get("area"),
        "ea_budget": ea_constraints.get("budget"),
    })

# One row per paired experiment; empty when no usable log file was found.
df = pd.DataFrame(rows)

if df.empty:
    print(f"No runs found in {EVAL_DIR}")
else:
    display(df.head())

    # ------------------------------------------------------------------
    # Derived per-run metrics. Fitness is minimised (see the box-plot axis
    # label below), so a positive ``ea_best - rl_best`` means EA+RL won.
    # ------------------------------------------------------------------
    fitness_gain = df["ea_best"] - df["rl_best"]
    df["improvement_abs"] = fitness_gain
    # Relative gain is undefined when the EA-only baseline is exactly 0; mask
    # those rows so +/-inf cannot poison the mean/median printed below.
    df["improvement_rel"] = (fitness_gain / df["ea_best"]).where(df["ea_best"] != 0)
    df["rl_better"] = df["rl_best"] < df["ea_best"]
    df["time_speedup_s"] = df["ea_duration_s"] - df["rl_duration_s"]

    df["rl_efficiency"] = df["rl_best"] / df["rl_duration_s"]
    df["ea_efficiency"] = df["ea_best"] / df["ea_duration_s"]
    # Fitness gained per extra second spent by EA+RL; undefined (NaN) when
    # both variants took exactly the same time.
    duration_delta = df["rl_duration_s"] - df["ea_duration_s"]
    df["improvement_cost"] = (fitness_gain / duration_delta).where(duration_delta != 0)

    # Average fitness drop per generation up to the best generation; NaN when
    # the best individual was already present at generation 0.
    df["rl_convergence_rate"] = (
        (df["rl_best_initial"] - df["rl_best"]) / df["rl_gen_at_best"]
    ).where(df["rl_gen_at_best"] != 0)
    df["ea_convergence_rate"] = (
        (df["ea_best_initial"] - df["ea_best"]) / df["ea_gen_at_best"]
    ).where(df["ea_gen_at_best"] != 0)

    # ------------------------------------------------------------------
    # Headline numbers.
    # ------------------------------------------------------------------
    n_runs = len(df)
    print("Total paired runs:", n_runs)
    print("RL better in:", df["rl_better"].sum(), "runs")
    print("RL better (%):", 100 * df["rl_better"].mean())

    print("Mean best fitness:")
    print("  RL:", df["rl_best"].mean())
    print("  EA:", df["ea_best"].mean())

    print("Median best fitness:")
    print("  RL:", df["rl_best"].median())
    print("  EA:", df["ea_best"].median())

    print("Mean relative improvement (%):", 100 * df["improvement_rel"].mean())
    print("Median relative improvement (%):", 100 * df["improvement_rel"].median())

    print("Mean gen_at_best:")
    print("  RL:", df["rl_gen_at_best"].mean())
    print("  EA:", df["ea_gen_at_best"].mean())

    print("Mean duration (s):")
    print("  RL:", df["rl_duration_s"].mean())
    print("  EA:", df["ea_duration_s"].mean())

    # ------------------------------------------------------------------
    # Hist / box / scatter of best fitness. NaNs are dropped before plotting
    # because ``plt.hist`` cannot derive bin edges from non-finite data.
    # ------------------------------------------------------------------
    plt.figure(figsize=(6, 4))
    plt.hist(df["improvement_abs"].dropna(), bins=40)
    plt.axvline(0, linestyle="--")
    plt.xlabel("ea_best - rl_best (positive = RL better)")
    plt.ylabel("Count")
    plt.title("Distribution of RL advantage over pure EA")
    plt.show()

    plt.figure(figsize=(4, 4))
    plt.boxplot(
        [df["rl_best"].dropna(), df["ea_best"].dropna()],
        tick_labels=["EA+RL", "EA only"],
    )
    plt.ylabel("Best fitness (lower is better)")
    plt.title("Best fitness across paired runs")
    plt.show()

    plt.figure(figsize=(5, 5))
    plt.scatter(df["ea_best"], df["rl_best"], alpha=0.5)
    # Dashed y = x reference: points below it are runs where EA+RL was better.
    ea_min, ea_max = df["ea_best"].min(), df["ea_best"].max()
    plt.plot([ea_min, ea_max], [ea_min, ea_max], linestyle="--")
    plt.xlabel("EA only best fitness")
    plt.ylabel("EA+RL best fitness")
    plt.title("Per-run comparison of best fitness")
    plt.show()

    # ------------------------------------------------------------------
    # Per-floor aggregates and mean convergence curves.
    # ------------------------------------------------------------------
    print("Per-floor performance summary:")
    floor_summary = (
        df.groupby("floor_id")
        .agg(
            rl_best_mean=("rl_best", "mean"),
            ea_best_mean=("ea_best", "mean"),
            rl_win_rate=("rl_better", "mean"),
            avg_improvement=("improvement_abs", "mean"),
            avg_time_delta_s=("time_speedup_s", "mean"),
            runs=("file", "count"),
        )
        .sort_index()
    )
    display(floor_summary)

    for floor_id, sub in df.groupby("floor_id"):
        rl_curve = mean_curve(sub["rl_history"].tolist())
        ea_curve = mean_curve(sub["ea_history"].tolist())
        # Nothing to draw when neither variant recorded a history.
        if len(rl_curve) == 0 and len(ea_curve) == 0:
            continue
        plt.figure(figsize=(6, 4))
        if len(rl_curve):
            plt.plot(rl_curve, label="EA+RL mean best")
        if len(ea_curve):
            plt.plot(ea_curve, label="EA only mean best")
        plt.xlabel("Generation")
        plt.ylabel("Best fitness")
        # Em dash written as an escape so it survives any re-encoding of
        # this file (the previous literal had been corrupted into mojibake).
        plt.title(f"Mean convergence curve \u2014 floor {floor_id}")
        plt.legend()
        plt.show()

    # ------------------------------------------------------------------
    # Time / quality trade-off.
    # ------------------------------------------------------------------
    plt.figure(figsize=(6, 4))
    plt.scatter(df["time_speedup_s"], df["improvement_abs"], alpha=0.5)
    plt.axvline(0, linestyle="--", color="gray")
    plt.axhline(0, linestyle="--", color="gray")
    plt.xlabel("EA duration - RL duration (s)")
    plt.ylabel("EA best - RL best (fitness)")
    plt.title("Tradeoff: time saved vs fitness gain (positive y = RL better)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.hist(df["improvement_cost"].dropna(), bins=40)
    plt.xlabel("(EA best - RL best) / (RL time - EA time)")
    plt.ylabel("Count")
    plt.title("Cost per unit improvement (lower is better for RL)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.boxplot(
        [df["rl_convergence_rate"].dropna(), df["ea_convergence_rate"].dropna()],
        tick_labels=["EA+RL", "EA only"],
    )
    plt.ylabel("(initial - best) / gen_at_best")
    plt.title("Convergence rate comparison")
    plt.show()

    # ------------------------------------------------------------------
    # Where EA+RL lost and where it won big.
    # ------------------------------------------------------------------
    # Rows where EA+RL was not strictly better (ties are included here).
    rl_losses = df[~df["rl_better"]]
    if not rl_losses.empty:
        print("RL losses (EA only better):", len(rl_losses))
        print("Top floors where RL lost:")
        print(rl_losses["floor_id"].value_counts().head())
        if "rl_mask" in rl_losses:
            print("Mean mask penalty when RL lost:", rl_losses["rl_mask"].mean())
        # Most negative improvement first, i.e. the largest EA-only margins.
        big_ea_luck = rl_losses.sort_values("improvement_abs").head(5)[
            ["file", "floor_id", "improvement_abs", "ea_best", "rl_best"]
        ]
        print("Example EA-favored runs:")
        print(big_ea_luck)

    # Runs strictly above the 90th percentile of absolute improvement.
    rl_big_wins = df[df["improvement_abs"] > df["improvement_abs"].quantile(0.9)]
    if not rl_big_wins.empty:
        print("RL big wins (top 10% improvement):", len(rl_big_wins))
        print("Floors with big wins:")
        print(rl_big_wins["floor_id"].value_counts().head())
        if "rl_mask" in rl_big_wins:
            print("Mean mask penalty when RL big win:", rl_big_wins["rl_mask"].mean())

    # ------------------------------------------------------------------
    # Bandit arm breakdown (only runs that recorded an arm index).
    # ------------------------------------------------------------------
    rl_runs = df[df["rl_bandit_arm"].notna()]
    if not rl_runs.empty:
        arm_summary = (
            rl_runs.groupby("rl_bandit_arm")
            .agg(
                runs=("file", "count"),
                win_rate=("rl_better", "mean"),
                mean_reward=("rl_bandit_reward", "mean"),
                mean_improvement=("improvement_abs", "mean"),
            )
            .sort_index()
        )
        print("Bandit arm performance (EA+RL runs):")
        display(arm_summary)

        floor_arm = (
            rl_runs.groupby(["floor_id", "rl_bandit_arm"])
            .agg(win_rate=("rl_better", "mean"), runs=("file", "count"))
            .reset_index()
        )
        print("Arm success probability by floor (rows with at least 3 runs):")
        display(
            floor_arm[floor_arm["runs"] >= 3]
            .pivot(index="floor_id", columns="rl_bandit_arm", values="win_rate")
        )

    # ------------------------------------------------------------------
    # Paired significance tests and effect sizes.
    # ------------------------------------------------------------------
    # Keep only rows where BOTH variants have a fitness value, and feed that
    # same subset to every test; a NaN in either column would otherwise turn
    # the paired t-test result into NaN.
    paired = df[["ea_best", "rl_best"]].dropna()
    diffs = paired["ea_best"] - paired["rl_best"]
    if paired.empty:
        print("No complete paired results; skipping statistical tests.")
    elif not SCIPY_AVAILABLE:
        print("SciPy not available; skipping statistical tests.")
    else:
        t_stat, t_p = stats.ttest_rel(paired["ea_best"], paired["rl_best"])
        try:
            w_stat, w_p = stats.wilcoxon(paired["ea_best"], paired["rl_best"])
        except ValueError:
            # Raised by SciPy e.g. when every paired difference is zero.
            w_stat, w_p = (np.nan, np.nan)
        # Cohen's d for paired samples; undefined when the differences have
        # no spread (std is 0) or fewer than two pairs exist (std is NaN).
        diff_std = diffs.std(ddof=1)
        d_cohen = diffs.mean() / diff_std if diff_std > 0 else np.nan
        delta = cliffs_delta(paired["ea_best"], paired["rl_best"])
        print("Significance tests (EA best - RL best):")
        print(f"  Paired t-test: stat={t_stat:.4f}, p={t_p:.4e}")
        print(f"  Wilcoxon signed-rank: stat={w_stat}, p={w_p}")
        print(f"  Cohen's d: {d_cohen:.4f}")
        print(f"  Cliff's delta: {delta:.4f}")
