In [None]:
import os
import pandas as pd
import json
import seaborn as sns

# Path to ablation folder, resolved relative to the notebook's working directory.
# NOTE(review): "abliation" looks like a misspelling of "ablation"; it is kept
# verbatim because it has to match the directory name on disk -- confirm.
notebook_dir = os.path.abspath("")
folder_path = os.path.join(notebook_dir, "..", "results", "experiments", "abliation")

# One DataFrame per successfully parsed JSON file
df_list = []

# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# makes the row order of `df` -- and with it the category order of the plots
# further down -- reproducible across machines and runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            content = json.load(f)
        except json.JSONDecodeError as e:
            # Best effort: report the broken file and keep loading the others.
            print(f"Skipping {filename}: JSON decode error ({e})")
            continue

    # A file holding a single record (one dict) becomes a one-element list so
    # that every file is turned into rows the same way.
    if isinstance(content, dict):
        content = [content]

    # Tag every row with the file it came from (file name without ".json").
    temp_df = pd.DataFrame(content)
    temp_df["ablation_type"] = os.path.splitext(filename)[0]
    df_list.append(temp_df)

# pd.concat([]) would fail with an opaque "No objects to concatenate"; raise the
# same exception type with a message that points at the actual problem.
if not df_list:
    raise ValueError(f"No readable .json result files found in {folder_path!r}")

# Combine all into one DataFrame
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Show the combined results frame as this cell's output.
df

In [None]:
# Summarize the combined frame with DataFrame.info().
df.info()

In [None]:
# Nine colors requested from seaborn's "pastel" palette. The plotting cells
# index into this list: index 7 for the boxplot fill, indices 0-2 for the
# stacked-bar segments.
my_palette = sns.color_palette("pastel", 9)

# Draw the palette as a row of swatches for a quick visual check.
sns.palplot(my_palette)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Boxplot of the direct FactScore per ablation, with the per-group mean (dashed)
# and median (solid) drawn on top and annotated with their values.
score_col = "fact_score.direct_fact_score"

# Fix the category order explicitly (order of first appearance in `df`) instead
# of reading it back from the tick labels after plotting: tick-label text can be
# empty until the figure is drawn, which would silently drop every overlay.
ablation_order = list(df["ablation_type"].dropna().unique())
pos_map = {label: pos for pos, label in enumerate(ablation_order)}

fig, ax = plt.subplots(figsize=(7, 3.5))

# Draw boxplot
sns.boxplot(
    data=df,
    x="ablation_type",
    y=score_col,
    order=ablation_order,
    color=my_palette[7],
    showfliers=False,
    linewidth=1.5,
    ax=ax,
)

# Compute stats in one pass, aligned to the plotted category order
score_stats = (
    df.groupby("ablation_type")[score_col]
    .agg(["mean", "median"])
    .reindex(ablation_order)
)
means = score_stats["mean"]
medians = score_stats["median"]

# (series, linestyle, linewidth) for each overlaid statistic; the same styles
# are reused for the legend handles below.
overlay_styles = (
    (means, "--", 2),
    (medians, "-", 1.5),
)

# Overlay mean/median lines + annotations
for ablation, pos in pos_map.items():
    for stat_series, linestyle, linewidth in overlay_styles:
        stat_val = stat_series.loc[ablation]
        # A group without any valid score yields NaN: nothing sensible to draw.
        if not np.isfinite(stat_val):
            continue

        # 0.4 on each side spans the default box width of 0.8.
        ax.hlines(
            stat_val,
            pos - 0.4,
            pos + 0.4,
            colors="black",
            linestyles=linestyle,
            linewidth=linewidth,
        )
        ax.text(
            pos,
            stat_val + 0.015,
            f"{stat_val:.2f}",
            color="black",
            va="bottom",
            fontsize=9,
            ha="center",
        )

# Final styling
ax.set_xlabel("")
ax.set_ylabel("Direct FactScore")
ax.set_ylim(0, 1)

# Top-centered legend without frame
ax.legend(
    handles=[
        plt.Line2D([0], [0], color="black", linestyle="--", linewidth=2, label="Mean"),
        plt.Line2D(
            [0], [0], color="black", linestyle="-", linewidth=1.5, label="Median"
        ),
    ],
    loc="lower center",
    bbox_to_anchor=(0.5, 1.05),
    fontsize=9,
    frameon=False,
    ncol=2,
)

# Add margin for legend
fig.subplots_adjust(top=0.82, bottom=0.25)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 100%-stacked bar chart of agent-judge outcomes per ablation.

# Desired outcome stacking order (bottom to top)
desired_order = ["Not Helpful", "Partially Helpful", "Fully Helpful"]

# Group and count by ablation_type instead of experiment_name.
# reindex() rather than plain column selection: an outcome that never occurs in
# the data has no column after unstack(), and `stacked_data[desired_order]`
# would raise KeyError. A missing outcome is simply a zero count.
stacked_data = (
    df.groupby(["ablation_type", "agent_judge_outcome"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=desired_order, fill_value=0)
)

# Convert to percentages row-wise. A row with none of the three outcomes sums
# to zero and would become NaN; show it as empty (0%) instead.
stacked_percent = (
    stacked_data.div(stacked_data.sum(axis=1), axis=0).mul(100).fillna(0)
)

# Plot
fig, ax = plt.subplots(figsize=(7, 4))

# Bars are placed at integer positions so that the bar positions, the text
# annotations and the tick positions all use the same numeric coordinates.
x_positions = list(range(len(stacked_percent)))
bottom = [0.0] * len(stacked_percent)

for idx, outcome in enumerate(desired_order):
    values = stacked_percent[outcome].values
    ax.bar(
        x_positions,
        values,
        bottom=bottom,
        label=outcome,
        color=my_palette[idx],
        edgecolor="gray",
    )

    # Annotate with percent labels (skip segments below 5%: too small to fit text)
    for x_pos, val, btm in zip(x_positions, values, bottom):
        if val >= 5:
            ax.text(
                x_pos,
                btm + val / 2,
                f"{val:.0f}%",
                ha="center",
                va="center",
                fontsize=8,
                color="black",
            )

    # The next outcome is stacked on top of everything drawn so far.
    bottom = [btm + val for btm, val in zip(bottom, values)]

# X-axis labels
ax.set_xticks(x_positions)
ax.set_xticklabels(stacked_percent.index, rotation=45, ha="right")

# Legend above
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, color=my_palette[i], label=label)
    for i, label in enumerate(desired_order)
]
ax.legend(
    handles=legend_handles,
    loc="lower center",
    bbox_to_anchor=(0.5, 1.02),
    fontsize=9,
    frameon=False,
    ncol=len(legend_handles),
)

# Adjust layout: room for the legend on top and the rotated labels below
fig.subplots_adjust(top=0.75, bottom=0.3)
ax.set_ylabel("Percentage")
ax.set_ylim(0, 100)
plt.show()

In [None]:
import pandas as pd

# Per-ablation summary table: score, judge-outcome shares, input tokens, run time.
by_ablation = df.groupby("ablation_type")

# ---- 1. Direct FactScore ----
direct_score_stats = (
    by_ablation["fact_score.direct_fact_score"].agg(["mean", "std"]).round(3)
)
direct_score_stats.columns = ["DirectScore_Mean", "DirectScore_Std"]

# ---- 2. Agent Judge Outcome Distribution ----
# The denominator is every row of the ablation (by_ablation.size()), while the
# numerator only counts rows with a non-missing outcome (groupby drops NaN keys
# by default), so a row of shares can sum to less than 100.
# Scale to percent first and round last: rounding the fraction and multiplying
# afterwards reintroduces float noise (0.333 * 100 == 33.300000000000004).
outcome_dist = (
    df.groupby(["ablation_type", "agent_judge_outcome"])
    .size()
    .unstack(fill_value=0)
    .div(by_ablation.size(), axis=0)
    .mul(100)
    .round(1)
)
# str() keeps the renaming working if an outcome label is not a string.
outcome_dist.columns = [
    f"Judge_{str(col).replace(' ', '')}_%" for col in outcome_dist.columns
]

# ---- 3. Input Tokens ----
token_stats = (
    by_ablation["tokens_consumed.input_tokens"].agg(["mean", "std"]).round(1)
)
token_stats.columns = ["InputTokens_Mean", "InputTokens_Std"]

# ---- 4. Execution Time ----
# NOTE(review): "excecution_time_seconds" looks misspelled, but it is the column
# key read from the data and must match it exactly -- confirm before renaming.
time_stats = by_ablation["excecution_time_seconds"].agg(["mean", "std"]).round(2)
time_stats.columns = ["Time_Mean_s", "Time_Std_s"]

# ---- 5. Combine all ----
# All four pieces are indexed by ablation_type, so a column-wise concat aligns them.
summary_table = pd.concat(
    [direct_score_stats, outcome_dist, token_stats, time_stats], axis=1
)

# Optional: sort by DirectScore_Mean descending
summary_table = summary_table.sort_values("DirectScore_Mean", ascending=False)

# Display
summary_table