# In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import matplotlib.ticker as ticker
import matplotlib as mpl
import numpy as np
import matplotlib.patches as mpatches

# --------------------------------------------
# Step 1: Load timing data from .jsonl files
# --------------------------------------------

def load_jsonl_timings(label, path, file):
    """Load timing records from a JSON Lines file.

    Every non-blank line is parsed as JSON, and its ``start_ms`` and
    ``duration_ms`` entries are converted with ``int``.

    Args:
        label: Value stored under the ``"type"`` key of each returned row.
        path: Directory that contains the file.
        file: Name of the file inside ``path``.

    Returns:
        A list of dicts with the keys ``"type"``, ``"start_ms"`` and
        ``"duration_ms"``. The list is empty when the file does not exist.
        Lines that cannot be converted are reported on stdout and skipped.
    """
    file_path = os.path.join(path, file)
    rows = []

    # Open first and react to failure instead of checking for existence
    # beforehand; the file could disappear between the check and the open.
    try:
        f = open(file_path, encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        print(f"No file at: {file_path}")
        return rows

    with f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                row = {
                    "type": label,
                    "start_ms": int(data["start_ms"]),
                    "duration_ms": int(data["duration_ms"]),
                }
            # json.JSONDecodeError is a ValueError; KeyError/TypeError cover
            # missing fields and non-object lines; OverflowError covers
            # int() applied to an infinite float.
            except (ValueError, KeyError, TypeError, OverflowError) as e:
                print(f"Failed to parse line: {e}")
            else:
                rows.append(row)
    return rows

# Global matplotlib style applied to every figure created below:
# LaTeX text rendering, a sans-serif (Arial) font, and explicit font sizes
# for titles, axis labels, tick labels and legends.
PLOT_RC_OVERRIDES = {
    # Text rendering and font selection
    "text.usetex": True,
    "font.family": "sans-serif",
    "font.sans-serif": "Arial",
    # Font sizes (points)
    "font.size": 9,
    "axes.titlesize": 10,
    "axes.labelsize": 9,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "legend.fontsize": 8,
    "figure.titlesize": 10,
}
mpl.rcParams.update(PLOT_RC_OVERRIDES)

# Color Accessibility Palette Tool
# https://gka.github.io/palettes/#/5|s|ffb000,fe6100,dc267f,785ef0,648fff|ffffe0,ff005e,93003a|0|1

# Shared list of hex colors used by the plots in this file.
custom_colors = [
    "#ffb000",
    "#f86c36",
    "#dc267f",
    "#9d62d6",
    "#648fff",
    "#00bfc4",
    "#90d200",
]

def _cycled_colors(count):
    """Return ``count`` colors from ``custom_colors``, wrapping around if needed."""
    return [custom_colors[i % len(custom_colors)] for i in range(count)]


def _save_current_figure(pdf_path):
    """Tighten the layout, write the current figure to ``pdf_path``, and close it."""
    plt.tight_layout()
    plt.savefig(pdf_path, bbox_inches="tight")
    plt.close()


def run_analysis(label_to_path_dict, output_prefix, data_file):
    """Summarize and plot the timing data of several labelled runs.

    For every ``label -> directory`` pair the timings in
    ``<directory>/<data_file>`` are loaded with ``load_jsonl_timings``. If no
    rows are found a message is printed and nothing is written. Otherwise the
    function writes:

    * ``experiments/<output_prefix>_timings_summary.csv`` -- the raw rows,
    * ``experiments/<output_prefix>_timings_stats.csv`` -- ``describe()`` of
      ``duration_ms`` per label,
    * ``experiments/<output_prefix>_plots/`` -- ``histogram.pdf``,
      ``boxplot.pdf``, ``violinplot.pdf``, ``cdf.pdf``, ``trend.pdf`` and
      ``means_with_error.pdf``,

    and prints the summary table plus, per label, the rows whose duration
    exceeds the label's mean by more than two standard deviations.

    Args:
        label_to_path_dict: Mapping of label to the directory holding that
            label's ``data_file``.
        output_prefix: Prefix used for all output file and directory names.
        data_file: File name looked up inside every directory.

    Returns:
        None.
    """
    rows = []
    for label, path in label_to_path_dict.items():
        rows.extend(load_jsonl_timings(label, path, data_file))

    df = pd.DataFrame(rows)
    if df.empty:
        print("⚠️ No timing data available. Skipping analysis.")
        return

    # Creating the plot directory also creates the parent "experiments"
    # directory that the CSV files are written to.
    base_dir = f"experiments/{output_prefix}"
    plot_dir = f"{base_dir}_plots"
    os.makedirs(plot_dir, exist_ok=True)

    # Save summary CSVs
    df.to_csv(f"{base_dir}_timings_summary.csv", index=False)
    print(f"✅ Saved timing summary to {base_dir}_timings_summary.csv")

    df["start_time"] = pd.to_datetime(df["start_ms"], unit="ms")
    summary_table = df.groupby("type")["duration_ms"].describe()
    print(f"\n📊 Summary Table ({output_prefix}):")
    print(summary_table)
    summary_table.to_csv(f"{base_dir}_timings_stats.csv")

    # One label order and one color per label, shared by every plot so the
    # same label always gets the same color. Colors wrap around when there
    # are more labels than entries in custom_colors.
    type_order = df["type"].drop_duplicates().tolist()
    n_labels = len(type_order)
    colors = _cycled_colors(n_labels)

    # Histogram
    plt.figure(figsize=(5.5, 4.5))
    sns.histplot(
        data=df,
        x="duration_ms",
        hue="type",
        hue_order=type_order,
        element="step",
        bins=30,
        common_norm=False,
        stat="count",
        palette=colors,
        legend=False,  # seaborn's automatic legend is replaced by the one below
    )

    # Manually build the legend from the same label/color pairing
    handles = [
        mpatches.Patch(color=color, label=label)
        for label, color in zip(type_order, colors)
    ]

    plt.title(r"\textbf{Distribution of Execution Times}")
    plt.xlabel("Duration (ms)")
    plt.ylabel("Count")

    # Place legend below the axes, one column per label
    plt.legend(
        handles=handles,
        loc="upper center",
        bbox_to_anchor=(0.5, -0.2),
        ncol=len(handles),
        frameon=False,
    )
    _save_current_figure(f"{plot_dir}/histogram.pdf")

    # Boxplot
    plt.figure(figsize=(5.5, 4))
    sns.boxplot(data=df, x="type", y="duration_ms", order=type_order, palette=colors)
    plt.title(r"\textbf{Duration Variation}")
    plt.ylabel("Duration (ms)")
    plt.xlabel("")
    plt.xticks(rotation=30)
    _save_current_figure(f"{plot_dir}/boxplot.pdf")

    # Violin plot
    plt.figure(figsize=(5.5, 3.5))
    sns.violinplot(
        data=df,
        x="type",
        y="duration_ms",
        order=type_order,
        inner="box",
        palette=colors,
    )
    plt.title(r"\textbf{Density and Spread}")
    plt.ylabel("Duration (ms)")
    plt.xlabel("")
    plt.xticks(rotation=30)
    _save_current_figure(f"{plot_dir}/violinplot.pdf")

    # CDF: sorted durations against their rank divided by the sample count
    plt.figure(figsize=(4.5, 3))
    for label, color in zip(type_order, colors):
        durations = df.loc[df["type"] == label, "duration_ms"].sort_values()
        cum_prob = np.arange(1, len(durations) + 1) / len(durations)
        plt.plot(durations, cum_prob, label=label, color=color)
    plt.title(r"\textbf{Cumulative Distribution}")
    plt.xlabel("Duration (ms)")
    plt.ylabel("Cumulative Probability")
    plt.legend(title=None, loc="upper center", bbox_to_anchor=(0.5, -0.2), ncol=n_labels)
    _save_current_figure(f"{plot_dir}/cdf.pdf")

    # Trend
    # Elapsed time is measured per label, relative to that label's earliest start.
    df["elapsed_sec"] = df.groupby("type")["start_time"].transform(
        lambda x: (x - x.min()).dt.total_seconds()
    )

    plt.figure(figsize=(5.5, 4))
    for label, color in zip(type_order, colors):
        # Sort by time so the line never doubles back when the records in the
        # file are not in chronological order.
        sub_df = df[df["type"] == label].sort_values("elapsed_sec", kind="stable")
        plt.plot(
            sub_df["elapsed_sec"],
            sub_df["duration_ms"],
            label=label,
            linewidth=1,
            color=color,
        )

    plt.legend(title=None, loc="upper center", bbox_to_anchor=(0.5, -0.2), ncol=n_labels)
    plt.title(r"\textbf{Execution Time Trends}")
    plt.xlabel("Elapsed Time (s)")
    plt.ylabel("Duration (ms)")
    _save_current_figure(f"{plot_dir}/trend.pdf")

    # Bar chart with error bars; bars follow the shared label order
    durations_by_type = df.groupby("type")["duration_ms"]
    means = durations_by_type.mean().reindex(type_order)
    stds = durations_by_type.std().reindex(type_order)

    plt.figure(figsize=(5.5, 3.5))
    plt.bar(type_order, means.values, yerr=stds.values, capsize=4, color=colors)
    plt.title(r"\textbf{Mean Duration with Std. Dev.}")
    plt.ylabel("Duration (ms)")
    plt.xlabel("")
    plt.xticks(rotation=30)
    _save_current_figure(f"{plot_dir}/means_with_error.pdf")

    # Outliers: only durations above mean + 2 * std are reported
    print(f"\n📉 Outliers in {output_prefix} (> 2 std dev):")
    for label in type_order:
        subset = df[df["type"] == label]
        mu = subset["duration_ms"].mean()
        sigma = subset["duration_ms"].std()
        outliers = subset[subset["duration_ms"] > mu + 2 * sigma]
        if not outliers.empty:
            print(f"\nOutliers in {label}:")
            print(outliers[["start_ms", "duration_ms"]])


# ---- Core zk-timing results ----
# Each mapping pairs a plot label with the directory holding its timing file.
core_paths = dict(
    proof_gen="json_files/1",
    proof_verify="json_files/2",
    latency="json_files/3",
)
run_analysis(core_paths, output_prefix="core", data_file="timings.jsonl")

# ---- Feature-based results ----
feature_paths = dict(
    author="json_files/author",
    rate_pseudo="json_files/rate_pseudo",
    pseudo_msg="json_files/pseudo_msg",
    pseudo_vote="json_files/pseudo_vote",
    badge="json_files/badge",
)
run_analysis(feature_paths, output_prefix="features", data_file="timings.jsonl")

# ---- Ban / reputation directories (used by the two analyses below) ----
ban_rep_paths = dict(
    ban="json_files/ban",
    rep="json_files/rep",
)

# ---- Feature timings across all features, ban and rep included ----
# Same entries, in the same order, as the feature directories followed by
# the ban/rep directories.
latency_feature_paths = {**feature_paths, **ban_rep_paths}
run_analysis(
    latency_feature_paths,
    output_prefix="latency_features",
    data_file="features_timings.jsonl",
)

# ---- Ban / reputation results only ----
run_analysis(
    ban_rep_paths,
    output_prefix="ban_rep",
    data_file="features_timings.jsonl",
)