In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# --------------------------------------------
# Step 1: Load timing data from .jsonl files
# --------------------------------------------

def load_jsonl_timings(label, path):
    """Load timing records from ``<path>/timings.jsonl``.

    Every non-blank line of the file is parsed as a JSON object that must
    provide ``start``, ``end`` and ``duration_ms``. A line that is not valid
    JSON, lacks one of those keys, or has a ``duration_ms`` that cannot be
    converted to ``int`` is reported on stdout and skipped, so a single bad
    record does not discard the rest of the file.

    Args:
        label: Value stored under the ``"type"`` key of every returned row.
        path: Directory expected to contain ``timings.jsonl``.

    Returns:
        A list of dicts with the keys ``type``, ``start``, ``end`` and
        ``duration_ms`` (coerced to ``int``). The list is empty when the
        file does not exist; a warning is printed in that case.
    """
    file_path = os.path.join(path, "timings.jsonl")
    rows = []
    if not os.path.exists(file_path):
        print(f"⚠️ No file at: {file_path}")
        return rows

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records and
            # should not be reported as parse failures.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                row = {
                    "type": label,
                    "start": data["start"],
                    "end": data["end"],
                    "duration_ms": int(data["duration_ms"]),
                }
            except (KeyError, TypeError, ValueError, OverflowError) as e:
                # ValueError covers json.JSONDecodeError and bad int() input;
                # TypeError covers non-object JSON values and null durations;
                # OverflowError covers int() of an infinite float.
                print(f"Failed to parse line: {e}")
            else:
                rows.append(row)
    return rows

# Load the three timing sources in a fixed order (proof generation, proof
# verification, latency); each directory holds its own timings.jsonl.
proof_rows, verify_rows, latency_rows = (
    load_jsonl_timings(tag, folder)
    for tag, folder in (
        ("proof_gen", "json_files/1"),
        ("proof_verify", "json_files/2"),
        ("latency", "json_files/3"),
    )
)

# Stack every row into one frame; the "type" column tells the sources apart.
df = pd.DataFrame([*proof_rows, *verify_rows, *latency_rows])

# --------------------------------------------
# Step 2: Check and continue with analysis
# --------------------------------------------

if df.empty:
    print("⚠️ No timing data available. Skipping analysis.")
else:
    # All figures are written below plots/; the CSV files go to the
    # current working directory.
    os.makedirs("plots", exist_ok=True)

    # Save raw data to CSV (before any derived column is added)
    df.to_csv("zk_timings_summary.csv", index=False)
    print("✅ Saved timing summary to zk_timings_summary.csv")

    # Convert start to datetime
    # NOTE(review): assumes "start" is in a form pd.to_datetime interprets as
    # intended (e.g. ISO-8601 strings); bare integers would be read as epoch
    # nanoseconds — confirm against whatever writes timings.jsonl.
    df['start_time'] = pd.to_datetime(df['start'])

    # Group once and reuse for the summary table and the mean/std bar chart.
    durations_by_type = df.groupby("type")["duration_ms"]

    # --------------------------------------------
    # Summary statistics table
    # --------------------------------------------
    summary_table = durations_by_type.describe()
    print("\n📊 Summary Table:")
    print(summary_table)
    summary_table.to_csv("zk_timings_stats.csv")

    # --------------------------------------------
    # Histogram
    # --------------------------------------------
    sns.set(style="whitegrid")
    plt.figure(figsize=(12, 6))
    sns.histplot(data=df, x="duration_ms", hue="type", element="step", bins=30)
    plt.title("Timing Histogram by Type")
    plt.xlabel("Duration (ms)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig("plots/zk_timings_histogram.png")
    plt.show()

    # --------------------------------------------
    # Boxplot
    # --------------------------------------------
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x="type", y="duration_ms")
    plt.title("Timing Distribution by Type")
    plt.ylabel("Duration (ms)")
    plt.xlabel("Timing Type")
    plt.tight_layout()
    plt.savefig("plots/zk_timings_boxplot.png")
    plt.show()

    # --------------------------------------------
    # Violin plot
    # --------------------------------------------
    plt.figure(figsize=(8, 6))
    sns.violinplot(data=df, x="type", y="duration_ms", inner="box")
    plt.title("Timing Density by Type")
    plt.ylabel("Duration (ms)")
    plt.xlabel("Timing Type")
    plt.tight_layout()
    plt.savefig("plots/zk_timings_violinplot.png")
    plt.show()

    # --------------------------------------------
    # CDF plot
    # --------------------------------------------
    plt.figure(figsize=(10, 6))
    for label in df['type'].unique():
        subset = df.loc[df['type'] == label, 'duration_ms'].sort_values()
        n = len(subset)
        # Empirical CDF: the k-th smallest duration sits at probability k/n.
        cum_prob = [rank / n for rank in range(1, n + 1)]
        plt.plot(subset, cum_prob, label=label)

    plt.title("CDF of Timing Durations")
    plt.xlabel("Duration (ms)")
    plt.ylabel("Cumulative Probability")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("plots/zk_timings_cdf.png")
    # Show this figure like every other plot so it is displayed in sequence
    # instead of lingering as an open figure.
    plt.show()

    # --------------------------------------------
    # Line plot over time
    # --------------------------------------------
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=df, x="start_time", y="duration_ms", hue="type", marker="o")
    plt.title("Timing Trends Over Experiment Duration")
    plt.xlabel("Start Time")
    plt.ylabel("Duration (ms)")
    plt.tight_layout()
    plt.savefig("plots/zk_timings_trend.png")
    plt.show()

    # --------------------------------------------
    # Bar chart of means with error bars
    # --------------------------------------------
    means = durations_by_type.mean()
    # Sample standard deviation; NaN for a type with a single measurement.
    stds = durations_by_type.std()

    plt.figure(figsize=(8, 6))
    plt.bar(means.index, means.values, yerr=stds.values, capsize=5)
    plt.title("Average Duration with Error Bars")
    plt.ylabel("Duration (ms)")
    plt.xlabel("Timing Type")
    plt.tight_layout()
    plt.savefig("plots/zk_timings_means_with_error.png")
    plt.show()

    # --------------------------------------------
    # Outlier detection
    # --------------------------------------------
    print("\n📉 Outliers (> 2 std dev above mean):")
    found_outliers = False
    for label in df['type'].unique():
        subset = df[df['type'] == label]
        mu = subset["duration_ms"].mean()
        sigma = subset["duration_ms"].std()
        # A NaN sigma (single-sample type) makes the comparison False for
        # every row, so such a type never reports outliers.
        outliers = subset[subset["duration_ms"] > mu + 2 * sigma]
        if not outliers.empty:
            found_outliers = True
            print(f"\nOutliers in {label}:")
            print(outliers[["start", "duration_ms"]])
    if not found_outliers:
        # Make it explicit that the check ran and came back clean.
        print("None found.")

