# In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# CSV with the raw tool-window events; passed to load_and_clean() below.
DATA_PATH = "toolwindow_data.csv"
# Directory that receives the episode CSVs and the PDF report.
OUT_DIR = "."

def load_and_clean(path):
    """Read the raw event CSV and keep only the rows usable for pairing.

    Processing order:
      1. Column names are stripped and lower-cased.
      2. ``event`` is normalised ("opened" -> "open", "closed" -> "close",
         anything else becomes missing) and renamed to ``event_id``.
      3. Rows whose ``timestamp`` is not numeric are dropped; the remaining
         timestamps are stored as int64.
      4. ``open_type`` is stripped/lower-cased and blanked (None) unless it
         is "manual" or "auto".
      5. Rows are sorted by ``user_id`` then ``timestamp`` and re-indexed.

    Args:
        path: Anything ``pandas.read_csv`` accepts. The file must provide
            ``user_id``, ``event``, ``timestamp`` and ``open_type`` columns
            (case and surrounding whitespace in the header are ignored).

    Returns:
        A copy containing every "close" row plus the "open" rows that carry
        a valid ``open_type``. Index labels are those of the sorted frame,
        so they may have gaps where rows were filtered out.
    """
    allowed_types = ["manual", "auto"]
    event_names = {"opened": "open", "closed": "close"}

    events = pd.read_csv(path)
    events.columns = [name.strip().lower() for name in events.columns]

    normalised_event = events["event"].str.strip().str.lower()
    events["event"] = normalised_event.map(event_names)
    events = events.rename(columns={"event": "event_id"})

    # Coerce first so unparsable timestamps become missing and can be dropped.
    numeric_ts = pd.to_numeric(events["timestamp"], errors="coerce")
    events["timestamp"] = numeric_ts.astype("Int64")
    events = events.dropna(subset=["timestamp"]).copy()
    events["timestamp"] = events["timestamp"].astype("int64")

    open_type_text = events["open_type"].astype(str)
    events["open_type"] = open_type_text.str.strip().str.lower()
    unknown_type = ~events["open_type"].isin(allowed_types)
    events.loc[unknown_type, "open_type"] = None

    events = events.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

    closes = events["event_id"].eq("close")
    typed_opens = events["event_id"].eq("open") & events["open_type"].isin(allowed_types)
    return events.loc[closes | typed_opens].copy()

def build_episodes(df_clean, mode="implicit"):
    """Pair OPEN/CLOSE events per user into completed episodes.

    Events are consumed in the row order of ``df_clean`` inside each
    ``user_id`` group, so the frame must already be ordered by time per user.

    Args:
        df_clean: DataFrame with ``user_id``, ``event_id``, ``timestamp`` and
            ``open_type`` columns. Any ``event_id`` other than "open" is
            treated as a close.
        mode: How to handle an OPEN that arrives while an episode is
            already open:
            * "implicit" - the new OPEN closes the running episode at its
              own timestamp (``implicit_closed=True``) and starts a new one;
            * "skip" - the new OPEN is ignored and only counted.

    Returns:
        ``(complete, meta)`` where

        * ``complete`` has the columns ``user_id``, ``open_ts``,
          ``close_ts``, ``open_type``, ``implicit_closed`` and
          ``duration_s`` (``(close_ts - open_ts) / 1000``, i.e. timestamps
          are treated as milliseconds). Only episodes with a strictly
          positive duration are kept; index labels are not reset.
        * ``meta`` counts ``orphan_closes`` (CLOSE with nothing open),
          ``implicit_closes``, ``skipped_double_opens`` and
          ``right_censored`` (an OPEN still pending at the end of a user's
          events).

    Raises:
        ValueError: If ``mode`` is neither "implicit" nor "skip".
    """
    if mode not in ("implicit", "skip"):
        raise ValueError(f"unknown mode {mode!r}; expected 'implicit' or 'skip'")

    columns = ["user_id", "open_ts", "close_ts", "open_type", "implicit_closed"]
    episodes = []
    meta = {
        "orphan_closes": 0,
        "implicit_closes": 0,
        "skipped_double_opens": 0,
        "right_censored": 0,
    }

    for uid, group in df_clean.groupby("user_id", sort=False):
        pending = None  # (timestamp, open_type) of the OPEN awaiting its CLOSE
        # Column-wise zip avoids building a Series per row as iterrows() does.
        for event, ts, open_type in zip(
            group["event_id"], group["timestamp"], group["open_type"]
        ):
            if event == "open":
                if pending is None:
                    pending = (ts, open_type)
                elif mode == "implicit":
                    episodes.append((uid, pending[0], ts, pending[1], True))
                    meta["implicit_closes"] += 1
                    pending = (ts, open_type)
                else:
                    meta["skipped_double_opens"] += 1
            elif pending is None:
                meta["orphan_closes"] += 1
            else:
                episodes.append((uid, pending[0], ts, pending[1], False))
                pending = None

        if pending is not None:
            # Right-censored episodes have no close time and therefore no
            # duration. They are only counted: storing them with a NaN close
            # would turn close_ts into float and leave a partly-missing
            # "censored" flag column that needs fillna() (FutureWarning).
            meta["right_censored"] += 1

    if not episodes:
        return pd.DataFrame(columns=columns + ["duration_s"]), meta

    complete = pd.DataFrame(episodes, columns=columns)
    complete["duration_s"] = (complete["close_ts"] - complete["open_ts"]) / 1000.0
    complete = complete.loc[complete["duration_s"] > 0].copy()
    return complete, meta

def stats_by_type(complete):
    """Summarise episode durations separately for "manual" and "auto" opens.

    Args:
        complete: DataFrame with ``open_type`` and ``duration_s`` columns.

    Returns:
        DataFrame with one row per open type (manual first, then auto) and
        the columns ``open_type``, ``n``, ``mean_s``, ``median_s``,
        ``p75_s``, ``p95_s``, ``p99_s`` and ``max_s``. A type without any
        episode gets ``n == 0`` and NaN for every statistic.
    """
    percentile_columns = (("p75_s", 75), ("p95_s", 95), ("p99_s", 99))
    summary_rows = []
    for kind in ("manual", "auto"):
        of_kind = complete["open_type"] == kind
        durations = complete.loc[of_kind, "duration_s"].to_numpy()
        has_data = len(durations) != 0

        row = {"open_type": kind, "n": int(len(durations))}
        row["mean_s"] = float(durations.mean()) if has_data else np.nan
        row["median_s"] = float(np.median(durations)) if has_data else np.nan
        for column, q in percentile_columns:
            row[column] = float(np.percentile(durations, q)) if has_data else np.nan
        row["max_s"] = float(durations.max()) if has_data else np.nan
        summary_rows.append(row)
    return pd.DataFrame(summary_rows)

def winsorize(arr, p_lo=1, p_hi=99):
    """Clip ``arr`` to its own [p_lo, p_hi] percentile range.

    Args:
        arr: Sequence of numbers; returned unchanged when it is empty.
        p_lo: Lower percentile (0-100) used as the floor.
        p_hi: Upper percentile (0-100) used as the ceiling.

    Returns:
        Array of the same length with values below the ``p_lo`` percentile
        raised to it and values above the ``p_hi`` percentile lowered to it.
    """
    if len(arr) != 0:
        floor, ceiling = np.percentile(arr, [p_lo, p_hi])
        return np.clip(arr, floor, ceiling)
    return arr

def winsor_stats(complete, p_lo=1, p_hi=99):
    """Compute the winsorized mean duration for "manual" and "auto" opens.

    Args:
        complete: DataFrame with ``open_type`` and ``duration_s`` columns.
        p_lo: Lower percentile forwarded to ``winsorize``.
        p_hi: Upper percentile forwarded to ``winsorize``.

    Returns:
        DataFrame with ``open_type`` and ``winsor_mean_p1_p99_s`` columns,
        one row per open type (manual, then auto); NaN when a type has no
        episodes. The column name stays the same whatever ``p_lo``/``p_hi``
        are passed.
    """
    def _winsor_mean(kind):
        durations = complete.loc[complete["open_type"] == kind, "duration_s"].to_numpy()
        if len(durations) == 0:
            return np.nan
        clipped = winsorize(durations, p_lo=p_lo, p_hi=p_hi)
        return float(clipped.mean())

    return pd.DataFrame(
        [
            {"open_type": kind, "winsor_mean_p1_p99_s": _winsor_mean(kind)}
            for kind in ("manual", "auto")
        ]
    )

def permutation_mean_diff(manual, auto, seed=123, perms=3000):
    """Two-sided permutation test on the difference of means (manual - auto).

    The pooled sample is reshuffled ``perms`` times with a generator seeded
    by ``seed``; a permutation counts as extreme when the absolute mean
    difference of the reshuffled split is at least the observed one.

    Args:
        manual: NumPy array of durations for the first group.
        auto: NumPy array of durations for the second group.
        seed: Seed for ``numpy.random.default_rng``.
        perms: Number of random permutations.

    Returns:
        ``(p_value, observed_diff)`` with
        ``p_value = (extreme + 1) / (perms + 1)``, or ``(nan, nan)`` when
        either group is empty. The inputs are not modified.
    """
    if min(len(manual), len(auto)) == 0:
        return np.nan, np.nan

    observed = manual.mean() - auto.mean()
    threshold = abs(observed)
    split = len(manual)
    pool = np.concatenate([manual, auto])  # fresh array, safe to shuffle in place
    generator = np.random.default_rng(seed)

    hits = 0
    for _ in range(perms):
        generator.shuffle(pool)
        permuted = pool[:split].mean() - pool[split:].mean()
        hits += int(abs(permuted) >= threshold)

    return (hits + 1) / (perms + 1), observed

def ecdf(x):
    """Return the empirical CDF of ``x`` as ``(sorted_values, cumulative_share)``.

    The i-th sorted value (1-based) is paired with ``i / len(x)``.
    """
    ordered = np.sort(x)
    count = len(ordered)
    cumulative = np.arange(1, count + 1) / count
    return ordered, cumulative

# Load the cleaned event log once, then pair it into episodes under both
# double-open strategies and persist each episode table.
df_clean = load_and_clean(DATA_PATH)

implicit_eps, implicit_meta = build_episodes(df_clean, mode="implicit")
skip_eps, skip_meta = build_episodes(df_clean, mode="skip")
implicit_eps.to_csv(os.path.join(OUT_DIR, "episodes_implicit.csv"), index=False)
skip_eps.to_csv(os.path.join(OUT_DIR, "episodes_skip.csv"), index=False)

# Descriptive statistics and winsorized means per open_type.
imp_stats, imp_w = stats_by_type(implicit_eps), winsor_stats(implicit_eps)
skp_stats, skp_w = stats_by_type(skip_eps), winsor_stats(skip_eps)

# Raw duration arrays per open_type, used by the permutation tests.
imp_manual, imp_auto = (
    implicit_eps.loc[implicit_eps["open_type"].eq(kind), "duration_s"].to_numpy()
    for kind in ("manual", "auto")
)
skp_manual, skp_auto = (
    skip_eps.loc[skip_eps["open_type"].eq(kind), "duration_s"].to_numpy()
    for kind in ("manual", "auto")
)

# Each strategy is tested with its own fixed seed so the report is reproducible.
p_imp, diff_imp = permutation_mean_diff(imp_manual, imp_auto, perms=3000, seed=42)
p_skp, diff_skp = permutation_mean_diff(skp_manual, skp_auto, perms=3000, seed=7)

# Build the multi-page PDF report: a methodology page, one statistics page
# per strategy, then count / boxplot / ECDF / mean-vs-winsor charts for each
# strategy, and finally the permutation-test results.
pdf_path = os.path.join(OUT_DIR, "toolwindow_analysis_comparison_with_winsor.pdf")
with PdfPages(pdf_path) as pdf:
    # Both pairing strategies are reported side by side on every chart type.
    variants = [("implicit_close", implicit_eps), ("skip_double_open", skip_eps)]

    # --- Page 1: methodology and bookkeeping counters (A4 portrait) ---
    fig = plt.figure(figsize=(8.27, 11.69))
    lines = [
        "Analyze Toolwindow Usage Data — Two strategies + Winsorizing",
        "",
        "Cleaning: keep CLOSE; keep OPEN only with open_type ∈ {manual, auto}; sort by user_id,timestamp.",
        "A) implicit_close: second OPEN closes current episode at its time, then new episode starts; censored excluded.",
        "B) skip_double_open: second OPEN while open is ignored; censored excluded.",
        "Orphan CLOSE ignored; non-positive durations dropped.",
        "",
        f"A) meta: {implicit_meta}",
        f"B) meta: {skip_meta}",
    ]
    plt.axis("off")
    plt.text(0.05, 0.98, "\n".join(lines), va="top", ha="left", fontsize=10, family="monospace")
    pdf.savefig(fig, bbox_inches="tight")
    plt.close(fig)

    # --- One text page of summary statistics per strategy ---
    for title, stats_df, wins_df in [("implicit_close", imp_stats, imp_w), ("skip_double_open", skp_stats, skp_w)]:
        fig = plt.figure(figsize=(8.27, 6))
        merged = stats_df.merge(wins_df, on="open_type", how="left")
        txt = [f"{title} — per open_type"]
        for _, r in merged.iterrows():
            txt.append(
                f"{r['open_type']}: n={int(r['n'])}, median={r['median_s']:.2f}s, "
                f"mean={r['mean_s']:.2f}s, p75={r['p75_s']:.2f}s, p95={r['p95_s']:.2f}s, "
                f"p99={r['p99_s']:.2f}s, max={r['max_s']:.2f}s, "
                f"winsor_mean(p1–p99)={r['winsor_mean_p1_p99_s']:.2f}s"
            )
        plt.axis("off")
        plt.text(0.02, 0.98, "\n".join(txt), va="top", ha="left", fontsize=10, family="monospace")
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

    # --- Episode counts per open_type ---
    for vname, eps in variants:
        counts = eps["open_type"].value_counts().reindex(["manual","auto"]).fillna(0).astype(int)
        plt.figure()
        plt.bar(counts.index.astype(str), counts.values)
        plt.title(f"Episodes count by open_type — {vname}")
        plt.ylabel("count")
        pdf.savefig(bbox_inches="tight")
        plt.close()

    # --- Duration boxplots (groups without data are left out) ---
    for vname, eps in variants:
        manual = eps.loc[eps["open_type"]=="manual","duration_s"].to_numpy()
        auto   = eps.loc[eps["open_type"]=="auto","duration_s"].to_numpy()
        labels, data = [], []
        for kind, durations in (("manual", manual), ("auto", auto)):
            if len(durations) > 0:
                labels.append(kind)
                data.append(durations)
        if data:
            plt.figure()
            # NOTE(review): Matplotlib 3.9 renamed boxplot's ``labels`` to
            # ``tick_labels``; ``labels`` is kept here so older versions work.
            plt.boxplot(data, labels=labels, showmeans=True)
            plt.title(f"Duration (s) — boxplot — {vname}")
            plt.ylabel("seconds")
            pdf.savefig(bbox_inches="tight")
            plt.close()

    # --- ECDF of durations per open_type ---
    for vname, eps in variants:
        manual = eps.loc[eps["open_type"]=="manual","duration_s"].to_numpy()
        auto   = eps.loc[eps["open_type"]=="auto","duration_s"].to_numpy()
        series = [(kind, durations) for kind, durations in (("manual", manual), ("auto", auto)) if len(durations) > 0]
        if not series:
            # Nothing to draw: skip before opening a figure, otherwise the
            # figure would never be saved or closed.
            continue
        plt.figure()
        for kind, durations in series:
            xs, ys = ecdf(durations)
            plt.step(xs, ys, where="post", label=kind)
        plt.title(f"ECDF — {vname}")
        plt.xlabel("duration (s)")
        plt.ylabel("ECDF")
        plt.legend()
        pdf.savefig(bbox_inches="tight")
        plt.close()

    # --- Plain mean next to the winsorized mean ---
    for vname, eps, wins in [("implicit_close", implicit_eps, imp_w), ("skip_double_open", skip_eps, skp_w)]:
        means = eps.groupby("open_type")["duration_s"].mean().reindex(["manual","auto"])
        wmeans = wins.set_index("open_type")["winsor_mean_p1_p99_s"].reindex(["manual","auto"])
        X = np.arange(2)
        width = 0.35
        plt.figure()
        plt.bar(X - width/2, means.values, width, label="mean")
        plt.bar(X + width/2, wmeans.values, width, label="winsor_mean p1–p99")
        plt.xticks(X, ["manual","auto"])
        plt.ylabel("seconds")
        plt.title(f"Mean vs winsor-mean — {vname}")
        plt.legend()
        pdf.savefig(bbox_inches="tight")
        plt.close()

    # --- Last page: permutation-test results ---
    fig = plt.figure(figsize=(8.27, 4))
    lines = [
        "Permutation test for difference in means (manual - auto)",
        f"implicit_close: diff={diff_imp:.2f}s, p={p_imp:.4f}",
        f"skip_double_open: diff={diff_skp:.2f}s, p={p_skp:.4f}",
    ]
    plt.axis("off")
    plt.text(0.05, 0.92, "\n".join(lines), va="top", ha="left", fontsize=11, family="monospace")
    pdf.savefig(fig, bbox_inches="tight")
    plt.close(fig)

# Report every artifact written by this cell, in the order it was produced.
for saved_path in (
    os.path.join(OUT_DIR, "episodes_implicit.csv"),
    os.path.join(OUT_DIR, "episodes_skip.csv"),
    pdf_path,
):
    print("Saved:", saved_path)
print("Done")


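# --- Captured cell output ---
# Apparently the source line echoed by a warning raised at this statement;
# the warning message itself is not present in this export:
#   ep["censored"] = ep["censored"].fillna(False)
#
# stdout:
# Saved: ./episodes_implicit.csv
# Saved: ./episodes_skip.csv
# Saved: ./toolwindow_analysis_comparison_with_winsor.pdf
# Done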
