<h1 style="
  background: linear-gradient(90deg, #000000 0%, #7e22ce 50%, #00ff88 100%);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
  font-size: 60px;
  font-weight: 900;
  text-align: center;
  margin: 0;
">
  TOOL WINDOW ANALYSIS
</h1>


<div style="padding:18px 24px; border-radius:1px; text-align:center;">
  <span style="color:inherit; font-size:25px; font-weight:100;">
    Setting up
  </span>
</div>

In [631]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.stats import mannwhitneyu

# Shared palette: one colour per open type ("manual", "auto") plus the
# background and text colours applied to the charts in this notebook.
JB_COLORS = {
    "manual": "#7e22ce",
    "auto": "#00ff88",
    "background": "#0a0a0a",
    "text": "#ffffff"
}

# Continuous colour scale given as [position, colour] stops from 0.0 to 1.0.
JB_GRADIENT = [
    [0.0, "#000000"],
    [0.3, "#7e22ce"],
    [0.7, "#00ff88"],
    [1.0, "#baffc9"]
]

In [632]:
def update_layout(figure):
    """Apply the notebook's dark theme to ``figure`` in place.

    Sets the bar gap, the ``plotly_dark`` template, the paper/plot background
    and font colour taken from ``JB_COLORS``, and a 20pt Poppins title font.
    Nothing is returned; the figure object itself is modified.
    """
    background = JB_COLORS["background"]
    foreground = JB_COLORS["text"]
    theme = dict(
        bargap=0.1,
        template="plotly_dark",
        paper_bgcolor=background,
        plot_bgcolor=background,
        font_color=foreground,
        title_font=dict(size=20, color=foreground, family="Poppins"),
    )
    figure.update_layout(**theme)

In [633]:
df = pd.read_csv('toolwindow_data.csv')

In [634]:
df.head(10)

Unnamed: 0,timestamp,event,open_type,user_id
0,1752250204033,opened,manual,1
1,1751836141616,closed,,2
2,1752304475081,closed,,3
3,1752498934494,opened,auto,4
4,1752141991110,closed,,5
5,1752308210458,opened,auto,3
6,1752310292657,opened,auto,3
7,1752276666919,closed,,6
8,1752158089077,opened,auto,5
9,1752174540366,opened,auto,7


In [635]:
# Parse the epoch-millisecond timestamps into timezone-aware (UTC) datetimes.
df["ts"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
# Order events chronologically within each user and rebuild the index to match.
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [636]:
# Inspect how a missing open_type is represented before cleaning.
val = df.loc[1, "open_type"]
print("Value:", val)
print("Python type:", type(val))
print(df["open_type"].unique())

# Normalise open_type: trim whitespace and lower-case every label.
# astype(str) turns missing values into the literal string "nan", so the
# textual placeholders listed below are mapped back to a real NaN afterwards.
df["open_type"] = (
    df["open_type"].astype(str).str.strip().str.lower()
    .replace(["nan", "na", "none", ""], np.nan)
)

Value: nan
Python type: <class 'float'>
['auto' nan 'manual']


In [637]:
# Quick overview of the event log: user count, covered period and categories.
print("\nUsers:", df["user_id"].nunique())
print("Time span:", df["ts"].min(), "→", df["ts"].max())
print("Event types  :", df['event'].unique())
print("Open types   :", df['open_type'].dropna().unique())
print("\nEvent count:")
print(df['event'].value_counts(dropna=False))
# Count how many rows of each event kind have no open_type recorded.
opened_with_nan = df[(df["event"] == "opened") & (df["open_type"].isna())]
print("Number of (opened) events with NaN in open_type:", len(opened_with_nan))
closed_with_nan = df[(df["event"] == "closed") & (df["open_type"].isna())]
print("Number of (closed) events with NaN in open_type:", len(closed_with_nan))


Users: 205
Time span: 2025-07-03 21:54:15.846000+00:00 → 2025-07-23 23:01:03.785000+00:00
Event types  : ['opened' 'closed']
Open types   : ['auto' 'manual']

Event count:
event
opened    1865
closed    1638
Name: count, dtype: int64
Number of (opened) events with NaN in open_type: 0
Number of (closed) events with NaN in open_type: 1638


<div style="padding:18px 24px; border-radius:1px; text-align:center;">
  <span style="color:inherit; font-size:20px; font-weight:100;">
    Match open/close pairs to reconstruct complete episodes of toolwindow usage
    <br>
                    &amp;
    <br>
    Calculate how long the toolwindow was open during each episode
  </span>
</div>

In [638]:
episodes = []

In [639]:
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [640]:
def build_episodes(simple_df: pd.DataFrame) -> pd.DataFrame:
    """Pair ``opened``/``closed`` events into per-user tool window episodes.

    Each user's events are processed in ascending ``timestamp`` order. The
    rows of every user are stably sorted here, so the result does not depend
    on the caller having pre-sorted the frame. Pairing rules:

    * ``opened`` starts an episode; if one is already open, it is ended at
      the timestamp of the new ``opened`` event first.
    * ``closed`` ends the currently open episode; a ``closed`` with no open
      episode is ignored.
    * An episode that is still open after the user's last event is dropped.

    Parameters
    ----------
    simple_df : pd.DataFrame
        Event log with columns ``user_id``, ``event``, ``timestamp`` (numeric,
        milliseconds) and ``open_type``.

    Returns
    -------
    pd.DataFrame
        One row per episode with columns ``user_id``, ``open_ts``,
        ``close_ts``, ``duration_sec`` and ``open_type``. ``open_type`` is
        ``"manual"`` or ``"auto"``; any other value is stored as NaN.
    """
    eps = []
    for uid, g in simple_df.groupby("user_id", sort=False):
        # A stable sort keeps the original order of events that share a
        # timestamp, so already-sorted input is processed exactly as before.
        g = g.sort_values("timestamp", kind="stable")
        t_open = None
        otype = None
        for ev, ts, ot in zip(g["event"], g["timestamp"], g["open_type"]):
            if ev == "opened":
                if t_open is not None:
                    # Re-open without a close: end the previous episode here.
                    eps.append((uid, t_open, ts, (ts - t_open) / 1000.0, otype))
                t_open = ts
                otype = ot if ot in ("manual", "auto") else np.nan

            elif ev == "closed":
                if t_open is not None:
                    eps.append((uid, t_open, ts, (ts - t_open) / 1000.0, otype))
                    t_open = None
                    otype = None
    return pd.DataFrame(eps, columns=["user_id", "open_ts", "close_ts", "duration_sec", "open_type"])

epi = build_episodes(df)

In [641]:
epi = epi[epi["duration_sec"] > 0].copy()
epi

Unnamed: 0,user_id,open_ts,close_ts,duration_sec,open_type
0,1,1751826102123,1751826746077,643.954,auto
1,1,1751841258635,1751849544609,8285.974,manual
2,1,1751985487772,1751986623885,1136.113,manual
3,1,1752250204033,1752250204781,0.748,manual
4,1,1752250205587,1752257432775,7227.188,manual
...,...,...,...,...,...
1826,198,1751794789165,1751794914913,125.748,auto
1827,199,1752207158481,1752207181344,22.863,manual
1828,200,1752502731550,1752502765595,34.045,manual
1829,201,1751853675253,1751853683561,8.308,manual


In [642]:
print("Number of episodes is:", len(epi))

Number of episodes is: 1831


<div style="padding:18px 24px; border-radius:1px; text-align:center;">
  <span style="color:inherit; font-size:20px; font-weight:100;">
    Compare the duration patterns between manual and automatic opens
  </span>
</div>

In [643]:
# Empirical CDF of episode duration, one line per open type.
fig = go.Figure()

for label, g in epi.groupby("open_type"):
    x = np.sort(g["duration_sec"].values)
    # ECDF height at the i-th smallest duration is i / n.
    y = np.arange(1, len(x) + 1) / len(x)
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode="lines",
        name=label,
        line=dict(color=JB_COLORS.get(label, "#cccccc"), width=3)
    ))
update_layout(fig)
fig.update_layout(
    title="ECDF of episode duration",
    xaxis_title="Duration (s, log)",
    # The y value is the fraction of episodes at or below the duration on x.
    yaxis_title="Cumulative share of episodes"
)
# Durations span several orders of magnitude (sub-second to days), so a
# linear axis squeezes almost every episode against the left edge.
fig.update_xaxes(type="log")
fig.show()

In [644]:
# Box plot of episode duration for each open type.
fig = px.box(
    epi,
    x="open_type",
    y="duration_sec",
    color="open_type",
    color_discrete_map={
        "manual": JB_COLORS["manual"],
        "auto": JB_COLORS["auto"]
    },
    title="Episodes duration per type"
)
# Use the shared theme helper so this chart gets the same template,
# background, font colour and title font as the other figures.
update_layout(fig)
# Log scale: durations range from fractions of a second to several days.
fig.update_yaxes(type="log", title="Time (s, log)")
fig.update_xaxes(title="Open Type")
fig.show()


In [645]:
# Derive calendar fields from the open timestamp (epoch milliseconds).
epi["open_dt"] = pd.to_datetime(epi["open_ts"], unit="ms")
epi["date"] = epi["open_dt"].dt.date
epi["hour"] = epi["open_dt"].dt.hour
# Number of episodes opened in each (date, hour) cell.
heat = epi.groupby(["date", "hour"]).size().reset_index(name="count")

fig = px.density_heatmap(
    heat,
    x="hour",
    y="date",
    z="count",
    color_continuous_scale=JB_GRADIENT,
    title="Tool Window Activity (Hour vs Day)",
    nbinsx=34
)
update_layout(fig)
fig.update_layout(
    xaxis_title="Hour",
    yaxis_title="Date",
    coloraxis_colorbar=dict(
        # The colorbar title font is set through title.font; the older
        # `titlefont` shortcut is deprecated.
        title=dict(text="Num of openings", font=dict(color="#ffffff")),
        tickfont=dict(color="#ffffff")
    )
)

fig.update_xaxes(
    dtick=1,
    tickmode='linear',
    showgrid=False,
    color="#ffffff"
)
# Reverse the y axis so the earliest date is at the top.
fig.update_yaxes(
    autorange="reversed",
    showgrid=False,
    color="#ffffff"
)
fig.show()


In [646]:
print(epi.groupby("open_type")["duration_sec"].describe())

            count          mean           std    min      25%       50%  \
open_type                                                                 
auto       1180.0  17464.653237  61726.337743  0.154  47.1090  285.4165   
manual      651.0   4640.452693  32058.624031  0.015   2.3935   13.9980   

                 75%         max  
open_type                         
auto       2382.6035  728695.209  
manual      161.9940  447314.548  


In [647]:
# Mean, standard deviation and maximum duration per open type, reshaped to
# long form (one row per open_type/statistic pair) for a grouped bar chart.
desc = epi.groupby("open_type")["duration_sec"].describe().reset_index()
desc = desc[["open_type", "mean", "std", "max"]]
desc_melted = desc.melt(id_vars="open_type", var_name="stat", value_name="value")

fig = px.bar(
    desc_melted,
    x="stat",
    y="value",
    color="open_type",
    barmode="group",
    color_discrete_map={"manual": JB_COLORS["manual"], "auto": JB_COLORS["auto"]},
    title="Summary Statistics per Open Type"
)
# Apply the shared theme first: update_layout() sets its own title_font, so
# the chart-specific font below has to come afterwards to take effect.
update_layout(fig)
fig.update_layout(
    xaxis_title="Statistic",
    yaxis_title="Duration (seconds)",
    title_font=dict(size=20, color=JB_COLORS["text"], family="Arial Black")
)
fig.show()

<div style="padding:18px 24px; border-radius:1px; text-align:center;">
  <span style="color:inherit; font-size:20px; font-weight:100;">
    Determine if any differences you find are statistically significant
    <br>
    <br>
    Mann–Whitney U test
    <br>
    H0 (Null Hypothesis) - Episode durations of manual and auto opens come from the same distribution.
    <br>
    H1 (Alternative Hypothesis) - Episode durations of manual and auto opens come from different distributions.
    <br>
    Let's set the significance level at 0.05
  </span>
</div>

In [648]:
# Histogram of episode durations for manually opened tool windows.
# range_x limits the visible x range to the first 10,000 seconds.
fig = px.histogram(
    epi.loc[epi["open_type"] == "manual"],
    x="duration_sec",
    color_discrete_sequence=[JB_COLORS["manual"]],
    range_x=(0, 10000),
    title="Manual opens: episode duration distribution",
)
update_layout(fig)
fig.update_layout(
    # x carries the duration; the bar height is the number of episodes.
    xaxis_title="Duration (seconds)",
    yaxis_title="Number of episodes",
    title_font=dict(size=20, color=JB_COLORS["text"], family="Arial Black")
)
fig.show()

In [649]:
# Histogram of episode durations for automatically opened tool windows.
# range_x limits the visible x range to the first 10,000 seconds.
fig = px.histogram(
    epi.loc[epi["open_type"] == "auto"],
    x="duration_sec",
    color_discrete_sequence=[JB_COLORS["auto"]],
    range_x=(0, 10000),
    title="Auto opens: episode duration distribution",
)
update_layout(fig)
fig.update_layout(
    # x carries the duration; the bar height is the number of episodes.
    xaxis_title="Duration (seconds)",
    yaxis_title="Number of episodes",
    title_font=dict(size=20, color=JB_COLORS["text"], family="Arial Black")
)
fig.show()

In [650]:
# Two independent samples: episode durations for each open type.
manual = epi.loc[epi["open_type"] == "manual", "duration_sec"].dropna()
auto   = epi.loc[epi["open_type"] == "auto", "duration_sec"].dropna()
# State the alternative explicitly so the test matches the two-sided H1 above
# regardless of the installed SciPy version's default.
U, p_value = mannwhitneyu(auto, manual, alternative="two-sided")
# Scientific notation keeps very small p-values readable instead of
# printing a row of zeros.
print('Statistics=%.2f, p=%.3e' % (U, p_value))

alpha = 0.05
if p_value < alpha:
    print('Reject Null Hypothesis (Significant difference between two samples)')
else:
    print('Do not Reject Null Hypothesis (No significant difference between two samples)')

Statistics=579923.50, p=0.000000000000000
Reject Null Hypothesis (Significant difference between two samples)


In [651]:
epi["day_of_week"] = epi["open_dt"].dt.day_name()

In [652]:
epi

Unnamed: 0,user_id,open_ts,close_ts,duration_sec,open_type,open_dt,date,hour,day_of_week
0,1,1751826102123,1751826746077,643.954,auto,2025-07-06 18:21:42.123,2025-07-06,18,Sunday
1,1,1751841258635,1751849544609,8285.974,manual,2025-07-06 22:34:18.635,2025-07-06,22,Sunday
2,1,1751985487772,1751986623885,1136.113,manual,2025-07-08 14:38:07.772,2025-07-08,14,Tuesday
3,1,1752250204033,1752250204781,0.748,manual,2025-07-11 16:10:04.033,2025-07-11,16,Friday
4,1,1752250205587,1752257432775,7227.188,manual,2025-07-11 16:10:05.587,2025-07-11,16,Friday
...,...,...,...,...,...,...,...,...,...
1826,198,1751794789165,1751794914913,125.748,auto,2025-07-06 09:39:49.165,2025-07-06,9,Sunday
1827,199,1752207158481,1752207181344,22.863,manual,2025-07-11 04:12:38.481,2025-07-11,4,Friday
1828,200,1752502731550,1752502765595,34.045,manual,2025-07-14 14:18:51.550,2025-07-14,14,Monday
1829,201,1751853675253,1751853683561,8.308,manual,2025-07-07 02:01:15.253,2025-07-07,2,Monday


In [653]:
# Number of episodes opened in each hour of the day, split by open type.
fig = px.histogram(
    epi,
    x="hour",
    color="open_type",
    nbins=24,
    color_discrete_map={"manual": JB_COLORS["manual"], "auto": JB_COLORS["auto"]},
    title="Openings throughout the day (manual vs auto)"
)
update_layout(fig)
fig.update_layout(
    # The x axis is the hour extracted from the open timestamp.
    xaxis_title="Hour of day",
    yaxis_title="Count",
)
fig.show()

In [654]:
# Share of auto-opened episodes for every (weekday, hour) combination.
tmp = (
    epi.assign(is_auto=epi["open_type"].eq("auto"))
    .groupby(["day_of_week", "hour"])
    .agg(auto_share=("is_auto", "mean"))
    .reset_index()
)

fig = px.density_heatmap(
    tmp,
    x="hour",
    y="day_of_week",
    z="auto_share",
    # With z given the default aggregation is a sum, which would add shares
    # together whenever several hours land in one bin; average instead.
    histfunc="avg",
    color_continuous_scale=["#000000","#7e22ce","#00ff88","#baffc9"],
    title="Share of auto openings",
    labels={"auto_share": "auto share"}
)
update_layout(fig)
# One bin per hour (0-23) so each cell shows exactly one row of `tmp`.
fig.update_traces(xbins=dict(start=-0.5, end=23.5, size=1))
fig.update_yaxes(categoryorder="array", categoryarray=[
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
])
fig.show()


In [655]:
# Per-user profile: mean/max episode duration, episode counts in total and
# per open type, and the fraction of episodes that were opened manually.
user_stats = (
    epi.assign(
        is_manual=epi["open_type"].eq("manual"),
        is_auto=epi["open_type"].eq("auto"),
    )
    .groupby("user_id")
    .agg(
        avg_duration=("duration_sec", "mean"),
        total_sessions=("duration_sec", "count"),
        manual_sessions=("is_manual", "sum"),
        auto_sessions=("is_auto", "sum"),
        max_duration=("duration_sec", "max"),
    )
    .assign(
        manual_share=lambda stats: stats["manual_sessions"].div(stats["total_sessions"])
    )
    # Any NaN produced above (e.g. 0 / 0) is replaced by 0.
    .fillna(0)
)


In [656]:
# Cluster users on four standardised behaviour features.
X = user_stats[["avg_duration", "manual_share", "total_sessions", "max_duration"]]
X_scaled = StandardScaler().fit_transform(X)

# n_init is pinned because its default differs between scikit-learn
# releases; leaving it implicit makes cluster labels version-dependent.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
user_stats["cluster"] = kmeans.fit_predict(X_scaled)

In [657]:
# Candidate feature names; only those that exist in user_stats are used.
preferred = [
    "avg_duration", "median_duration", "max_duration",
    "total_sessions", "manual_share", "auto_share",
    "opens_per_day", "opens_per_hour"
]
use_cols = [name for name in preferred if name in user_stats.columns]
if len(use_cols) == 0:
    # Fallback: every numeric column except identifiers and derived outputs.
    use_cols = [
        name
        for name in user_stats.select_dtypes(include="number").columns
        if name not in ["user_id", "cluster", "pc1", "pc2"]
    ]

# Replace infinities and missing values with 0.0 before scaling.
X = user_stats[use_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0).copy()

# Standardise, then project onto the first two principal components.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2, random_state=42)
PC = pca.fit_transform(X_scaled)
user_stats["pc1"], user_stats["pc2"] = PC[:, 0], PC[:, 1]

In [658]:
# Draw the PCA density contours twice: first with one facet per cluster,
# then with all clusters overlaid on a single panel.
for _facet in ("cluster", None):
    fig = px.density_contour(
        user_stats,
        x="pc1",
        y="pc2",
        color="cluster",
        facet_col=_facet,
        title="Cluster density in PCA space",
    )
    update_layout(fig)

    fig.update_traces(contours_coloring="lines", showlegend=True)
    fig.show()

In [659]:
# Mean of the duration features per cluster, in long form for plotting.
cluster_profiles = (user_stats
    .groupby("cluster")[["avg_duration","max_duration"]]
    .mean().reset_index().melt("cluster", var_name="feature", value_name="value"))

# A numeric colour column is treated as a continuous scale and yields one
# trace, so barmode="group" cannot place the clusters side by side. Casting
# the label to str (on a copy used only for plotting) makes it categorical.
fig = px.bar(
    cluster_profiles.assign(cluster=cluster_profiles["cluster"].astype(str)),
    x="feature",
    y="value",
    color="cluster",
    barmode="group",
    title="Cluster profiles (feature means)",
    color_discrete_sequence=[JB_COLORS["manual"], JB_COLORS["auto"], "#baffc9"],
)
update_layout(fig)
fig.show()

In [660]:
# Distribution of each user's mean episode duration, split by cluster.
fig = px.box(
    user_stats,
    x="cluster",
    y="avg_duration",
    points="outliers",
    title="Avg session duration by cluster",
    color_discrete_sequence=["#00ff88"],
)
update_layout(fig)
fig.show()

# Distribution of each user's manual-open share, split by cluster.
fig = px.violin(
    user_stats,
    x="cluster",
    y="manual_share",
    box=True,
    points=False,
    title="Manual share by cluster",
    color_discrete_sequence=["#00ff88"],
)
update_layout(fig)
fig.show()


In [661]:
# Flag users whose longest episode exceeds the 95th percentile of max_duration.
user_stats["is_outlier"] = user_stats["max_duration"].gt(
    user_stats["max_duration"].quantile(0.95)
)

In [662]:
# Longest episode per user, outlier users versus everyone else.
fig = px.box(
    user_stats,
    x="is_outlier",
    y="max_duration",
    points="outliers",
    title="Max duration distribution: Outliers vs Population",
    color_discrete_sequence=["#7e22ce"],
)
update_layout(fig)
fig.show()

# Mean episode duration per user, outlier users versus everyone else.
fig = px.violin(
    user_stats,
    x="is_outlier",
    y="avg_duration",
    box=True,
    points=False,
    title="Avg session duration: Outliers vs Population",
    color_discrete_sequence=["#7e22ce"],
)
update_layout(fig)
fig.show()

In [663]:
# Power users: session count above the 90th percentile and more than half of
# their episodes opened manually.
power_users = user_stats.loc[
    user_stats["total_sessions"].gt(user_stats["total_sessions"].quantile(0.9))
    & user_stats["manual_share"].gt(0.5)
]
power_users

Unnamed: 0_level_0,avg_duration,total_sessions,manual_sessions,auto_sessions,max_duration,manual_share,cluster,pc1,pc2,is_outlier
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
14,2626.016077,26,14,12,60193.996,0.538462,1,-0.003838,1.093296,False
37,1450.0391,30,18,12,22128.494,0.6,1,-0.21164,1.331628,False


In [664]:
# Add a boolean is_power column on a fresh copy of user_stats: True for users
# above the 90th percentile of session count with a manual share above 0.5.
user_stats = user_stats.assign(
    is_power=user_stats["total_sessions"].gt(user_stats["total_sessions"].quantile(0.9))
    & user_stats["manual_share"].gt(0.5)
)

In [665]:
# Session counts of power users compared with all other users.
fig = px.box(
    user_stats,
    x="is_power",
    y="total_sessions",
    points="outliers",
    title="Total sessions: power vs others",
    color_discrete_sequence=["#7e22ce"],
)
update_layout(fig)
fig.show()

# Manual-open share of power users compared with all other users.
fig = px.violin(
    user_stats,
    x="is_power",
    y="manual_share",
    box=True,
    points=False,
    title="Manual share: power vs others",
    color_discrete_sequence=["#7e22ce"],
)
update_layout(fig)
# Show the share axis as whole percentages.
fig.update_yaxes(tickformat=".0%")
fig.show()