In [1]:
import pandas as pd, numpy as np
from pathlib import Path

BASE = Path(r"C:\Users\maria\Documents\VANGUARD DATA")  # change if needed

demo = pd.read_csv(BASE / "df_final_demo_clean.csv")
exp  = pd.read_csv(BASE / "df_final_experiment_clients_clean.csv")
web  = pd.read_csv(BASE / "df_final_web_data_clean.csv")

for d in (demo, exp, web):
    d["client_id"] = d["client_id"].astype(str).str.strip()
web["process_step"] = web["process_step"].astype(str).str.lower().str.strip()
web["date_time"]    = pd.to_datetime(web["date_time"], errors="coerce")

# Map experiment group → 'Control' / 'Test'
if "variation" in exp.columns:
    v = exp["variation"].astype(str).str.lower().str.strip()
    mapping = {
        "control":"Control","variant a":"Control","a":"Control","0":"Control",
        "test":"Test","treatment":"Test","variant b":"Test","b":"Test","1":"Test"
    }
    exp["group"] = v.map(mapping).fillna("Unknown")
else:
    exp["group"] = "Unknown"

# Keep only experiment clients and attach group to events
w = web[web["client_id"].isin(exp["client_id"])].merge(
        exp[["client_id","group"]], on="client_id", how="left"
    ).sort_values(["client_id","visit_id","date_time"])

# If a visit has no 'start', mark its first event as 'start' (funnel consistency)
first_in_visit = w.groupby("visit_id").cumcount().eq(0)
has_start = w.groupby("visit_id")["process_step"].transform(lambda s: (s=="start").any())
w.loc[(~has_start) & first_in_visit, "process_step"] = "start"

# Step order for funnel and errors
order = ["start","step_1","step_2","step_3","confirm"]
w["step_order"] = pd.Categorical(w["process_step"], categories=order, ordered=True).codes

**KPI 1 - COMPLETION RATE (CLIENT AND VISIT)**

In [2]:
# Client-level completion per group
per_client = (w.groupby(["group","client_id"])
                .agg(completed=("process_step", lambda s: (s=="confirm").any()))
                .reset_index())
client_completion = per_client.groupby("group")["completed"].mean().sort_index()
print("Client completion rate:\n", (client_completion*100).round(2), "%")

# Visit-level completion per group
per_visit = (w.groupby(["group","visit_id"])
               .agg(completed=("process_step", lambda s: (s=="confirm").any()))
               .reset_index())
visit_completion = per_visit.groupby("group")["completed"].mean().sort_index()
print("\nVisit completion rate:\n", (visit_completion*100).round(2), "%")

Client completion rate:
 group
Control    65.16
Test       68.60
Unknown    67.41
Name: completed, dtype: float64 %

Visit completion rate:
 group
Control    48.03
Test       51.64
Unknown    49.92
Name: completed, dtype: float64 %


**KPI 2 - TIME SPENT ON EACH STEP**

In [3]:
# First timestamp per step within each visit & group
first_times = (w[w["process_step"].isin(order)]
               .groupby(["group","visit_id","process_step"])["date_time"]
               .min().unstack())

# Build durations (min) and aggregate by group (median & mean)
dur = pd.DataFrame({
    "start→1":      (first_times["step_1"] - first_times["start"]).dt.total_seconds()/60,
    "1→2":          (first_times["step_2"] - first_times["step_1"]).dt.total_seconds()/60,
    "2→3":          (first_times["step_3"] - first_times["step_2"]).dt.total_seconds()/60,
    "3→confirm":    (first_times["confirm"] - first_times["step_3"]).dt.total_seconds()/60,
}).reset_index()

step_time_median = dur.groupby("group").median(numeric_only=True).round(2)
step_time_mean   = dur.groupby("group").mean(numeric_only=True).round(2)
print("Median minutes per transition:\n", step_time_median)

Median minutes per transition:
          start→1   1→2   2→3  3→confirm
group                                  
Control     0.37  0.37  1.32       1.49
Test        0.20  0.52  1.22       0.95
Unknown     0.25  0.45  1.25       1.18


**KPI 3: ERROR RATE**

In [4]:
# Transitions within visit
w_sorted = w.sort_values(["group","visit_id","date_time"])
w_sorted["prev_order"] = w_sorted.groupby("visit_id")["step_order"].shift()
w_sorted["prev_step"]  = w_sorted.groupby("visit_id")["process_step"].shift()

# Any backward move?
trans = w_sorted.dropna(subset=["prev_order"]).copy()
trans["is_backward"] = trans["step_order"] < trans["prev_order"]

# Overall visit-level error rate
visit_errors = (trans.groupby(["group","visit_id"])["is_backward"].any().reset_index())
error_rate_overall = visit_errors.groupby("group")["is_backward"].mean().sort_index()
print("Visit-level error rate (any backward):\n", (error_rate_overall*100).round(2), "%")

# Error rate by 'from' step
backward_by_from = (trans[trans["is_backward"]]
                    .groupby(["group","prev_step"])["visit_id"].nunique()
                    .rename("backward_visits"))
reached_prev = (w.groupby(["group","process_step"])["visit_id"].nunique()
                .rename("visits_reaching_step"))
err_by_step = (backward_by_from / reached_prev).dropna().unstack(0).round(3)  # step rows × group cols
print("\nError rate by step (share of visits that regressed after reaching that step):\n", err_by_step)

Visit-level error rate (any backward):
 group
Control    24.68
Test       32.04
Unknown    28.98
Name: is_backward, dtype: float64 %

Error rate by step (share of visits that regressed after reaching that step):
 group                   Control   Test  Unknown
prev_step process_step                         
confirm   confirm         0.047  0.024    0.044
          start           0.023  0.012    0.022
          step_1          0.031  0.016    0.029
          step_2          0.036  0.018    0.034
          step_3          0.040  0.021    0.038
step_1    confirm         0.141  0.281    0.219
          start           0.068  0.145    0.109
          step_1          0.093  0.192    0.148
          step_2          0.109  0.221    0.171
          step_3          0.120  0.246    0.189
step_2    confirm         0.115  0.194    0.154
          start           0.055  0.100    0.077
          step_1          0.076  0.132    0.104
          step_2          0.089  0.152    0.120
          step_3   

**KPI 4 - DROPS BY STEP**

In [5]:
step_clients = (w.groupby(["group","process_step"])["client_id"]
                  .nunique().unstack().reindex(columns=order).fillna(0))
rate_vs_start = (step_clients.div(step_clients["start"], axis=0)).round(3)
print("Client reach rate by step vs start:\n", rate_vs_start)

Client reach rate by step vs start:
 process_step  start  step_1  step_2  step_3  confirm
group                                               
Control         1.0   0.851   0.790   0.735    0.652
Test            1.0   0.895   0.822   0.767    0.686
Unknown         1.0   0.876   0.808   0.755    0.674
