In [1]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from pathlib import Path
import sys
# Point to the project root (adjust parents[1] to parents[2] if your notebook is deeper)
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

from lib.functions import (kpis_from_processes, 
                           step_dropoff_table,
                           two_proportion_ztest,
                           welch_t_one_sided,
                           stratified_completion_tests,
                           decision_line,
                           show_statistical_test)

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

df_proc_control_no_out = pd.read_csv(config['output_data']['file12'])   # 
df_proc_test_no_out = pd.read_csv(config['output_data']['file14'])      # 
df_demo_control = pd.read_csv(config['output_data']['file4'])           # 
df_demo_test = pd.read_csv(config['output_data']['file3'])              # 

In [2]:
df_proc_control_no_out['outcome'].unique()

array(['fail', 'successful'], dtype=object)

In [3]:
df_proc_test_no_out['outcome'].unique()

array(['successful', 'fail', 'completed_with_errors'], dtype=object)

In [4]:
df_demo_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23526 entries, 0 to 23525
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   client_id          23526 non-null  int64  
 1   clnt_tenure_yr     23526 non-null  float64
 2   clnt_tenure_mnth   23526 non-null  float64
 3   clnt_age           23526 non-null  float64
 4   num_accts          23526 non-null  float64
 5   bal                23526 non-null  float64
 6   calls_6_mnth       23526 non-null  float64
 7   logons_6_mnth      23526 non-null  float64
 8   Variation          23526 non-null  object 
 9   clnt_age_quantile  23526 non-null  object 
 10  tenure_quantile    23526 non-null  object 
dtypes: float64(7), int64(1), object(3)
memory usage: 2.0+ MB


In [5]:
df_demo_control.head(5)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,clnt_age_quantile,tenure_quantile
0,2304905,7.0,94.0,58.0,2.0,110860.3,6.0,9.0,Control,A3,T2
1,5126305,12.0,145.0,33.0,2.0,103671.75,0.0,3.0,Control,A2,T3
2,3727881,5.0,71.0,30.5,2.0,23915.6,0.0,3.0,Control,A1,T1
3,272934,5.0,66.0,58.5,2.0,27021.42,2.0,5.0,Control,A3,T1
4,285619,30.0,369.0,67.5,2.0,299388.72,3.0,6.0,Control,A4,T4


In [6]:
# Keep only the columns we need from df_demo_control and df_demo_test
demo_cols = [
    'client_id','clnt_tenure_yr','clnt_tenure_mnth','clnt_age','num_accts','bal','clnt_age_quantile','tenure_quantile'
]

proc_control_aug = (df_proc_control_no_out
    .merge(df_demo_control[demo_cols], on='client_id', how='left'))

proc_test_aug = (df_proc_test_no_out
    .merge(df_demo_test[demo_cols], on='client_id', how='left'))

# convenience aliases we’ll use below
control = proc_control_aug
test = proc_test_aug

In [7]:
control.columns

Index(['client_id', 'visitor_id', 'visit_id', 'reached_start',
       'reached_step_1', 'reached_step_2', 'reached_step_3', 'reached_confirm',
       'completed', 't_start_step1', 't_step1_step2', 't_step2_step3',
       't_step3_conf', 't_total', 'n_back_jumps', 'outcome', 'clnt_tenure_yr',
       'clnt_tenure_mnth', 'clnt_age', 'num_accts', 'bal', 'clnt_age_quantile',
       'tenure_quantile'],
      dtype='object')

In [8]:
kpis_control = kpis_from_processes(control)                        # KPIs
dropoff_control = step_dropoff_table(control)

display(control[['client_id', 'visitor_id', 'visit_id','reached_start','reached_step_1', 'reached_step_2', 'reached_step_3', 'reached_confirm','completed']].head(5))
display(control[['client_id', 'visitor_id', 'visit_id','t_start_step1', 't_step1_step2', 't_step2_step3','t_step3_conf', 't_total', 'n_back_jumps', 'outcome']].head(5))
# display(control.head(5))
display(kpis_control[['n_processes', 'started', 'step1_rate_%', 'step2_rate_%','step3_rate_%', 'completion_rate_%', 'successful_%', 'completed_with_errors_%', 'fail%']])
display(kpis_control[['t_total_avg_min','t_step1_avg_min', 't_step2_avg_min', 't_step3_avg_min','t_conf_avg_min', 'n_back_jumps', 'avg_back_jumps']])
# display(kpis_control)
display(dropoff_control)

print("Valid processes:", len(control))

# Count distinct control clients represented in web data
print("Distinct control clients with activity:", control['client_id'].nunique())

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed
0,1186,446844663_31615102958,507052512_11309370126_442139,1,0,0,0,0,0
1,1186,446844663_31615102958,795373564_99931517312_810896,1,1,1,0,0,0
2,1195,766842522_69992551638,393817425_39015278493_996341,1,1,1,1,1,1
3,1368,366307863_19014662045,784065271_45379483290_309335,1,0,0,0,0,0
4,2439,607208067_70160939111,848231744_22569944243_37711,1,0,0,0,0,0


Unnamed: 0,client_id,visitor_id,visit_id,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,1186,446844663_31615102958,507052512_11309370126_442139,,,,,,0,fail
1,1186,446844663_31615102958,795373564_99931517312_810896,0.183333,0.183333,,,,0,fail
2,1195,766842522_69992551638,393817425_39015278493_996341,0.55,1.633333,0.516667,1.383333,4.083333,0,successful
3,1368,366307863_19014662045,784065271_45379483290_309335,,,,,,0,fail
4,2439,607208067_70160939111,848231744_22569944243_37711,,,,,,0,fail


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%
0,17327,17327,66.439661,55.329832,49.829745,42.915681,42.915681,0.0,57.084319


Unnamed: 0,t_total_avg_min,t_step1_avg_min,t_step2_avg_min,t_step3_avg_min,t_conf_avg_min,n_back_jumps,avg_back_jumps
0,4.560866,0.593359,0.58524,1.500851,1.915156,0,0.0


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,17327,11512,66.439661,33.560339
1,step_1→step_2,11512,9587,83.278318,16.721682
2,step_2→step_3,9587,8634,90.059456,9.940544
3,step_3→confirm,8634,7436,86.124624,13.875376


Valid processes: 17327
Distinct control clients with activity: 14174


In [9]:
kpis_test = kpis_from_processes(test)                        # KPIs
dropoff_test = step_dropoff_table(test)

display(test[['client_id', 'visitor_id', 'visit_id','reached_start','reached_step_1', 'reached_step_2', 'reached_step_3', 'reached_confirm','completed']].head(5))
display(test[['client_id', 'visitor_id', 'visit_id','t_start_step1', 't_step1_step2', 't_step2_step3','t_step3_conf', 't_total', 'n_back_jumps', 'outcome']].head(5))
# display(test.head(5))
display(kpis_test[['n_processes', 'started', 'step1_rate_%', 'step2_rate_%','step3_rate_%', 'completion_rate_%', 'successful_%', 'completed_with_errors_%', 'fail%']])
display(kpis_test[['t_total_avg_min', 't_step1_avg_min', 't_step2_avg_min', 't_step3_avg_min','t_conf_avg_min', 'n_back_jumps', 'avg_back_jumps']])
# display(kpis_test)
display(dropoff_test)

print("Valid processes:", len(test))

# Count distinct control clients represented in web data
print("Distinct control clients with activity:", test['client_id'].nunique())

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed
0,555,402506806_56087378777,637149525_38041617439_716659,1,1,1,1,1,1
1,647,66758770_53988066587,40369564_40101682850_311847,1,1,1,1,1,1
2,934,810392784_45004760546,7076463_57954418406_971348,1,0,0,0,0,0
3,1516,182314299_63168583136,481123290_95510977345_707323,1,1,1,0,0,0
4,1643,910842861_51062379179,379946188_1773022140_107963,1,0,0,0,0,0


Unnamed: 0,client_id,visitor_id,visit_id,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,555,402506806_56087378777,637149525_38041617439_716659,0.116667,0.533333,1.65,0.333333,2.633333,0,successful
1,647,66758770_53988066587,40369564_40101682850_311847,0.116667,0.3,3.15,2.716667,6.283333,0,successful
2,934,810392784_45004760546,7076463_57954418406_971348,,,,,,0,fail
3,1516,182314299_63168583136,481123290_95510977345_707323,4.083333,,,,,3,fail
4,1643,910842861_51062379179,379946188_1773022140_107963,,,,,,0,fail


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%
0,22213,22213,83.595192,70.791879,63.251249,53.558727,43.861703,9.697024,46.441273


Unnamed: 0,t_total_avg_min,t_step1_avg_min,t_step2_avg_min,t_step3_avg_min,t_conf_avg_min,n_back_jumps,avg_back_jumps
0,4.349371,1.1773,0.542394,1.327059,1.460345,7475,0.336515


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,22213,18569,83.595192,16.404808
1,step_1→step_2,18569,15725,84.684151,15.315849
2,step_2→step_3,15725,14050,89.348172,10.651828
3,step_3→confirm,14050,11897,84.676157,15.323843


Valid processes: 22213
Distinct control clients with activity: 19095


**1) Build the aggregates we need to test (checked with values in kpis_test and dropoff_test)**

In [10]:
# --- CONTROL group counts ---
n_c = int(control['reached_start'].sum())   # total "trials": how many control processes actually started the funnel
x_c = int(control['completed'].sum())       # total "successes": how many control processes reached 'confirm'

# --- TEST group counts ---
n_t = int(test['reached_start'].sum())   # total "trials": how many test processes started the funnel
x_t = int(test['completed'].sum())       # total "successes": how many test processes completed

# --- Proportions (completion rates) with safe division (NaN if no starters) ---
p_c = 100*(x_c / n_c) if n_c else np.nan    # control completion rate = completed / started
p_t = 100*(x_t / n_t) if n_t else np.nan    # test completion rate    = completed / started

# --- Quick summary printout ---
print(f"Control: {x_c}/{n_c} = {p_c:.4f}%")
print(f"Test   : {x_t}/{n_t} = {p_t:.4f}%")
print(f"Diff (test - control): {p_t - p_c:.4f}%") # difference in completion rates (percentage points in decimal form)

alpha = 0.05

Control: 7436/17327 = 42.9157%
Test   : 11897/22213 = 53.5587%
Diff (test - control): 10.6430%


**2) Generic two-proportion Z-test (one-sided) | Helpers to compute test statistics**

In [11]:
res_comp = two_proportion_ztest(x1=x_t, n1=n_t, x2=x_c, n2=n_c, alternative='larger', diff0=0.05, alpha=alpha)
res_comp

{'z_stat': 11.137729953054288,
 'p_value': 4.109377226772415e-29,
 'p_test': 0.5355872687165174,
 'p_control': 0.4291568072949732,
 'diff': 0.10643046142154422,
 'null_diff': 0.05,
 'ci_test': (0.5290230717273287, 0.542139159093394),
 'ci_control': (0.42180355901064576, 0.43654146100276475),
 'n_test': 22213,
 'x_test': 11897,
 'n_control': 17327,
 'x_control': 7436,
 'alternative': 'larger'}

**3) Completion rate ≥5 percentage-point threshold — H₀: p_test − p_ctrl ≤ 0.05 (one-sided “greater”)**

In [12]:
# Visualize with  existing function (normal z, one-sided "greater")
# n is only used to draw the reference curve; we pass a large-ish n for display.
show_statistical_test(
    statistic=res_comp['z_stat'],
    alpha=alpha,
    n=max(res_comp['n_test'], res_comp['n_control']),
    distribution='normal',
    alternative='greater'
)

**5) Error rate — H₀: p_err_test − p_err_ctrl ≥ 0 (one-sided “smaller”)**

In [13]:
# Define error as "at least one back-jump" over the process
err_c = int(kpis_control['n_back_jumps']); base_c = int(kpis_control['n_processes'])
err_t = int(kpis_test['n_back_jumps']); base_t =int(kpis_test['n_processes'])

res_err = two_proportion_ztest(err_t, base_t, err_c, base_c, alternative='smaller', diff0=0.005, alpha=alpha)
print(res_err)

show_statistical_test(
    statistic=res_err['z_stat'],
    alpha=alpha,
    n=max(res_err['n_test'], res_err['n_control']),
    distribution='normal',
    alternative='greater'  # test is "smaller" → lower tail
)


{'z_stat': 83.5343577058352, 'p_value': 1.0, 'p_test': 0.3365146535812362, 'p_control': 0.0, 'diff': 0.3365146535812362, 'null_diff': 0.005, 'ci_test': (0.3303295246804194, 0.3427563181716305), 'ci_control': (0.0, 0.00022165448976160245), 'n_test': 22213, 'x_test': 7475, 'n_control': 17327, 'x_control': 0, 'alternative': 'smaller'}



Calling int on a single element Series is deprecated and will raise a TypeError in the future. Use int(ser.iloc[0]) instead


Calling int on a single element Series is deprecated and will raise a TypeError in the future. Use int(ser.iloc[0]) instead



**6) Time (example with t_total) — H₀: mean_test − mean_ctrl ≥ 0 (one-sided “less”)**

In [14]:
metric = 't_total'  # switch to 't_start_step1', 't_step1_step2', etc.
res_time = welch_t_one_sided(test[metric], control[metric], alternative='less')
print(res_time)

# show_statistical_test needs df via n; we approximate df ≈ n_test + n_ctrl - 2 for display
show_statistical_test(
    statistic=res_time['w_stat'],
    alpha=alpha,
    n=(res_time['n_test'] + res_time['n_ctrl'] - 2),
    distribution='t-student',
    alternative='lower'  # "Test is faster" = lower mean → left tail
)

{'w_stat': -5.758094960007894, 'p_value': 4.3273921875238935e-09, 'n_test': 11897, 'n_ctrl': 7436}


**7) Optional: Stratified completion tests by age/tenure quantiles**

In [15]:
# By age quantile
comp_by_age = stratified_completion_tests(test, control, by_col='clnt_age_quantile', alpha=alpha)
display(comp_by_age)

Unnamed: 0,level,n_test,n_control,p_test,p_control,diff,z_stat,p_value
0,A1,5291,3778,0.618031,0.47856,0.139471,-34.09833,1.0
1,A2,5337,4101,0.568297,0.467691,0.100606,-38.513281,1.0
2,A3,5605,4315,0.509723,0.441483,0.06824,-42.671786,1.0
3,A4,5980,5133,0.457692,0.351646,0.106046,-42.117054,1.0


In [16]:
# By tenure quantile
comp_by_tenure = stratified_completion_tests(test, control, by_col='tenure_quantile', alpha=alpha)
display(comp_by_tenure)

Unnamed: 0,level,n_test,n_control,p_test,p_control,diff,z_stat,p_value
0,T1,6110,4570,0.549427,0.425164,0.124263,-38.425467,1.0
1,T2,5748,4361,0.546625,0.443935,0.10269,-39.569583,1.0
2,T3,5633,4366,0.531866,0.446404,0.085462,-41.120071,1.0
3,T4,4722,4030,0.508683,0.399007,0.109675,-36.529373,1.0


In [18]:
decision_line("Completion (TEST > CONTROL)", res_comp['p_value'], alpha, "higher completion in TEST")
#decision_line("≥5pp lift (TEST - CONTROL > 0.05)", res_comp_5pp['p'], alpha, "≥5pp improvement confirmed")
decision_line("Error rate (TEST < CONTROL)", res_err['p_value'], alpha, "lower error rate in TEST")
decision_line(f"Time {metric} (TEST faster)", res_time['p_value'], alpha, "TEST mean time is lower")

Completion (TEST > CONTROL): p=4.109e-29 → Reject H0; higher completion in TEST
Error rate (TEST < CONTROL): p=1 → Fail to reject H0.
Time t_total (TEST faster): p=4.327e-09 → Reject H0; TEST mean time is lower
