In [29]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

# Load the YAML configuration that maps logical names to the CSV output files.
# Only a missing file is translated into a clearer error; any other problem
# (e.g. malformed YAML) propagates unchanged instead of being silently
# swallowed and resurfacing later as an unrelated TypeError on `config[...]`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        "Yaml configuration file not found! Expected it at '../config.yaml'."
    ) from exc

df_proc_control_no_out = pd.read_csv(config['output_data']['file12'])   # web_data demo_control clients (for Sanity checks)
df_proc_test_no_out = pd.read_csv(config['output_data']['file14'])      # web_data demo_test clients (for Sanity checks)


In [30]:
# Print column dtypes and non-null counts for the control process table.
df_proc_control_no_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17327 entries, 0 to 17326
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   client_id        17327 non-null  int64  
 1   visitor_id       17327 non-null  object 
 2   visit_id         17327 non-null  object 
 3   reached_start    17327 non-null  int64  
 4   reached_step_1   17327 non-null  int64  
 5   reached_step_2   17327 non-null  int64  
 6   reached_step_3   17327 non-null  int64  
 7   reached_confirm  17327 non-null  int64  
 8   completed        17327 non-null  int64  
 9   t_start_step1    11512 non-null  float64
 10  t_step1_step2    9587 non-null   float64
 11  t_step2_step3    8634 non-null   float64
 12  t_step3_conf     7436 non-null   float64
 13  t_total          7436 non-null   float64
 14  n_back_jumps     17327 non-null  int64  
 15  outcome          17327 non-null  object 
dtypes: float64(5), int64(8), object(3)
memory usage: 2.1+ MB


In [31]:
# List the distinct `outcome` labels present in the control process table.
df_proc_control_no_out['outcome'].unique()

array(['fail', 'successful'], dtype=object)

In [32]:
# List the distinct `outcome` labels present in the test process table.
df_proc_test_no_out['outcome'].unique()

array(['successful', 'fail', 'completed_with_errors'], dtype=object)

In [33]:
# ========= 5) KPIs & Drop-off =========
def kpis_from_processes(proc: pd.DataFrame) -> pd.DataFrame:
    """Summarise a process-level table into a single row of KPIs.

    Parameters
    ----------
    proc : pd.DataFrame
        One row per process. The function reads the columns
        ``reached_start``, ``reached_step_1``, ``reached_step_2``,
        ``reached_step_3``, ``completed``, ``outcome``, ``t_total``,
        ``t_start_step1``, ``t_step1_step2``, ``t_step2_step3``,
        ``t_step3_conf`` and ``n_back_jumps``. The input is not modified.

    Returns
    -------
    pd.DataFrame
        A one-row frame with, in order: process count, number started,
        step/completion/outcome rates as percentages of processes that
        started, NaN-ignoring mean and median of every timing column
        (presumably minutes, going by the ``_min`` key suffix), and the mean
        number of back jumps.

    Notes
    -----
    A timing column with no valid (non-NaN) value yields NaN for its mean and
    median without triggering NumPy's "Mean of empty slice" / "All-NaN slice"
    RuntimeWarning.
    """
    def _nan_stat(series: pd.Series, reducer) -> float:
        # np.nanmean / np.nanmedian warn on all-NaN or empty input; return NaN
        # directly in that case so sparse cohorts do not spam warnings.
        if not series.notna().any():
            return float('nan')
        return float(reducer(series))

    started = int(proc['reached_start'].sum())
    # Guard against division by zero when no process reached the start step.
    denom = max(1, started)

    kpis = {
        'n_processes': int(len(proc)),
        'started': started,
    }

    # Funnel rates: share of started processes that reached each stage.
    for key, flag_col in (
        ('step1_rate_%', 'reached_step_1'),
        ('step2_rate_%', 'reached_step_2'),
        ('step3_rate_%', 'reached_step_3'),
        ('completion_rate_%', 'completed'),
    ):
        kpis[key] = 100 * proc[flag_col].sum() / denom

    # Outcome mix, again relative to started processes.
    for key, label in (
        ('successful_%', 'successful'),
        ('completed_with_errors_%', 'completed_with_errors'),
        ('fail%', 'fail'),
    ):
        kpis[key] = 100 * proc['outcome'].eq(label).sum() / denom

    # Mean and median of every timing column, ignoring missing values.
    for prefix, time_col in (
        ('t_total', 't_total'),
        ('t_step1', 't_start_step1'),
        ('t_step2', 't_step1_step2'),
        ('t_step3', 't_step2_step3'),
        ('t_conf', 't_step3_conf'),
    ):
        kpis[f'{prefix}_avg_min'] = _nan_stat(proc[time_col], np.nanmean)
        kpis[f'{prefix}_median_min'] = _nan_stat(proc[time_col], np.nanmedian)

    kpis['avg_back_jumps'] = float(proc['n_back_jumps'].mean())
    return pd.DataFrame([kpis])

In [34]:
def step_dropoff_table(proc: pd.DataFrame) -> pd.DataFrame:
    """Build the step-to-step conversion / drop-off table of the funnel.

    For each consecutive pair of stages the table reports how many processes
    reached the origin stage (``n_from``), how many reached the destination
    (``n_to``), the conversion rate in percent and its complement, the
    drop-off in percent. A transition whose origin count is zero gets a
    conversion rate of 0.0 (and therefore a drop-off of 100).

    The last transition counts ``completed`` as its destination.
    """
    # (label, origin flag column, destination flag column)
    transitions = (
        ("start→step_1", 'reached_start', 'reached_step_1'),
        ("step_1→step_2", 'reached_step_1', 'reached_step_2'),
        ("step_2→step_3", 'reached_step_2', 'reached_step_3'),
        ("step_3→confirm", 'reached_step_3', 'completed'),
    )

    # Sum every flag column once and reuse the totals for both table sides.
    totals = {
        col: int(proc[col].sum())
        for _, origin, target in transitions
        for col in (origin, target)
    }

    funnel = pd.DataFrame(
        [(label, totals[origin], totals[target]) for label, origin, target in transitions],
        columns=['from_to', 'n_from', 'n_to'],
    )

    # Mask zero origins so the division yields NaN, which is then mapped to 0.
    nonzero_from = funnel['n_from'].where(funnel['n_from'] != 0)
    funnel['conv_rate_%'] = (100 * funnel['n_to'] / nonzero_from).fillna(0.0)
    funnel['dropoff_%'] = 100 - funnel['conv_rate_%']
    return funnel

In [35]:
# Control group: one-row KPI summary and step-by-step funnel table.
kpis_control = kpis_from_processes(df_proc_control_no_out)
dropoff_control = step_dropoff_table(df_proc_control_no_out)

# Render a sample of the process rows, then the KPI row and the drop-off table.
display(df_proc_control_no_out.head(5), kpis_control, dropoff_control)

print(f"Valid processes: {len(df_proc_control_no_out)}")

# How many different control clients appear in the web data.
print(f"Distinct control clients with activity: {df_proc_control_no_out['client_id'].nunique()}")

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,1186,446844663_31615102958,507052512_11309370126_442139,1,0,0,0,0,0,,,,,,0,fail
1,1186,446844663_31615102958,795373564_99931517312_810896,1,1,1,0,0,0,0.183333,0.183333,,,,0,fail
2,1195,766842522_69992551638,393817425_39015278493_996341,1,1,1,1,1,1,0.55,1.633333,0.516667,1.383333,4.083333,0,successful
3,1368,366307863_19014662045,784065271_45379483290_309335,1,0,0,0,0,0,,,,,,0,fail
4,2439,607208067_70160939111,848231744_22569944243_37711,1,0,0,0,0,0,,,,,,0,fail


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%,t_total_avg_min,t_total_median_min,t_step1_avg_min,t_step1_median_min,t_step2_avg_min,t_step2_median_min,t_step3_avg_min,t_step3_median_min,t_conf_avg_min,t_conf_median_min,avg_back_jumps
0,17327,17327,66.439661,55.329832,49.829745,42.915681,42.915681,0.0,57.084319,4.560866,4.0,0.593359,0.366667,0.58524,0.35,1.500851,1.216667,1.915156,1.383333,0.0


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,17327,11512,66.439661,33.560339
1,step_1→step_2,11512,9587,83.278318,16.721682
2,step_2→step_3,9587,8634,90.059456,9.940544
3,step_3→confirm,8634,7436,86.124624,13.875376


Valid processes: 17327
Distinct control clients with activity: 14174


In [36]:
# Test group: one-row KPI summary and step-by-step funnel table.
kpis_test = kpis_from_processes(df_proc_test_no_out)                        # KPIs
dropoff_test = step_dropoff_table(df_proc_test_no_out)

display(df_proc_test_no_out.head(5))
display(kpis_test)
display(dropoff_test)

print("Valid processes:", len(df_proc_test_no_out))

# Count distinct test clients represented in web data.
# (The label previously said "control", a copy-paste slip from the control cell.)
print("Distinct test clients with activity:", df_proc_test_no_out['client_id'].nunique())

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,555,402506806_56087378777,637149525_38041617439_716659,1,1,1,1,1,1,0.116667,0.533333,1.65,0.333333,2.633333,0,successful
1,647,66758770_53988066587,40369564_40101682850_311847,1,1,1,1,1,1,0.116667,0.3,3.15,2.716667,6.283333,0,successful
2,934,810392784_45004760546,7076463_57954418406_971348,1,0,0,0,0,0,,,,,,0,fail
3,1516,182314299_63168583136,481123290_95510977345_707323,1,1,1,0,0,0,4.083333,,,,,3,fail
4,1643,910842861_51062379179,379946188_1773022140_107963,1,0,0,0,0,0,,,,,,0,fail


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%,t_total_avg_min,t_total_median_min,t_step1_avg_min,t_step1_median_min,t_step2_avg_min,t_step2_median_min,t_step3_avg_min,t_step3_median_min,t_conf_avg_min,t_conf_median_min,avg_back_jumps
0,22213,22213,83.595192,70.791879,63.251249,53.558727,43.861703,9.697024,46.441273,4.349371,3.616667,1.1773,0.266667,0.542394,0.4,1.327059,1.1,1.460345,0.9,0.336515


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,22213,18569,83.595192,16.404808
1,step_1→step_2,18569,15725,84.684151,15.315849
2,step_2→step_3,15725,14050,89.348172,10.651828
3,step_3→confirm,14050,11897,84.676157,15.323843


Valid processes: 22213
Distinct control clients with activity: 19095
