In [33]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

# Fail fast: every dataset path below comes from the configuration, so there is
# nothing useful the notebook can do without it. Only the "file missing" case is
# reported with the friendly message; YAML syntax errors propagate unchanged so
# they are not misreported as a missing file.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

# An empty YAML file parses to None; catch that (and a missing section) here
# rather than with an opaque TypeError/KeyError on the first lookup.
if not isinstance(config, dict) or 'output_data' not in config:
    raise ValueError("config.yaml must define an 'output_data' section")

df_demo_test = pd.read_csv(config['output_data']['file3'])          # demo_test clients
df_demo_control = pd.read_csv(config['output_data']['file4'])       # demo_control clients
df_web_data = pd.read_csv(config['output_data']['file5'])           # web_data clean
df_web_data_control = pd.read_csv(config['output_data']['file6'])   # web_data demo_control clients (for Sanity checks)
df_web_data_test = pd.read_csv(config['output_data']['file7'])      # web_data demo_test clients (for Sanity checks)


In [34]:
# Column names, dtypes and non-null counts of the control-group demographics table.
df_demo_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23526 entries, 0 to 23525
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   client_id          23526 non-null  int64  
 1   clnt_tenure_yr     23526 non-null  float64
 2   clnt_tenure_mnth   23526 non-null  float64
 3   clnt_age           23526 non-null  float64
 4   num_accts          23526 non-null  float64
 5   bal                23526 non-null  float64
 6   calls_6_mnth       23526 non-null  float64
 7   logons_6_mnth      23526 non-null  float64
 8   Variation          23526 non-null  object 
 9   clnt_age_quantile  23526 non-null  object 
 10  tenure_quantile    23526 non-null  object 
dtypes: float64(7), int64(1), object(3)
memory usage: 2.0+ MB


In [35]:
# Column names, dtypes and non-null counts of the test-group demographics table.
df_demo_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26961 entries, 0 to 26960
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   client_id          26961 non-null  int64  
 1   clnt_tenure_yr     26961 non-null  float64
 2   clnt_tenure_mnth   26961 non-null  float64
 3   clnt_age           26961 non-null  float64
 4   num_accts          26961 non-null  float64
 5   bal                26961 non-null  float64
 6   calls_6_mnth       26961 non-null  float64
 7   logons_6_mnth      26961 non-null  float64
 8   Variation          26961 non-null  object 
 9   clnt_age_quantile  26961 non-null  object 
 10  tenure_quantile    26961 non-null  object 
dtypes: float64(7), int64(1), object(3)
memory usage: 2.3+ MB


In [36]:
# ========= Config =========
# Funnel steps, listed in the order used to detect forward/backward moves.
STEP_ORDER = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
# Position of every step inside STEP_ORDER (start -> 0, ..., confirm -> 4).
STEP_MAP = dict(zip(STEP_ORDER, range(len(STEP_ORDER))))
# Columns that together identify one process.
KEY = ['client_id', 'visitor_id', 'visit_id']

def _to_utc(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors='coerce')
    if getattr(s.dt, "tz", None) is None:
        return s.dt.tz_localize("UTC")
    return s.dt.tz_convert("UTC")

def _standardize_steps(df):
    """Return a copy of ``df`` with normalized ``process_step`` labels.

    Labels are lower-cased, stripped of surrounding whitespace, spaces are
    turned into underscores and ``step1``/``step2``/``step3`` are rewritten as
    ``step_1``/``step_2``/``step_3``. Rows whose normalized label is not one of
    ``STEP_ORDER`` are dropped from the result.
    """
    out = df.copy()
    labels = out['process_step'].str.lower().str.strip()
    # Applied in this order: spaces first, then the un-separated step names.
    for old, new in ((' ', '_'),
                     ('step1', 'step_1'),
                     ('step2', 'step_2'),
                     ('step3', 'step_3')):
        labels = labels.str.replace(old, new)
    out['process_step'] = labels
    return out[out['process_step'].isin(STEP_ORDER)]


In [37]:
# ========= 1) Filtered to the group and normalization =========
def prepare_web_for_group(df_web_data, df_demo_group):
    """Select and normalize the web events belonging to one client group.

    Keeps the rows of ``df_web_data`` whose ``client_id`` appears in
    ``df_demo_group``, normalizes step labels via ``_standardize_steps``,
    converts ``date_time`` to UTC via ``_to_utc`` and returns the events
    sorted by ``KEY`` and then ``date_time`` with a fresh 0..n-1 index.
    """
    group_ids = set(df_demo_group['client_id'].unique())
    belongs_to_group = df_web_data['client_id'].isin(group_ids)
    events = _standardize_steps(df_web_data[belongs_to_group].copy())
    events['date_time'] = _to_utc(events['date_time'])
    # Chronological order inside each process (KEY) is what later steps rely on.
    return events.sort_values(KEY + ['date_time']).reset_index(drop=True)

In [38]:
# ========= 2) Complete sequence → step-backs =========
def compute_back_jumps(w_full: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Annotate every event with its step movement and count step-backs per process.

    Parameters
    ----------
    w_full : pd.DataFrame
        Event table containing the ``KEY`` columns and ``process_step``.
        Each row is compared with the row that precedes it inside the same
        process, so rows must already be in chronological order within each
        ``KEY`` group (``prepare_web_for_group`` sorts them that way).

    Returns
    -------
    (wf, back)
        ``wf``   - copy of ``w_full`` with ``step_idx``, ``prev_step_idx``,
                   ``delta`` and ``is_back_jump`` added. The first event of a
                   process has no predecessor, so its ``prev_step_idx``,
                   ``delta`` and ``is_back_jump`` are missing.
        ``back`` - one row per process with ``n_back_jumps``, the number of
                   events whose step index is lower than the previous one.
    """
    wf = w_full.copy()
    # Nullable integer dtype so that unmapped steps / missing predecessors stay <NA>.
    wf['step_idx'] = wf['process_step'].map(STEP_MAP).astype('Int64')
    wf['prev_step_idx'] = wf.groupby(KEY)['step_idx'].shift(1)
    wf['delta'] = wf['step_idx'] - wf['prev_step_idx']
    # A negative delta means the client moved to an earlier step.
    wf['is_back_jump'] = wf['delta'].lt(0)
    back = (wf.groupby(KEY)
              .agg(n_back_jumps=('is_back_jump', 'sum'))
              .reset_index())
    return wf, back

In [39]:
# ========= 3) Last occurrences per step and last confirm =========
def collapse_last_per_step_and_last_confirm(wf: pd.DataFrame) -> pd.DataFrame:
    '''
    Reduce an event table to at most one row per (process, step).

    For each process (KEY):
      - confirm: only the last confirm row (in the row order of `wf`) survives.
      - start:   the earliest start (by date_time) is kept.
      - step_1, step_2, step_3, confirm: the latest occurrence (by date_time) is kept.

    The result is sorted by KEY, then process_step, then date_time, and has a
    fresh 0..n-1 index.
    '''
    step_key = KEY + ['process_step']

    # A confirm row is superseded when a later row with the same process and
    # step exists; those intermediate confirms are removed up front.
    confirm_rows = wf['process_step'].eq('confirm')
    superseded_confirm = confirm_rows & wf.duplicated(subset=step_key, keep='last')
    events = wf.loc[~superseded_confirm].copy()

    start_rows = events['process_step'].eq('start')

    # Earliest start per process.
    first_start = (
        events[start_rows].copy()
                          .sort_values(KEY + ['date_time'])
                          .drop_duplicates(subset=step_key, keep='first')
    )

    # Latest occurrence of every non-start step per process.
    last_other = (
        events[~start_rows].copy()
                           .sort_values(step_key + ['date_time'])
                           .drop_duplicates(subset=step_key, keep='last')
    )

    combined = pd.concat([first_start, last_other], ignore_index=True)
    return combined.sort_values(step_key + ['date_time']).reset_index(drop=True)

In [40]:
# ========= 4) Table by process (1 row = 1 process) =========
def summarize_processes(wf2: pd.DataFrame, back: pd.DataFrame) -> pd.DataFrame:
    """Build the per-process summary table (one row per KEY).

    Parameters
    ----------
    wf2 : collapsed event table holding the KEY columns, ``process_step`` and
          ``date_time``.
    back : per-process table holding the KEY columns and ``n_back_jumps``.

    Returns
    -------
    DataFrame with the KEY columns, ``reached_<step>`` flags (1/0) for every
    step in STEP_ORDER, ``completed`` (copy of ``reached_confirm``), the
    step-to-step durations and ``t_total`` in minutes, ``n_back_jumps``
    (0 when the process is absent from ``back``) and ``outcome``.
    """
    # Timestamp pivot: one row per process, one column per step in STEP_ORDER
    # (steps never observed come out as all-missing columns).
    pv = wf2.pivot_table(index=KEY, columns='process_step',
                         values='date_time', aggfunc='last')
    pv = pv.reindex(columns=STEP_ORDER)

    # Store everything as timezone-naive UTC so the columns can be subtracted.
    for step in pv.columns:
        pv[step] = _to_utc(pv[step]).dt.tz_localize(None)

    # Reached / completed flags.
    flags = pv.notna().astype(int).add_prefix('reached_')
    flags['completed'] = flags['reached_confirm']

    # Durations in minutes: output column -> (earlier step, later step).
    one_minute = pd.Timedelta(minutes=1)
    spans = {
        't_start_step1': ('start', 'step_1'),
        't_step1_step2': ('step_1', 'step_2'),
        't_step2_step3': ('step_2', 'step_3'),
        't_step3_conf': ('step_3', 'confirm'),
        't_total': ('start', 'confirm'),
    }
    times = pd.DataFrame(index=pv.index)
    for column, (earlier, later) in spans.items():
        times[column] = (pv[later] - pv[earlier]) / one_minute

    proc = flags.join(times).reset_index()
    proc = proc.merge(back, on=KEY, how='left').fillna({'n_back_jumps': 0})

    # Outcome:
    # - successful: completed and no step-backs
    # - completed_with_errors: completed with step-backs
    # - fail: everything else
    finished = proc['completed'] == 1
    no_step_backs = proc['n_back_jumps'] == 0
    some_step_backs = proc['n_back_jumps'] > 0
    proc['outcome'] = np.where(
        finished & no_step_backs, 'successful',
        np.where(finished & some_step_backs, 'completed_with_errors', 'fail')
    )
    return proc

In [41]:
# ========= 5) KPIs & Drop-off =========
def kpis_from_processes(proc: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a per-process table into a single-row KPI table.

    All percentages use the number of processes that reached ``start`` as the
    denominator (forced to at least 1 to avoid dividing by zero). Durations
    are summarized with NaN-ignoring mean and median; ``avg_back_jumps`` is
    the plain mean of ``n_back_jumps``.
    """
    d = proc.copy()
    n_started = int(d['reached_start'].sum())
    denom = max(1, n_started)   # who started

    def pct(count):
        return 100 * count / denom

    kpis = {'n_processes': int(len(d)), 'started': n_started}

    # Share of started processes that reached each later step.
    for kpi_name, flag_col in (('step1_rate_%', 'reached_step_1'),
                               ('step2_rate_%', 'reached_step_2'),
                               ('step3_rate_%', 'reached_step_3'),
                               ('completion_rate_%', 'completed')):
        kpis[kpi_name] = pct(d[flag_col].sum())

    # Share of started processes per outcome label.
    for kpi_name, label in (('successful_%', 'successful'),
                            ('completed_with_errors_%', 'completed_with_errors'),
                            ('fail%', 'fail')):
        kpis[kpi_name] = pct(d['outcome'].eq(label).sum())

    # Mean and median duration per stage, in the units of the input columns.
    for prefix, time_col in (('t_total', 't_total'),
                             ('t_step1', 't_start_step1'),
                             ('t_step2', 't_step1_step2'),
                             ('t_step3', 't_step2_step3'),
                             ('t_conf', 't_step3_conf')):
        kpis[f'{prefix}_avg_min'] = float(np.nanmean(d[time_col]))
        kpis[f'{prefix}_median_min'] = float(np.nanmedian(d[time_col]))

    kpis['avg_back_jumps'] = float(d['n_back_jumps'].mean())
    return pd.DataFrame([kpis])

In [42]:
def step_dropoff_table(proc: pd.DataFrame) -> pd.DataFrame:
    """Build the step-to-step conversion / drop-off table.

    One row per consecutive pair of funnel stages with the number of
    processes that reached the origin stage (``n_from``), the destination
    stage (``n_to``), the conversion rate in percent and its complement
    (``dropoff_%``). A pair whose origin count is 0 gets a conversion rate of
    0 and therefore a drop-off of 100.
    """
    # (label, column counted for that stage); the final stage counts `completed`.
    stages = [
        ('start', 'reached_start'),
        ('step_1', 'reached_step_1'),
        ('step_2', 'reached_step_2'),
        ('step_3', 'reached_step_3'),
        ('confirm', 'completed'),
    ]
    reached = [(label, int(proc[column].sum())) for label, column in stages]

    rows = [
        (f"{src}→{dst}", n_src, n_dst)
        for (src, n_src), (dst, n_dst) in zip(reached, reached[1:])
    ]
    out = pd.DataFrame(rows, columns=['from_to', 'n_from', 'n_to'])

    # Zero origins become NaN so the division yields NaN instead of inf.
    nonzero_from = out['n_from'].replace({0: np.nan})
    out['conv_rate_%'] = (100 * out['n_to'] / nonzero_from).fillna(0.0)
    out['dropoff_%'] = 100 - out['conv_rate_%']
    return out

In [43]:
def stranger_things(proc_df: pd.DataFrame, key_cols=(KEY)):
    '''
    Split a summarized process table into consistent rows and anomalies.

    A row is an anomaly when either:
    - a step is flagged as reached while any earlier step (start included)
      is flagged as not reached, or
    - t_total is negative.

    Missing flag columns, `completed` and `t_total` are created filled with 0
    before checking. `key_cols` is accepted but not used by the current logic.

    Returns:
    (proc_clean, proc_anomalies), both re-indexed from 0. `proc_anomalies`
    carries an extra `reason` column with the highest-priority rule that
    matched (confirm > step_3 > step_2 > step_1 > negative t_total).
    '''
    df = proc_df.copy()

    funnel_flags = ['reached_start', 'reached_step_1', 'reached_step_2',
                    'reached_step_3', 'reached_confirm']
    for column in funnel_flags + ['completed', 't_total']:
        if column not in df.columns:
            df[column] = 0

    # Walk the funnel once: `earlier_missing` is True where at least one of the
    # steps before the current one was not reached.
    gap = {}
    earlier_missing = df['reached_start'].eq(0)
    for flag in funnel_flags[1:]:
        gap[flag] = df[flag].eq(1) & earlier_missing
        earlier_missing = earlier_missing | df[flag].eq(0)

    negative_total = df['t_total'] < 0

    # A row is bad when any rule fires.
    is_bad = negative_total
    for mask in gap.values():
        is_bad = is_bad | mask

    # First matching condition wins, so the list order encodes the priority.
    df['reason'] = np.select(
        [
            gap['reached_confirm'],
            gap['reached_step_3'],
            gap['reached_step_2'],
            gap['reached_step_1'],
            negative_total,
        ],
        [
            "reached_confirm with missing previous step(s)",
            "reached_step_3 with missing previous step(s)",
            "reached_step_2 with missing previous step(s)",
            "reached_step_1 with reached_start==0",
            "first chronological event != start",
        ],
        default=""
    )

    anomalies = df.loc[is_bad].copy()
    clean = df.loc[~is_bad].drop(columns=['reason'])

    return clean.reset_index(drop=True), anomalies.reset_index(drop=True)


In [44]:
def set_negative_to_nan(df: pd.DataFrame,
                        cols=('t_start_step1', 't_step1_step2', 't_step2_step3', 't_step3_conf'),
                        inplace: bool = False) -> pd.DataFrame:
    """
    Replace negative values with NaN in the selected columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing the columns listed in `cols`.
    cols : iterable of str, or a single column name
        Columns to clean. The default covers the four step-to-step duration
        columns, which is what the function has always processed when called
        without `cols`.
    inplace : bool
        When True the columns of `df` itself are overwritten; otherwise a copy
        is modified and `df` is left untouched.

    Returns
    -------
    pd.DataFrame
        The cleaned frame (`df` itself when `inplace=True`).

    Raises
    ------
    KeyError
        If a name in `cols` is not a column of `df`.

    Each column is first coerced to numeric (non-numeric entries become NaN);
    values that are already NaN are kept as NaN.
    """
    d = df if inplace else df.copy()
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    for c in cols:
        values = pd.to_numeric(d[c], errors='coerce')
        # NaN < 0 is False, so existing NaNs pass through unchanged.
        d[c] = values.mask(values < 0, np.nan)
    return d

In [51]:
# ========= 6) RUN (CONTROL) =========
w_control = prepare_web_for_group(df_web_data, df_demo_control)               # filters and normalizes
wf, back = compute_back_jumps(w_control)                                      # complete sequence → step-backs
wf2 = collapse_last_per_step_and_last_confirm(wf)                             # collapses to one row per (process, step)
proc_control = summarize_processes(wf2, back)                                 # 1 row per process + outcome
proc_control_clean, proc_control_anomalies = stranger_things(proc_control)    # splits off inconsistent processes
proc_control_clean = set_negative_to_nan(proc_control_clean)                  # negative step durations → NaN
kpis_control = kpis_from_processes(proc_control_clean)                        # KPIs
dropoff_control = step_dropoff_table(proc_control_clean)                      # step-to-step conversion

display(proc_control_clean.head(15))
display(kpis_control)
display(dropoff_control)

print("Valid processes:", len(proc_control_clean))
print("Processes driven by inconsistencies:", len(proc_control_anomalies))

# Count distinct control clients represented in web data
print("Distinct control clients with activity:", proc_control_clean['client_id'].nunique())

# Save files
proc_control_clean.to_csv(config['output_data']['file8'], index=False, sep=",", encoding="utf-8")
proc_control_anomalies.to_csv(config['output_data']['file10'], index=False, sep=",", encoding="utf-8")

print('\n======================================================================')
print(' ***   Sanity Checks:')
print('======================================================================')

# Clients to inspect by hand at every pipeline stage.
client_id=[613664]
# client_id=[4079649, 4033851, 5305116, 1028, 69255]

# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin for every cell executed afterwards.
for cid in client_id:
    display(f"Client {cid} summarize_processes:",
            proc_control_clean[proc_control_clean['client_id'] == cid])

    display(f"Client {cid} processes before collapse_last_per_step_and_last_confirm:",
            wf[wf['client_id'] == cid])

    display(f"Client {cid} processes after collapse_last_per_step_and_last_confirm:",
            wf2[wf2['client_id'] == cid].sort_values(by=['date_time'],ascending=True))

    display(f"Client {cid} processes in raw we_data:",
            df_web_data_control[df_web_data_control['client_id'] == cid].sort_values(by='date_time'))
            

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,1028,42237450_62128060588,557292053_87239438319_391157,1,1,1,1,0,0,8.966667,,,,,2,fail
1,1186,446844663_31615102958,507052512_11309370126_442139,1,0,0,0,0,0,,,,,,0,fail
2,1186,446844663_31615102958,795373564_99931517312_810896,1,1,1,0,0,0,0.183333,0.183333,,,,0,fail
3,1195,766842522_69992551638,393817425_39015278493_996341,1,1,1,1,1,1,0.55,1.633333,0.516667,1.383333,4.083333,0,successful
4,1197,753759429_54481946928,71862471_21202285428_848395,1,1,1,1,1,1,0.066667,1.133333,0.1,0.283333,1.583333,1,completed_with_errors
5,1368,366307863_19014662045,784065271_45379483290_309335,1,0,0,0,0,0,,,,,,0,fail
6,2439,607208067_70160939111,848231744_22569944243_37711,1,0,0,0,0,0,,,,,,0,fail
7,2581,770616558_80928163524,182925466_27021409208_83502,1,0,0,0,0,0,,,,,,0,fail
8,3647,742136411_92170694666,492965200_82386590930_211259,1,0,0,0,0,0,,,,,,0,fail
9,5354,233679354_36495397627,192604721_14077630095_819213,1,1,1,1,1,1,0.216667,0.133333,0.283333,0.866667,1.5,0,successful


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%,t_total_avg_min,t_total_median_min,t_step1_avg_min,t_step1_median_min,t_step2_avg_min,t_step2_median_min,t_step3_avg_min,t_step3_median_min,t_conf_avg_min,t_conf_median_min,avg_back_jumps
0,22819,22819,74.51685,63.324423,57.254919,48.012621,36.776371,11.23625,51.987379,6.785173,4.783333,2.222788,0.516667,0.925299,0.4,1.607815,1.116667,2.339446,1.45,0.26066


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,22819,17004,74.51685,25.48315
1,step_1→step_2,17004,14450,84.980005,15.019995
2,step_2→step_3,14450,13065,90.415225,9.584775
3,step_3→confirm,13065,10956,83.857635,16.142365


Valid processes: 22819
Processes driven by inconsistencies: 541
Distinct control clients with activity: 17892

 ***   Sanity Checks:


'Client 613664 summarize_processes:'

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
1382,613664,41000179_53529812548,363881287_4970467575_714595,1,1,0,0,0,0,0.25,,,,,0,fail


'Client 613664 processes before collapse_last_per_step_and_last_confirm:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,step_idx,prev_step_idx,delta,is_back_jump
6004,613664,41000179_53529812548,363881287_4970467575_714595,start,2017-04-08 19:11:05+00:00,0,,,
6005,613664,41000179_53529812548,363881287_4970467575_714595,step_1,2017-04-08 19:11:20+00:00,1,0.0,1.0,False
6006,613664,41000179_53529812548,461322374_46377280172_415348,step_2,2017-04-08 19:11:50+00:00,2,,,
6007,613664,41000179_53529812548,461322374_46377280172_415348,step_3,2017-04-08 19:16:14+00:00,3,2.0,1.0,False
6008,613664,41000179_53529812548,461322374_46377280172_415348,confirm,2017-04-08 19:16:54+00:00,4,3.0,1.0,False
6009,613664,41000179_53529812548,461322374_46377280172_415348,start,2017-04-08 19:18:18+00:00,0,4.0,-4.0,True
6010,613664,41000179_53529812548,461322374_46377280172_415348,step_1,2017-04-08 19:18:38+00:00,1,0.0,1.0,False


'Client 613664 processes after collapse_last_per_step_and_last_confirm:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,step_idx,prev_step_idx,delta,is_back_jump
4879,613664,41000179_53529812548,363881287_4970467575_714595,start,2017-04-08 19:11:05+00:00,0,,,
4880,613664,41000179_53529812548,363881287_4970467575_714595,step_1,2017-04-08 19:11:20+00:00,1,0.0,1.0,False
4884,613664,41000179_53529812548,461322374_46377280172_415348,step_2,2017-04-08 19:11:50+00:00,2,,,
4885,613664,41000179_53529812548,461322374_46377280172_415348,step_3,2017-04-08 19:16:14+00:00,3,2.0,1.0,False
4881,613664,41000179_53529812548,461322374_46377280172_415348,confirm,2017-04-08 19:16:54+00:00,4,3.0,1.0,False
4882,613664,41000179_53529812548,461322374_46377280172_415348,start,2017-04-08 19:18:18+00:00,0,4.0,-4.0,True
4883,613664,41000179_53529812548,461322374_46377280172_415348,step_1,2017-04-08 19:18:38+00:00,1,0.0,1.0,False


'Client 613664 processes in raw we_data:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
56120,613664,41000179_53529812548,363881287_4970467575_714595,start,2017-04-08 19:11:05
56123,613664,41000179_53529812548,363881287_4970467575_714595,step_1,2017-04-08 19:11:20
56127,613664,41000179_53529812548,461322374_46377280172_415348,step_2,2017-04-08 19:11:50
56150,613664,41000179_53529812548,461322374_46377280172_415348,step_3,2017-04-08 19:16:14
56152,613664,41000179_53529812548,461322374_46377280172_415348,confirm,2017-04-08 19:16:54
56159,613664,41000179_53529812548,461322374_46377280172_415348,start,2017-04-08 19:18:18
56161,613664,41000179_53529812548,461322374_46377280172_415348,step_1,2017-04-08 19:18:38


In [49]:
# Sanity check: list any control processes whose step_3→confirm duration is still negative.
proc_control_clean.loc[proc_control_clean['t_step3_conf'].lt(0)]

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome


In [47]:
# ========= 6) RUN (TEST) =========
w_test = prepare_web_for_group(df_web_data, df_demo_test)               # filters and normalizes
# NOTE: `back` is rebound here and from now on holds the test-group step-backs.
wf_test, back = compute_back_jumps(w_test)                              # complete sequence → step-backs
wf2_test = collapse_last_per_step_and_last_confirm(wf_test)             # collapses to one row per (process, step)
proc_test = summarize_processes(wf2_test, back)                         # 1 row per process + outcome
proc_test_clean, proc_test_anomalies = stranger_things(proc_test)       # splits off inconsistent processes
proc_test_clean = set_negative_to_nan(proc_test_clean)                  # negative step durations → NaN
kpis_test = kpis_from_processes(proc_test_clean)                        # KPIs
dropoff_test = step_dropoff_table(proc_test_clean)                      # step-to-step conversion

display(proc_test_clean.head(15))
display(kpis_test)
display(dropoff_test)

print("Valid processes:", len(proc_test_clean))
print("Processes driven by inconsistencies:", len(proc_test_anomalies))

# Count distinct test clients represented in web data
print("Distinct test clients with activity:", proc_test_clean['client_id'].nunique())

# Save files
proc_test_clean.to_csv(config['output_data']['file9'], index=False, sep=",", encoding="utf-8")
proc_test_anomalies.to_csv(config['output_data']['file11'], index=False, sep=",", encoding="utf-8")

print('\n======================================================================')
print(' ***   Sanity Checks:')
print('======================================================================')

# Clients to inspect by hand at every pipeline stage.
client_id=[4600680]
# client_id=[7230446, 555, 984576]

# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin for every cell executed afterwards.
for cid in client_id:
    display(f"Client {cid} summarize_processes:",
            proc_test_clean[proc_test_clean['client_id'] == cid])

    display(f"Client {cid} processes before collapse_last_per_step_and_last_confirm:",
            wf_test[wf_test['client_id'] == cid].sort_values(by=['date_time'],ascending=True))

    display(f"Client {cid} processes after collapse_last_per_step_and_last_confirm:",
            wf2_test[wf2_test['client_id'] == cid].sort_values(by=['date_time'],ascending=True))

    display(f"Client {cid} processes in raw we_data:",
            df_web_data_test[df_web_data_test['client_id'] == cid].sort_values(by='date_time'))

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,555,402506806_56087378777,637149525_38041617439_716659,1,1,1,1,1,1,0.116667,0.533333,1.65,0.333333,2.633333,0,successful
1,647,66758770_53988066587,40369564_40101682850_311847,1,1,1,1,1,1,0.116667,0.3,3.15,2.716667,6.283333,0,successful
2,934,810392784_45004760546,7076463_57954418406_971348,1,0,0,0,0,0,,,,,,0,fail
3,1516,182314299_63168583136,255400977_38039535960_779641,1,1,1,1,1,1,0.933333,0.583333,10.116667,4.266667,15.9,0,successful
4,1516,182314299_63168583136,481123290_95510977345_707323,1,1,1,0,0,0,4.083333,,,,,3,fail
5,1643,910842861_51062379179,379946188_1773022140_107963,1,0,0,0,0,0,,,,,,0,fail
6,1643,910842861_51062379179,633860590_96880450633_976109,1,1,0,0,0,0,0.416667,,,,,1,fail
7,1755,230596682_29327090182,420603142_53705621002_674558,1,1,1,1,1,1,2.016667,0.316667,0.45,0.933333,3.716667,0,successful
8,1755,230596682_29327090182,690855460_62242406510_53318,1,1,1,1,0,0,0.1,0.85,0.65,,,0,fail
9,1836,619440135_40983729273,275887696_51740057136_798210,1,1,0,0,0,0,2.516667,,,,,1,fail


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,fail%,t_total_avg_min,t_total_median_min,t_step1_avg_min,t_step1_median_min,t_step2_avg_min,t_step2_median_min,t_step3_avg_min,t_step3_median_min,t_conf_avg_min,t_conf_median_min,avg_back_jumps
0,25932,25932,85.947864,74.332871,66.894185,55.360173,42.333796,13.026377,44.639827,6.984293,4.3,2.573227,0.35,0.873905,0.433333,1.664498,1.116667,2.463916,1.033333,0.479022


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,25932,22288,85.947864,14.052136
1,step_1→step_2,22288,19276,86.486001,13.513999
2,step_2→step_3,19276,17347,89.992737,10.007263
3,step_3→confirm,17347,14356,82.757826,17.242174


Valid processes: 25932
Processes driven by inconsistencies: 2884
Distinct test clients with activity: 21739

 ***   Sanity Checks:


'Client 4600680 summarize_processes:'

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
11946,4600680,208827041_70541462157,672737966_14726163487_971608,1,0,0,0,0,0,,,,,,0,fail
11947,4600680,208827041_70541462157,935958091_89261622321_315288,1,0,0,0,0,0,,,,,,0,fail


'Client 4600680 processes before collapse_last_per_step_and_last_confirm:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,step_idx,prev_step_idx,delta,is_back_jump
63809,4600680,208827041_70541462157,935958091_89261622321_315288,start,2017-04-05 21:31:00+00:00,0,,,
63808,4600680,208827041_70541462157,672737966_14726163487_971608,start,2017-04-08 08:40:30+00:00,0,,,
63800,4600680,208827041_70541462157,571402946_85481266703_66292,step_1,2017-04-08 09:12:37+00:00,1,,,
63801,4600680,208827041_70541462157,571402946_85481266703_66292,step_2,2017-04-08 09:12:49+00:00,2,1.0,1.0,False
63802,4600680,208827041_70541462157,571402946_85481266703_66292,step_3,2017-04-08 09:16:37+00:00,3,2.0,1.0,False
63803,4600680,208827041_70541462157,571402946_85481266703_66292,step_2,2017-04-08 09:20:56+00:00,2,3.0,-1.0,True
63804,4600680,208827041_70541462157,571402946_85481266703_66292,step_3,2017-04-08 09:21:04+00:00,3,2.0,1.0,False
63805,4600680,208827041_70541462157,571402946_85481266703_66292,confirm,2017-04-08 09:23:40+00:00,4,3.0,1.0,False
63806,4600680,208827041_70541462157,571402946_85481266703_66292,start,2017-04-08 09:28:21+00:00,0,4.0,-4.0,True
63807,4600680,208827041_70541462157,571402946_85481266703_66292,step_1,2017-04-08 09:28:23+00:00,1,0.0,1.0,False


'Client 4600680 processes after collapse_last_per_step_and_last_confirm:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,step_idx,prev_step_idx,delta,is_back_jump
47361,4600680,208827041_70541462157,935958091_89261622321_315288,start,2017-04-05 21:31:00+00:00,0,,,
47360,4600680,208827041_70541462157,672737966_14726163487_971608,start,2017-04-08 08:40:30+00:00,0,,,
47358,4600680,208827041_70541462157,571402946_85481266703_66292,step_2,2017-04-08 09:20:56+00:00,2,3.0,-1.0,True
47359,4600680,208827041_70541462157,571402946_85481266703_66292,step_3,2017-04-08 09:21:04+00:00,3,2.0,1.0,False
47355,4600680,208827041_70541462157,571402946_85481266703_66292,confirm,2017-04-08 09:23:40+00:00,4,3.0,1.0,False
47356,4600680,208827041_70541462157,571402946_85481266703_66292,start,2017-04-08 09:28:21+00:00,0,4.0,-4.0,True
47357,4600680,208827041_70541462157,571402946_85481266703_66292,step_1,2017-04-08 09:28:23+00:00,1,0.0,1.0,False


'Client 4600680 processes in raw we_data:'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
63279,4600680,208827041_70541462157,935958091_89261622321_315288,start,2017-04-05 21:31:00
73395,4600680,208827041_70541462157,672737966_14726163487_971608,start,2017-04-08 08:40:30
73474,4600680,208827041_70541462157,571402946_85481266703_66292,step_1,2017-04-08 09:12:37
73475,4600680,208827041_70541462157,571402946_85481266703_66292,step_2,2017-04-08 09:12:49
73482,4600680,208827041_70541462157,571402946_85481266703_66292,step_3,2017-04-08 09:16:37
73491,4600680,208827041_70541462157,571402946_85481266703_66292,step_2,2017-04-08 09:20:56
73492,4600680,208827041_70541462157,571402946_85481266703_66292,step_3,2017-04-08 09:21:04
73504,4600680,208827041_70541462157,571402946_85481266703_66292,confirm,2017-04-08 09:23:40
73525,4600680,208827041_70541462157,571402946_85481266703_66292,start,2017-04-08 09:28:21
73526,4600680,208827041_70541462157,571402946_85481266703_66292,step_1,2017-04-08 09:28:23


In [50]:
# Sanity check: list any test processes whose step_3→confirm duration is still negative.
proc_test_clean.loc[proc_test_clean['t_step3_conf'].lt(0)]

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
