In [1]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")


# df_demo = pd.read_csv(config['output_data']['file1'])
# df_demo_variation = pd.read_csv(config['output_data']['file2'])
df_demo_test = pd.read_csv(config['output_data']['file3'])
# df_demo_control = pd.read_csv(config['output_data']['file4'])
df_web_data = pd.read_csv(config['output_data']['file5'])


In [2]:
# ========= Config =========
STEP_ORDER = ['start','step_1','step_2','step_3','confirm']
STEP_MAP   = {s:i for i,s in enumerate(STEP_ORDER)}
KEY = ['client_id','visitor_id','visit_id']   # clave del proceso

def _to_utc(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors='coerce')
    if getattr(s.dt, "tz", None) is None:
        return s.dt.tz_localize("UTC")
    return s.dt.tz_convert("UTC")

def _standardize_steps(df):
    w = df.copy()
    w['process_step'] = (w['process_step'].str.lower().str.strip()
                         .str.replace(' ', '_')
                         .str.replace('step1','step_1')
                         .str.replace('step2','step_2')
                         .str.replace('step3','step_3'))
    return w[w['process_step'].isin(STEP_ORDER)]


In [3]:
# ========= 1) Filtrado al grupo control y normalización =========
def prepare_web_for_group(df_web_data, df_demo_group):
    ids = set(df_demo_group['client_id'].unique())
    w = df_web_data[df_web_data['client_id'].isin(ids)].copy()
    w = _standardize_steps(w)
    w['date_time'] = _to_utc(w['date_time'])
    # Orden **por proceso**
    w = w.sort_values(KEY + ['date_time']).reset_index(drop=True)
    return w

In [4]:
# ========= 2) Secuencia completa → step-backs =========
def compute_back_jumps(w_full: pd.DataFrame) -> pd.DataFrame:
    wf = w_full.copy()
    wf['step_idx'] = wf['process_step'].map(STEP_MAP).astype('Int64')
    wf['prev_step_idx'] = wf.groupby(KEY)['step_idx'].shift(1)
    wf['delta'] = wf['step_idx'] - wf['prev_step_idx']
    wf['is_back_jump'] = wf['delta'].lt(0)
    back = (wf.groupby(KEY)
              .agg(n_back_jumps=('is_back_jump','sum'))
              .reset_index())
    return wf, back

In [5]:
# ========= 3) Últimas ocurrencias por paso y último confirm =========
def collapse_last_per_step_and_last_confirm(wf: pd.DataFrame) -> pd.DataFrame:
    # eliminamos confirms intermedios por proceso (deja solo el último)
    is_last_confirm = (wf['process_step'].eq('confirm') &
                       ~wf.duplicated(subset=KEY+['process_step'], keep='last'))
    drop_mask = wf['process_step'].eq('confirm') & (~is_last_confirm)
    wf2 = wf.loc[~drop_mask].copy()

    # para tiempos/completion tomamos **última** ocurrencia de cada paso por proceso
    wf2 = (wf2.sort_values(KEY + ['process_step','date_time'])
              .drop_duplicates(subset=KEY+['process_step'], keep='last'))
    return wf2

In [6]:
# ========= 4) Tabla por proceso (1 fila = 1 proceso) =========
def summarize_processes(wf2: pd.DataFrame, back: pd.DataFrame) -> pd.DataFrame:
    # Pivot de timestamps
    pv = wf2.pivot_table(index=KEY, columns='process_step', values='date_time', aggfunc='last') \
            .reindex(columns=STEP_ORDER)
    for c in STEP_ORDER:
        if c not in pv.columns:
            pv[c] = pd.NaT
    # todos como naive UTC para restas
    for c in pv.columns:
        pv[c] = _to_utc(pv[c]).dt.tz_convert('UTC').dt.tz_localize(None)

    # flags reached y completed
    flags = pv.notna().astype(int).add_prefix('reached_')
    flags['completed'] = flags['reached_confirm']

    # tiempos (min)
    def mins(b,a): return (pv[b] - pv[a]) / pd.Timedelta(minutes=1)
    times = pd.DataFrame(index=pv.index)
    times['t_start_step1'] = mins('step_1','start')
    times['t_step1_step2'] = mins('step_2','step_1')
    times['t_step2_step3'] = mins('step_3','step_2')
    times['t_step3_conf']  = mins('confirm','step_3')
    times['t_total']       = mins('confirm','start')

    proc = (flags.join(times)).reset_index()
    proc = proc.merge(back, on=KEY, how='left').fillna({'n_back_jumps':0})

    # Outcome:
    # - successful: completed y sin step-backs
    # - completed_with_errors: completed con step-backs
    # - unsuccessful: no completed
    proc['outcome'] = np.where(
        (proc['completed']==1) & (proc['n_back_jumps']==0), 'successful',
        np.where((proc['completed']==1) & (proc['n_back_jumps']>0), 'completed_with_errors', 'unsuccessful')
    )
    return proc

In [7]:
# ========= 5) KPIs & Drop-off =========
def kpis_from_processes(proc: pd.DataFrame) -> pd.DataFrame:
    d = proc.copy()
    denom = max(1, int(d['reached_start'].sum()))   # quienes empezaron

    kpis = {
        'n_processes'          : int(len(d)),
        'started'              : int(d['reached_start'].sum()),
        'step1_rate_%'         : 100*d['reached_step_1'].sum()/denom,
        'step2_rate_%'         : 100*d['reached_step_2'].sum()/denom,
        'step3_rate_%'         : 100*d['reached_step_3'].sum()/denom,
        'completion_rate_%'    : 100*d['completed'].sum()/denom,
        'successful_%'         : 100*(d['outcome'].eq('successful').sum())/denom,
        'completed_with_errors_%': 100*(d['outcome'].eq('completed_with_errors').sum())/denom,
        'unsuccessful_%'       : 100*(d['outcome'].eq('unsuccessful').sum())/denom,
        't_total_median_min'   : float(np.nanmedian(d['t_total'])),
        't_step1_median_min'   : float(np.nanmedian(d['t_start_step1'])),
        't_step2_median_min'   : float(np.nanmedian(d['t_step1_step2'])),
        't_step3_median_min'   : float(np.nanmedian(d['t_step2_step3'])),
        't_conf_median_min'    : float(np.nanmedian(d['t_step3_conf'])),
        'avg_back_jumps'       : float(d['n_back_jumps'].mean())
    }
    return pd.DataFrame([kpis])

In [8]:
def step_dropoff_table(proc: pd.DataFrame) -> pd.DataFrame:
    d = proc.copy()
    rows = [
        ("start→step_1", int(d['reached_start'].sum()),   int(d['reached_step_1'].sum())),
        ("step_1→step_2", int(d['reached_step_1'].sum()), int(d['reached_step_2'].sum())),
        ("step_2→step_3", int(d['reached_step_2'].sum()), int(d['reached_step_3'].sum())),
        ("step_3→confirm", int(d['reached_step_3'].sum()), int(d['completed'].sum())),
    ]
    out = pd.DataFrame(rows, columns=['from_to','n_from','n_to'])
    out['conv_rate_%'] = 100 * out['n_to'] / out['n_from'].replace({0:np.nan})
    out['conv_rate_%'] = out['conv_rate_%'].fillna(0.0)
    out['dropoff_%'] = 100 - out['conv_rate_%']
    return out

In [9]:
# ========= 6) RUN (CONTROL) =========
w_control = prepare_web_for_group(df_web_data, df_demo_test)              # filtra y normaliza
wf, back = compute_back_jumps(w_control)                                  # secuencia completa → step-backs
wf2 = collapse_last_per_step_and_last_confirm(wf)                         # colapsa a última ocurrencia por paso
proc_control = summarize_processes(wf2, back)                             # 1 fila por proceso + outcome
kpis_control = kpis_from_processes(proc_control)                          # KPIs
dropoff_control = step_dropoff_table(proc_control)

display(proc_control.head(15))
display(kpis_control)
display(dropoff_control)

# Sanity checks con los ejemplos que pasaste:
display("Client 4079649 processes:", 
      proc_control[proc_control['client_id']==4079649][KEY+['completed','n_back_jumps','outcome','t_total']])

display("Client 4033851 processes:", 
      proc_control[proc_control['client_id']==4033851][KEY+['completed','n_back_jumps','outcome','t_total']])

display("Client 5305116 processes:", 
      proc_control[proc_control['client_id']==5305116][KEY+['completed','n_back_jumps','outcome','t_total']])

Unnamed: 0,client_id,visitor_id,visit_id,reached_start,reached_step_1,reached_step_2,reached_step_3,reached_confirm,completed,t_start_step1,t_step1_step2,t_step2_step3,t_step3_conf,t_total,n_back_jumps,outcome
0,555,402506806_56087378777,637149525_38041617439_716659,1,1,1,1,1,1,0.116667,0.533333,1.65,0.333333,2.633333,0,successful
1,647,66758770_53988066587,40369564_40101682850_311847,1,1,1,1,1,1,0.116667,0.3,3.15,2.716667,6.283333,0,successful
2,934,810392784_45004760546,7076463_57954418406_971348,1,0,0,0,0,0,,,,,,0,unsuccessful
3,1516,182314299_63168583136,255400977_38039535960_779641,1,1,1,1,1,1,0.933333,0.583333,10.116667,4.266667,15.9,0,successful
4,1516,182314299_63168583136,481123290_95510977345_707323,1,1,1,0,0,0,0.233333,-0.133333,,,,3,unsuccessful
5,1643,910842861_51062379179,379946188_1773022140_107963,1,0,0,0,0,0,,,,,,0,unsuccessful
6,1643,910842861_51062379179,633860590_96880450633_976109,1,1,0,0,0,0,-1.8,,,,,1,unsuccessful
7,1702,470573753_93844895765,243444359_78696078676_118990,0,0,0,0,1,1,,,,,,0,successful
8,1755,230596682_29327090182,420603142_53705621002_674558,1,1,1,1,1,1,2.016667,0.316667,0.45,0.933333,3.716667,0,successful
9,1755,230596682_29327090182,690855460_62242406510_53318,1,1,1,1,0,0,0.1,0.85,0.65,,,0,unsuccessful


Unnamed: 0,n_processes,started,step1_rate_%,step2_rate_%,step3_rate_%,completion_rate_%,successful_%,completed_with_errors_%,unsuccessful_%,t_total_median_min,t_step1_median_min,t_step2_median_min,t_step3_median_min,t_conf_median_min,avg_back_jumps
0,28816,26015,86.61157,75.441092,68.6527,65.642898,52.454353,13.188545,45.123967,3.55,0.15,0.4,1.083333,1.033333,0.435938


Unnamed: 0,from_to,n_from,n_to,conv_rate_%,dropoff_%
0,start→step_1,26015,22532,86.61157,13.38843
1,step_1→step_2,22532,19626,87.102787,12.897213
2,step_2→step_3,19626,17860,91.001732,8.998268
3,step_3→confirm,17860,17077,95.615901,4.384099


'Client 4079649 processes:'

Unnamed: 0,client_id,visitor_id,visit_id,completed,n_back_jumps,outcome,t_total


'Client 4033851 processes:'

Unnamed: 0,client_id,visitor_id,visit_id,completed,n_back_jumps,outcome,t_total


'Client 5305116 processes:'

Unnamed: 0,client_id,visitor_id,visit_id,completed,n_back_jumps,outcome,t_total
