In [1]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

# Load the YAML configuration that maps logical dataset names to CSV paths.
# `config` is reset first so a re-run never silently reuses a stale value
# left in the kernel by an earlier execution of this cell.
config = None
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only the "missing file" case gets this message; YAML syntax errors,
    # permission problems, etc. propagate with their real traceback instead
    # of being mislabelled as "not found".
    print("Yaml configuration file not found!")
    # Stop here: every following cell indexes into `config`, so continuing
    # with config=None would only fail later with an opaque TypeError.
    raise

# yaml.safe_load returns None for an empty document; fail with a clear message.
if config is None:
    raise ValueError("Yaml configuration file '../config.yaml' is empty.")


# Datasets configured under 'output_data' but left disabled in this notebook:
# df_demo = pd.read_csv(config['output_data']['file1'])
# df_demo_variation = pd.read_csv(config['output_data']['file2'])
# df_demo_test = pd.read_csv(config['output_data']['file3'])

# Datasets used below: the control-group frame and the web event log.
df_demo_control = pd.read_csv(filepath_or_buffer=config["output_data"]["file4"])
df_web_data = pd.read_csv(filepath_or_buffer=config["output_data"]["file5"])

In [2]:
# Parse the 'date_time' column into pandas datetimes so rows can be ordered chronologically.
df_web_data = df_web_data.assign(date_time=pd.to_datetime(df_web_data["date_time"]))

In [3]:
# Show the web log in chronological order (ascending is the default), then each column's dtype.
display(df_web_data.sort_values("date_time"))
print(df_web_data.dtypes)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
284865,9088444,242404224_96732670250,423038079_46067236368_400417,step_3,2017-03-15 00:03:03
35092,7179755,167765295_97487764427,264484508_5982901710_928530,start,2017-03-15 00:19:28
35091,7179755,167765295_97487764427,264484508_5982901710_928530,step_1,2017-03-15 00:20:50
35090,7179755,167765295_97487764427,264484508_5982901710_928530,step_2,2017-03-15 00:22:52
35089,7179755,167765295_97487764427,264484508_5982901710_928530,step_3,2017-03-15 00:23:47
...,...,...,...,...,...
286660,1104891,710002770_61911521918,597284396_96145314289_460044,step_1,2017-04-30 23:57:32
144767,5402972,481155336_6881867780,989795424_84700610095_958376,step_3,2017-04-30 23:57:56
79749,8007865,502318665_28702751895,104145193_36067138148_118971,start,2017-04-30 23:57:59
79748,8007865,502318665_28702751895,104145193_36067138148_118971,start,2017-04-30 23:58:00


client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
dtype: object


In [4]:
# Rich-render the frame loaded from config['output_data']['file4'], then print
# the dtype of each of its columns as plain text.
display(df_demo_control)
print(df_demo_control.dtypes)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,clnt_age_quantile,tenure_quantile
0,2304905,7.0,94.0,58.0,2.0,110860.30,6.0,9.0,Control,A3,T2
1,5126305,12.0,145.0,33.0,2.0,103671.75,0.0,3.0,Control,A2,T3
2,3727881,5.0,71.0,30.5,2.0,23915.60,0.0,3.0,Control,A1,T1
3,272934,5.0,66.0,58.5,2.0,27021.42,2.0,5.0,Control,A3,T1
4,285619,30.0,369.0,67.5,2.0,299388.72,3.0,6.0,Control,A4,T4
...,...,...,...,...,...,...,...,...,...,...,...
23521,6662941,7.0,84.0,36.0,2.0,83101.84,4.0,7.0,Control,A2,T2
23522,2787512,23.0,283.0,48.0,3.0,123618.60,1.0,4.0,Control,A3,T4
23523,9932367,23.0,280.0,57.0,2.0,94466.55,6.0,9.0,Control,A3,T4
23524,4079649,23.0,276.0,62.5,3.0,57575.28,4.0,7.0,Control,A4,T4


client_id              int64
clnt_tenure_yr       float64
clnt_tenure_mnth     float64
clnt_age             float64
num_accts            float64
bal                  float64
calls_6_mnth         float64
logons_6_mnth        float64
Variation             object
clnt_age_quantile     object
tenure_quantile       object
dtype: object


In [5]:
# All web events of client 4079649 in the full (unfiltered) log, oldest first.
df_web_data.loc[df_web_data["client_id"].eq(4079649)].sort_values("date_time")

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
233226,4079649,135423080_71753269572,155993046_1538887816_806756,start,2017-04-10 12:16:03
233225,4079649,135423080_71753269572,155993046_1538887816_806756,step_1,2017-04-10 12:16:45
233224,4079649,135423080_71753269572,155993046_1538887816_806756,step_2,2017-04-10 12:17:39
233223,4079649,135423080_71753269572,155993046_1538887816_806756,step_3,2017-04-10 12:18:03
233222,4079649,135423080_71753269572,155993046_1538887816_806756,confirm,2017-04-10 12:19:52


In [6]:
# Restrict the web log to control-group clients with an inner join on client_id.
# Only the de-duplicated key column is taken from the demographics frame, so the
# join neither adds columns nor multiplies web rows.
df_web_data_control = pd.merge(
    left=df_web_data,
    right=df_demo_control.loc[:, ["client_id"]].drop_duplicates(),
    how="inner",
    on="client_id",
)

# Row counts before and after the filter.
print(f"Original web rows: {df_web_data.shape[0]:,}")
print(f"Filtered web rows (control only): {df_web_data_control.shape[0]:,}")

Original web rows: 341,046
Filtered web rows (control only): 100,569


In [7]:
# Check 1: every client_id in the filtered web log exists in the control frame.
assert df_web_data_control["client_id"].isin(df_demo_control["client_id"]).all()

# Check 2: number of distinct control clients that appear in the web log.
print("Distinct control clients with activity:", df_web_data_control["client_id"].nunique())

# Check 3: control clients that have no web record at all.
control_without_web = set(df_demo_control["client_id"]).difference(df_web_data_control["client_id"])
print("Control clients without any web activity:", len(control_without_web))

Distinct control clients with activity: 18011
Control clients without any web activity: 5515


In [8]:
# Bare last expression: renders the web log restricted to control clients
# (the result of the inner merge on client_id) as the cell output.
df_web_data_control

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:03
1,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:01
2,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:28:52
3,4033851,762728880_76361333336,949661017_22392791362_127391,step_3,2017-04-05 12:26:08
4,4033851,762728880_76361333336,949661017_22392791362_127391,step_2,2017-04-05 12:24:43
...,...,...,...,...,...
100564,5305116,87196875_82592805389,69164930_20082199445_40666,confirm,2017-04-20 09:30:11
100565,5305116,87196875_82592805389,69164930_20082199445_40666,step_3,2017-04-20 09:27:44
100566,5305116,87196875_82592805389,69164930_20082199445_40666,step_2,2017-04-20 09:26:41
100567,5305116,87196875_82592805389,69164930_20082199445_40666,step_1,2017-04-20 09:25:58


In [9]:
# Example: a process completed successfully with no errors (client 4079649),
# shown in chronological order.
df_web_data_control.loc[df_web_data_control["client_id"].eq(4079649)].sort_values("date_time")

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
96366,4079649,135423080_71753269572,155993046_1538887816_806756,start,2017-04-10 12:16:03
96365,4079649,135423080_71753269572,155993046_1538887816_806756,step_1,2017-04-10 12:16:45
96364,4079649,135423080_71753269572,155993046_1538887816_806756,step_2,2017-04-10 12:17:39
96363,4079649,135423080_71753269572,155993046_1538887816_806756,step_3,2017-04-10 12:18:03
96362,4079649,135423080_71753269572,155993046_1538887816_806756,confirm,2017-04-10 12:19:52


In [10]:
# Example: a completed operation that includes step-backs (client 4033851);
# it still reaches process_step = 'confirm'. Shown in chronological order.
df_web_data_control.loc[df_web_data_control["client_id"].eq(4033851)].sort_values("date_time")

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
30,4033851,762728880_76361333336,949661017_22392791362_127391,start,2017-04-05 12:04:00
29,4033851,762728880_76361333336,949661017_22392791362_127391,step_1,2017-04-05 12:04:56
28,4033851,762728880_76361333336,949661017_22392791362_127391,step_2,2017-04-05 12:06:37
27,4033851,762728880_76361333336,949661017_22392791362_127391,step_3,2017-04-05 12:08:56
9,4033851,762728880_76361333336,949661017_22392791362_127391,step_2,2017-04-05 12:10:25
10,4033851,762728880_76361333336,949661017_22392791362_127391,step_3,2017-04-05 12:10:25
7,4033851,762728880_76361333336,949661017_22392791362_127391,step_2,2017-04-05 12:10:26
8,4033851,762728880_76361333336,949661017_22392791362_127391,step_3,2017-04-05 12:10:26
6,4033851,762728880_76361333336,949661017_22392791362_127391,start,2017-04-05 12:24:04
5,4033851,762728880_76361333336,949661017_22392791362_127391,step_1,2017-04-05 12:24:12


In [11]:
# Example: a client with multiple processes (client 5305116).
# --> The first process failed at process_step = 'start'.
# --> The second process completed successfully without step-backs.

df_web_data_control.loc[df_web_data_control["client_id"].eq(5305116)].sort_values("date_time")

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
100550,5305116,48895020_52187827152,454423043_73789478140_859004,start,2017-04-05 09:02:49
100549,5305116,48895020_52187827152,454423043_73789478140_859004,start,2017-04-05 09:03:22
100568,5305116,87196875_82592805389,69164930_20082199445_40666,start,2017-04-20 09:25:23
100567,5305116,87196875_82592805389,69164930_20082199445_40666,step_1,2017-04-20 09:25:58
100566,5305116,87196875_82592805389,69164930_20082199445_40666,step_2,2017-04-20 09:26:41
100565,5305116,87196875_82592805389,69164930_20082199445_40666,step_3,2017-04-20 09:27:44
100564,5305116,87196875_82592805389,69164930_20082199445_40666,confirm,2017-04-20 09:30:11


In [None]:
# Example: a process that failed at process_step = 'start' (client 1368).
df_web_data_control.loc[df_web_data_control["client_id"].eq(1368)].sort_values("date_time")

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
40912,1368,366307863_19014662045,784065271_45379483290_309335,start,2017-04-09 21:58:28
