In [None]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency, shapiro, probplot
from scipy.stats.contingency import association

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set(style="whitegrid")

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

CONFIG_PATH = "../config.yaml"

# Fail fast when the configuration is missing. Previously a bare `except:`
# printed a message for *any* error (including YAML syntax errors) and let the
# notebook continue with `config = None`, which then crashed on the first
# `config[...]` lookup with an unrelated "NoneType is not subscriptable" error.
try:
    with open(CONFIG_PATH, "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Yaml configuration file not found: {CONFIG_PATH}"
    ) from exc

# The CSV locations are read from the 'output_data' section of the config.
df_demo_test = pd.read_csv(config['output_data']['file3'])
df_web_data = pd.read_csv(config['output_data']['file5'])

In [2]:
# Convert the 'date_time' column to pandas datetimes so later cells can sort
# and compare events chronologically.
parsed_event_times = pd.to_datetime(df_web_data['date_time'])
df_web_data['date_time'] = parsed_event_times

In [3]:
# Show the web events ordered from earliest to latest, then the column dtypes
# to confirm that 'date_time' is now a datetime column.
web_events_chronological = df_web_data.sort_values(by='date_time', ascending=True)
display(web_events_chronological)
print(df_web_data.dtypes)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
284865,9088444,242404224_96732670250,423038079_46067236368_400417,step_3,2017-03-15 00:03:03
35092,7179755,167765295_97487764427,264484508_5982901710_928530,start,2017-03-15 00:19:28
35091,7179755,167765295_97487764427,264484508_5982901710_928530,step_1,2017-03-15 00:20:50
35090,7179755,167765295_97487764427,264484508_5982901710_928530,step_2,2017-03-15 00:22:52
35089,7179755,167765295_97487764427,264484508_5982901710_928530,step_3,2017-03-15 00:23:47
...,...,...,...,...,...
286660,1104891,710002770_61911521918,597284396_96145314289_460044,step_1,2017-04-30 23:57:32
144767,5402972,481155336_6881867780,989795424_84700610095_958376,step_3,2017-04-30 23:57:56
79749,8007865,502318665_28702751895,104145193_36067138148_118971,start,2017-04-30 23:57:59
79748,8007865,502318665_28702751895,104145193_36067138148_118971,start,2017-04-30 23:58:00


client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
dtype: object


In [4]:
# Preview the client demographics table and list its column dtypes.
demo_column_types = df_demo_test.dtypes
display(df_demo_test)
print(demo_column_types)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,clnt_age_quantile,tenure_quantile
0,836976,6.0,73.0,60.5,2.0,45105.30,6.0,9.0,Test,A4,T1
1,1439522,5.0,64.0,32.0,2.0,52467.79,6.0,9.0,Test,A1,T1
2,1562045,16.0,198.0,49.0,2.0,67454.65,3.0,6.0,Test,A3,T3
3,388801,30.0,361.0,57.5,5.0,522498.72,1.0,4.0,Test,A3,T4
4,8198645,15.0,189.0,54.5,2.0,382303.83,6.0,9.0,Test,A3,T3
...,...,...,...,...,...,...,...,...,...,...,...
26956,501496,23.0,281.0,35.0,2.0,32529.34,0.0,3.0,Test,A2,T4
26957,1780858,21.0,262.0,68.5,3.0,372100.59,6.0,9.0,Test,A4,T4
26958,5826160,20.0,249.0,56.5,2.0,44837.16,2.0,5.0,Test,A3,T4
26959,8739285,19.0,229.0,69.5,2.0,44994.24,1.0,4.0,Test,A4,T4


client_id              int64
clnt_tenure_yr       float64
clnt_tenure_mnth     float64
clnt_age             float64
num_accts            float64
bal                  float64
calls_6_mnth         float64
logons_6_mnth        float64
Variation             object
clnt_age_quantile     object
tenure_quantile       object
dtype: object


In [5]:
# Chronological event trail for a single client (client_id 4079649).
is_client_4079649 = df_web_data['client_id'] == 4079649
df_web_data.loc[is_client_4079649].sort_values(by='date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
233226,4079649,135423080_71753269572,155993046_1538887816_806756,start,2017-04-10 12:16:03
233225,4079649,135423080_71753269572,155993046_1538887816_806756,step_1,2017-04-10 12:16:45
233224,4079649,135423080_71753269572,155993046_1538887816_806756,step_2,2017-04-10 12:17:39
233223,4079649,135423080_71753269572,155993046_1538887816_806756,step_3,2017-04-10 12:18:03
233222,4079649,135423080_71753269572,155993046_1538887816_806756,confirm,2017-04-10 12:19:52


In [6]:
# Restrict the web log to clients that appear in df_demo_test.
# Only the de-duplicated client_id column is joined, so the merge brings in no
# demographic columns and each web row can match at most one key.
test_client_keys = df_demo_test[['client_id']].drop_duplicates()
df_web_data_test = pd.merge(df_web_data, test_client_keys, on='client_id', how='inner')

# Report how many web rows survived the filter.
print(f"Original web rows: {len(df_web_data):,}")
print(f"Filtered web rows (test only): {len(df_web_data_test):,}")

Original web rows: 341,046
Filtered web rows (test only): 139,342


- df_web_data_test['client_id']: Extracts the client_id column (a pandas Series) from the dataframe df_web_data_test.
- set(df_web_data_test['client_id']): Converts that Series to a Python set, removing duplicates. So you now have the unique client IDs that appear in the “web data” test dataframe.
- Same for set(df_demo_test['client_id']): the unique client IDs in the demo_test dataframe.

**.issubset(...)**
- Returns `True` if every element in the left-hand set (web data clients) also appears in the right-hand set (demo clients).
- In other words: every `client_id` present in `df_web_data_test` must also exist in `df_demo_test`.

**`assert <condition>`**
- If the condition is True, nothing happens (code continues).
- If it is False, Python raises an AssertionError, stopping execution (unless caught).

In [None]:
# Build each set of unique client ids once and reuse it for the checks below.
web_client_ids = set(df_web_data_test['client_id'])
demo_client_ids = set(df_demo_test['client_id'])

# 1. Sanity check: every client in the filtered web data is a known demo client.
assert web_client_ids <= demo_client_ids

# 2. Number of distinct clients that have at least one web record.
print("Distinct test clients with activity:", df_web_data_test['client_id'].nunique())

# 3. Clients listed in df_demo_test that never appear in the web data.
test_without_web = demo_client_ids - web_client_ids
print("Test clients without any web activity:", len(test_without_web))

Distinct test clients with activity: 22007
Test clients without any web activity: 4954


In [8]:
# List the distinct client ids present in the filtered web data
# (used to pick example clients in the cells that follow).
web_test_client_column = df_web_data_test['client_id']
web_test_client_column.unique()

array([9988021, 8320017, 1982004, ..., 7230446, 5230357, 6334360],
      shape=(22007,))

In [9]:
# Example of a successful process: the client moves forward through the steps
# without ever stepping back.
is_client_8320017 = df_web_data_test['client_id'] == 8320017
df_web_data_test[is_client_8320017].sort_values('date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
12,8320017,39393514_33118319366,960651974_70596002104_312201,start,2017-04-05 13:08:06
11,8320017,39393514_33118319366,960651974_70596002104_312201,step_1,2017-04-05 13:08:24
10,8320017,39393514_33118319366,960651974_70596002104_312201,step_2,2017-04-05 13:08:40
9,8320017,39393514_33118319366,960651974_70596002104_312201,step_3,2017-04-05 13:09:43
8,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:05


In [10]:
# Example of a completed operation that includes step-backs
# (it still reaches process_step = 'confirm').
is_client_6334360 = df_web_data_test['client_id'] == 6334360
df_web_data_test.loc[is_client_6334360].sort_values(by='date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
139332,6334360,629124187_65258232847,586047816_14599436811_389185,start,2017-04-11 08:42:20
139331,6334360,629124187_65258232847,586047816_14599436811_389185,step_1,2017-04-11 08:42:36
139330,6334360,629124187_65258232847,586047816_14599436811_389185,step_2,2017-04-11 08:44:30
139329,6334360,629124187_65258232847,586047816_14599436811_389185,step_1,2017-04-11 08:44:43
139328,6334360,629124187_65258232847,586047816_14599436811_389185,step_2,2017-04-11 08:45:38
139327,6334360,629124187_65258232847,586047816_14599436811_389185,step_1,2017-04-11 08:45:40
139326,6334360,629124187_65258232847,586047816_14599436811_389185,step_2,2017-04-11 08:46:19
139325,6334360,629124187_65258232847,586047816_14599436811_389185,step_3,2017-04-11 08:46:49
139324,6334360,629124187_65258232847,586047816_14599436811_389185,confirm,2017-04-11 08:47:23


In [11]:
# Example of a failed process with multiple step-backs.
events_client_9988021 = df_web_data_test[df_web_data_test['client_id'] == 9988021]
events_client_9988021.sort_values(by='date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
97,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 16:57:27
96,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 17:34:37
78,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 17:50:38
77,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 18:06:39
76,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 18:22:40
75,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 18:38:42
74,9988021,580560515_7732621733,219729223_42518459208_211914,start,2017-04-12 18:54:43
7,9988021,580560515_7732621733,781255054_21935453173_531117,start,2017-04-17 15:16:22
6,9988021,580560515_7732621733,781255054_21935453173_531117,step_1,2017-04-17 15:17:01
5,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:17:15


In [None]:
# Another example of a failed process with multiple step-backs.
is_client_7230446 = df_web_data_test['client_id'] == 7230446
df_web_data_test.loc[is_client_7230446].sort_values('date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
139318,7230446,301456995_82370441715,869576778_94474334805_445679,start,2017-03-15 20:17:36
139317,7230446,301456995_82370441715,869576778_94474334805_445679,step_1,2017-03-15 20:17:48
139316,7230446,301456995_82370441715,869576778_94474334805_445679,step_1,2017-03-15 20:18:57
139315,7230446,301456995_82370441715,869576778_94474334805_445679,step_2,2017-03-15 20:20:00
139314,7230446,301456995_82370441715,869576778_94474334805_445679,step_3,2017-03-15 20:20:44
139313,7230446,301456995_82370441715,869576778_94474334805_445679,step_2,2017-03-15 20:22:38
139312,7230446,301456995_82370441715,869576778_94474334805_445679,step_1,2017-03-15 20:23:16
139311,7230446,301456995_82370441715,869576778_94474334805_445679,step_2,2017-03-15 20:23:52
139310,7230446,301456995_82370441715,869576778_94474334805_445679,step_3,2017-03-15 20:23:53
139309,7230446,301456995_82370441715,869576778_94474334805_445679,step_2,2017-03-15 20:23:54


In [15]:
# Chronological event trail for client_id 1702.
events_client_1702 = df_web_data_test[df_web_data_test['client_id'] == 1702]
events_client_1702.sort_values(by='date_time')

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
106998,1702,470573753_93844895765,243444359_78696078676_118990,confirm,2017-04-01 11:44:04
