In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
from statsmodels.stats.proportion import proportions_ztest

pd.options.mode.copy_on_write = True

# loading all datasets
url_client_profiles = 'data/df_final_demo.txt'
url_digital_footprints1 = 'data/df_final_web_data_pt_1.txt'
url_digital_footprints2 = 'data/df_final_web_data_pt_2.txt'
url_experiment_roster = 'data/df_final_experiment_clients.txt'

df1 = pd.read_csv(url_digital_footprints1)
df2 = pd.read_csv(url_digital_footprints2)

# imported dataframes to work with
df_client_profiles = pd.read_csv(url_client_profiles)
df_exp_roster = pd.read_csv(url_experiment_roster)
# merged footprint files
df_footprints = pd.concat([df1, df2])

In [2]:
# cleaning the datasets
df_client_profiles.rename(columns={'clnt_tenure_yr': 'client_tenure_years', 'clnt_tenure_mnth': 'client_tenure_months', 'clnt_age': 'client_age', 'gendr': 'gender', 'num_accts': 'num_accounts', 'bal': 'balance', 'calls_6_mnth': 'calls_6months', 'logons_6_mnth': 'logins_6months'}, inplace=True)
df_client_profiles_cleaned = df_client_profiles.dropna(subset=["client_tenure_years", "client_tenure_months", "client_age", "gender", "num_accounts", "balance", "calls_6months", "logins_6months"], how="all")
df_exp_roster.rename(columns={'Variation': 'variation'}, inplace=True)
df_exp_roster_cleaned = df_exp_roster.dropna(subset=["variation"], how="all")
df_footprints_cleaned = df_footprints.dropna(subset=["client_id", "visitor_id", "visit_id", "process_step", "date_time"], how="all")
df_footprints_cleaned = df_footprints_cleaned.drop_duplicates(subset=['client_id', 'visit_id', 'date_time'])
df_client_profiles_cleaned['gender'] = df_client_profiles_cleaned['gender'].apply(lambda x: 'U' if x == 'X' else x)
df_client_profiles_cleaned['gender'] = df_client_profiles_cleaned['gender'].fillna('U')

In [5]:
# joining footprints and experiment roster datasets
if not isinstance(locals().get('df_sorted'), pd.DataFrame):
    df_footprints_cleaned.set_index('client_id', inplace=True)
    df_exp_roster_cleaned.set_index('client_id', inplace=True)
    joined_df = df_footprints_cleaned.join(df_exp_roster_cleaned, how='inner')
    joined_df.reset_index(drop=False, inplace=True)
    joined_df['date_time'] = pd.to_datetime(joined_df['date_time'])
    
df_sorted = joined_df.sort_values(by=['variation', 'visit_id', 'date_time'])

In [7]:
# Ensure 'date_time' is in datetime format
df_sorted['date_time'] = pd.to_datetime(df_sorted['date_time'])

# Sort by visit_id and date_time to ensure correct time difference calculation
df_sorted = df_sorted.sort_values(by=['visit_id', 'date_time'])

# Shift the 'date_time' column to get the next timestamp within each visit_id group
df_sorted['next_date_time'] = df_sorted.groupby('visit_id')['date_time'].shift(-1)

# Calculate time spent on each step (difference between the next timestamp and the current one)
df_sorted['time_spent'] = (df_sorted['next_date_time'] - df_sorted['date_time']).dt.total_seconds()

# Drop the temporary 'next_date_time' column as it's no longer needed
df_sorted.drop(columns=['next_date_time'], inplace=True)

# set 0 for NaN values (last steps)
df_sorted['time_spent'] = df_sorted['time_spent'].fillna(0)

# Compute the average time spent per process_step
average_time_per_step = df_sorted.groupby('process_step')['time_spent'].mean()

In [9]:
##### Calculating Completion Rate per Group + Performing Hypothesis Testing #####

# Create seperate dfs for treatment and control group to pass/ calculate the completion rate kpi 
df_test = joined_df[joined_df["variation"] == "Test"]
df_control = joined_df[joined_df["variation"] == "Control"]

# Count total unique visits per group
n_control = df_control['visit_id'].nunique()
n_test = df_test['visit_id'].nunique()

# Count unique visits that reached the "Confirm" step per group
completed_control = df_control[df_control['process_step'] == 'confirm']['visit_id'].nunique()
completed_test = df_test[df_test['process_step'] == 'confirm']['visit_id'].nunique()

# Calculate completion rates
comp_rate_control = completed_control / n_control
comp_rate_test = completed_test / n_test

# Print completion rates
print(f"The completion rate for the control group (old version) is: {comp_rate_control:.4f}")
print(f"The completion rate for the test group (new version) is: {comp_rate_test:.4f}")

# Perform Z-test for proportions
count = np.array([completed_test, completed_control ])  # Successes in each group
nobs = np.array([n_test, n_control])  # Total observations in each group

z_stat, p_value = proportions_ztest(count, nobs, alternative='larger')

# Print results
print(f"Z-statistic: {z_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret the result
alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject the null hypothesis: The new version has a significantly higher completion rate.")
else:
    print("Fail to reject the null hypothesis: No significant difference in completion rates.")

# Claculating: Completion Rate with a Cost-Effectiveness Threshold
threshold = 0.05  # 5% increase required
observed_increase = comp_rate_test - comp_rate_control

# Check if the observed increase meets the threshold
if observed_increase >= threshold:
    print(f"The observed increase in completion rate is {observed_increase:.4f}, which meets or exceeds the required 5% threshold.")
    if p_value < 0.05:
        print("Additionally, the increase is statistically significant. The new design is both effective and justifiable from a cost perspective.")
    else:
        print("However, the increase is not statistically significant. Further analysis may be needed before making a final decision.")
else:
    print(f"The observed increase in completion rate is {observed_increase:.4f}, which is below the required 5% threshold.")
    print("The new design does not meet the cost-effectiveness criterion and may not justify the associated costs.")

The completion rate for the control group (old version) is: 0.4956
The completion rate for the test group (new version) is: 0.5847
Z-statistic: 23.4761
P-value: 0.0000
Reject the null hypothesis: The new version has a significantly higher completion rate.
The observed increase in completion rate is 0.0891, which meets or exceeds the required 5% threshold.
Additionally, the increase is statistically significant. The new design is both effective and justifiable from a cost perspective.


In [11]:
possible_previous_steps = {
    'start': None,
    'step_1': 'start',
    'step_2': 'step_1',
    'step_3': 'step_2',
    'confirm': 'step_3'
}

def mark_errors(df, possible_previous_steps):
    df = df.copy()  # Avoid modifying original DataFrame
    
    # Sort data to ensure correct order
    df = df.sort_values(by=['visitor_id', 'visit_id', 'date_time'])
    
    # Shift previous step within each visit_id
    df['previous_step'] = df.groupby(['visit_id'])['process_step'].shift(1)

    # Define error condition
    df['error'] = (df['previous_step'] != df['process_step'].map(possible_previous_steps)) & (df['previous_step'].notna() & (df['previous_step'] != df['process_step']))

    return df.drop(columns=['previous_step'])

df_with_errors = mark_errors(df_sorted, possible_previous_steps)

# count errors and visits per variation
error_counts = df_with_errors.groupby('variation')['error'].sum()
visit_counts = df_with_errors.groupby('variation')['visit_id'].nunique()

# extract values safely
error_control, error_test = error_counts.get('Control', 0), error_counts.get('Test', 0)
visit_control, visit_test = visit_counts.get('Control', 1), visit_counts.get('Test', 1)

# compute error rates
error_rate_control = error_control / visit_control
error_rate_test = error_test / visit_test
df_with_errors.head(50)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,time_spent,error
98750,6265388,100019786_4272121951,703380671_89376183829_330646,start,2017-04-15 15:13:33,Test,13.0,False
98749,6265388,100019786_4272121951,703380671_89376183829_330646,step_1,2017-04-15 15:13:46,Test,35.0,False
98748,6265388,100019786_4272121951,703380671_89376183829_330646,step_2,2017-04-15 15:14:21,Test,89.0,False
98747,6265388,100019786_4272121951,703380671_89376183829_330646,step_3,2017-04-15 15:15:50,Test,28.0,False
98746,6265388,100019786_4272121951,703380671_89376183829_330646,confirm,2017-04-15 15:16:18,Test,29.0,False
98745,6265388,100019786_4272121951,703380671_89376183829_330646,confirm,2017-04-15 15:16:47,Test,0.0,False
295779,2340487,100026388_52569174348,560716384_55516231568_734638,start,2017-06-09 18:37:47,Test,12.0,False
295778,2340487,100026388_52569174348,560716384_55516231568_734638,start,2017-06-09 18:37:59,Test,7.0,False
295777,2340487,100026388_52569174348,560716384_55516231568_734638,step_1,2017-06-09 18:38:06,Test,35.0,False
295776,2340487,100026388_52569174348,560716384_55516231568_734638,start,2017-06-09 18:38:41,Test,64.0,True


In [12]:
# Define counts of errors and total visits for both groups
errors = [error_control, error_test]
visits = [visit_control, visit_test]

# Hypothesis Testing
# H0: The error rates for test group and control group are equal
# H1: The error rate for test group is different (or higher) than the control group

alpha = 0.05
# Perform Z-test for proportions
z_stat, p_value = proportions_ztest(errors, visits)

print(f'Z_Stat: {z_stat}')
print(f'P_value: {p_value}')

Z_Stat: -40.79291868646565
P_value: 0.0


The p-value is 0.0, which is far below the common significance threshold of 0.05. This means we  reject the null hypothesis that the error rates for the test group and control group are equal.

Since the test group's error rate is higher than the control group's, this suggests that the variation introduced in the test group significantly increased errors.

Final Conclusion:
The test group has a statistically significant higher error rate compared to the control group. This means the changes applied to the test group negatively impacted user experience by leading to more errors.

In [16]:
# error rates calculation
# define dictionary with possible previous steps
possible_previous_steps = {
    'start' : None,
    'step_1' : 'start',
    'step_2' : 'step_1',
    'step_3' : 'step_2',
    'confirm' : 'step_3'
}

def check_error(df, possible_previous_steps):
    valid_counts = 0
    error_counts = {'Control': 0, 'Test': 0}
    visit_counts = {'Control': 0, 'Test': 0}
    
    for (visit_id, variation), group in df.groupby(['visit_id', 'variation']):
        previous_step = None
        visit_counts[variation] += 1
        # iterate over the rows in the group
        for _, row in group.iterrows():
            step = row['process_step']
                
            if previous_step is not None:
                # valid steps sequence (same step)
                if previous_step == step:
                    valid_counts += 1
                else:
                    expected_previous = possible_previous_steps.get(step)
                    # invalid step sequence (some step before current)
                    if previous_step != expected_previous:
                        error_counts[variation] += 1
                    else:
                        # valid step sequence (next step)
                        valid_counts += 1
            # setting the current step as previous step for the next iteration
            previous_step = step
    return error_counts, visit_counts

error_results = check_error(df_sorted, possible_previous_steps)

In [17]:
errors = error_results[0]
visits = error_results[1]

control_error_proportion = errors['Control']/visits['Control']
test_error_proportion = errors['Test']/visits['Test']

print(f'Control group error proportion: {control_error_proportion}')
print(f'Test group error proportion: {test_error_proportion}')

Control group error proportion: 0.3060051570412253
Test group error proportion: 0.45820766910814303


In [18]:
df_client_profiles_cleaned.groupby(['gender'])['client_age'].agg(['mean', 'median', 'min', 'max', 'count'])
df_client_profiles_cleaned.groupby(['gender'])['client_tenure_years'].agg(['mean', 'median', 'min', 'max', 'count'])
df_client_profiles_cleaned.groupby(['gender'])['balance'].agg(['mean', 'median', 'min', 'max', 'count'])

# activity score calculation
df_client_profiles_cleaned['activity_score'] = df_client_profiles_cleaned.iloc[:,8] + (df_client_profiles_cleaned.iloc[:,7]*0.5) + (df_client_profiles_cleaned.iloc[:,6]*0.00001)
df_client_profiles_cleaned.sort_values('activity_score', ascending=False)

#login and call scores
high_login_threshold = df_client_profiles_cleaned['logins_6months'].quantile(0.8)
low_login_threshold = df_client_profiles_cleaned['logins_6months'].quantile(0.2)
high_call_threshold = df_client_profiles_cleaned['calls_6months'].quantile(0.8)
low_call_threshold = df_client_profiles_cleaned['calls_6months'].quantile(0.2)

def segment_customer_logins(logins):
    if logins >= high_login_threshold:
        return "Highly Active"
    elif logins <= low_login_threshold:
        return "Inactive"
    else:
        return "Moderate"

def segment_customer_calls(calls):
    if calls >= high_call_threshold:
        return "Highly Active"
    elif calls <= low_call_threshold:
        return "Inactive"
    else:
        return "Moderate"

df_client_profiles_cleaned["login_activity"] = df_client_profiles_cleaned["logins_6months"].apply(segment_customer_logins)
df_client_profiles_cleaned["calls_activity"] = df_client_profiles_cleaned["calls_6months"].apply(segment_customer_calls)

In [19]:
login_activity_counts = df_client_profiles_cleaned["login_activity"].value_counts()
print(login_activity_counts)
df_client_profiles_cleaned.groupby(["login_activity", "gender"])[["client_age", "balance", "client_tenure_years"]].mean()

login_activity
Moderate         37245
Highly Active    17160
Inactive         16190
Name: count, dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,client_age,balance,client_tenure_years
login_activity,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Highly Active,F,51.671024,201911.402432,15.067443
Highly Active,M,50.301099,298856.219527,15.395201
Highly Active,U,41.852197,127230.842408,6.73058
Inactive,F,46.838916,87994.064935,14.378892
Inactive,M,45.8202,118772.402819,14.045059
Inactive,U,40.142882,64183.222127,6.668435
Moderate,F,50.022649,133975.731667,14.890884
Moderate,M,48.647805,198450.2395,14.907304
Moderate,U,41.821166,94046.713213,6.676526


In [20]:
calls_activity_counts = df_client_profiles_cleaned["calls_activity"].value_counts()
print(calls_activity_counts)
df_client_profiles_cleaned.groupby(["calls_activity", "gender"])[["client_age", "balance", "client_tenure_years"]].mean()

calls_activity
Moderate         32806
Highly Active    19041
Inactive         18748
Name: count, dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,client_age,balance,client_tenure_years
calls_activity,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Highly Active,F,50.253416,195625.676549,15.107315
Highly Active,M,49.255712,304081.806682,15.458676
Highly Active,U,40.269435,126506.58705,6.650484
Inactive,F,47.980739,93581.93503,14.311224
Inactive,M,47.377691,117070.1208,14.174319
Inactive,U,41.06976,67703.738549,6.655899
Moderate,F,50.164396,134731.841314,14.954165
Moderate,M,48.690181,193522.415732,14.818417
Moderate,U,42.322831,92761.973992,6.727281


In [72]:
#df_client_profiles_cleaned['client_age'] = df_client_profiles_cleaned['logins_6months'].astype(int)
#df_client_profiles_cleaned['client_age'].isna().count()
#df_client_profiles_cleaned['client_age'] = df_client_profiles_cleaned['client_age'].fillna(method='ffill')
df_client_profiles_cleaned['client_age'] = df_client_profiles_cleaned['client_age'].astype(int)
df_client_profiles_cleaned.dtypes



client_id                 int64
client_tenure_years       int64
client_tenure_months      int64
client_age                int64
gender                   object
num_accounts              int64
balance                 float64
calls_6months             int64
logins_6months            int64
activity_score          float64
login_activity           object
calls_activity           object
dtype: object

In [68]:
df_client_profiles_cleaned.to_csv('data/client_profiles.csv', index=False)
df_with_errors.to_csv('data/abtest.csv', index=False)