In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import binomtest
from scipy.stats import ttest_ind
import scipy.stats as stats
%matplotlib inline

In [3]:
df_final = pd.read_csv("../data/clean/df_combined.csv")
df_exp = pd.read_csv("../data/clean/experiments.csv")

In [4]:
# Merge the two tables based on 'client_id'
df_merged = df_final.merge(df_exp[['client_id', 'variation']], on='client_id', how='left')
df_merged['variation'] = df_merged['variation'].fillna('Unknown')
# Drop null rows
df_merged = df_merged[df_merged['variation'] != 'Unknown']
df_merged.reset_index(drop=True, inplace=True)

In [5]:
df_merged = df_merged.drop(columns=['time_diff', 'error', 'step_numeric'])

In [6]:
df_merged.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
0,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,Control
1,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,Control
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,Control
3,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,Control
4,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,Control
5,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test
6,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test
7,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test
8,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test
9,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test


In [12]:
# Filter data for control and test groups
control_group = df_merged[df_merged['variation'] == 'Control']
test_group = df_merged[df_merged['variation'] == 'Test']

In [14]:
# Calculate completion rate for control group
control_completed = control_group[control_group['process_step'] == 'confirm']['client_id'].nunique()
control_started = control_group[control_group['process_step'] == 'start']['client_id'].nunique()
control_completion_rate = control_completed / control_started
print(control_completion_rate)

0.6718126225914388


In [16]:
# Calculate completion rate for test group
test_completed = test_group[test_group['process_step'] == 'confirm']['client_id'].nunique()
test_started = test_group[test_group['process_step'] == 'start']['client_id'].nunique()
test_completion_rate = test_completed / test_started
print(test_completion_rate)

0.700438547171933


In [18]:
observed_increase = test_completion_rate - control_completion_rate
print(observed_increase)

0.028625924580494178


In [20]:
threshold = 0.05

if observed_increase >= threshold:
    print(f"The observed increase is >= than the 5% threshold.")
else:
    print(f"The observed increase is <= than the 5% threshold.")


The observed increase is <= than the 5% threshold.


In [22]:
# Binomial test
successes = test_completed
total = test_started
p_null = control_completion_rate

binom_result = binomtest(successes, total, p=p_null, alternative='greater')
print(f"Binomial Test P-value : {binom_result.pvalue:.10e}")


Binomial Test P-value : 5.6838514065e-24


In [24]:
#Calculate completion rates (number of clients who completed each step)
clients_started = df_merged[df_merged['process_step'] == 'start']['client_id'].nunique()
completion_rates = (
    df_merged.groupby('process_step')['client_id'].nunique() / clients_started
).reset_index(name='completion_rate')
print(completion_rates)

  process_step  completion_rate
0      confirm         0.682721
1        start         1.000000
2       step_1         0.887722
3       step_2         0.817237
4       step_3         0.765747


In [26]:
step_order = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}
df_merged['step_numeric'] = df_merged['process_step'].map(step_order)
# Ensure the data is sorted by client_id and date_time
df_merged = df_merged.sort_values(by=['client_id', 'date_time'])
# Check if the next step is less than the current step
df_merged['error'] = df_merged.groupby('client_id')['step_numeric'].shift(-1) < df_merged['step_numeric']
error_rate = df_merged['error'].mean()
print(error_rate)

0.10226471456216195


In [27]:
error_rate_by_step = df_merged.groupby('process_step')['error'].mean()
print(error_rate_by_step)

process_step
confirm    0.063439
start      0.000000
step_1     0.163266
step_2     0.141296
step_3     0.218799
Name: error, dtype: float64


In [56]:
# Create completion arrays for Control and Test groups
control_completion = control_group['process_step'].apply(lambda x: 1 if x == 'confirm' else 0)
test_completion = test_group['process_step'].apply(lambda x: 1 if x == 'confirm' else 0)

In [50]:
t_stat, p_value = ttest_ind(control_completion, test_completion, equal_var=False)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.10e}")

T-statistic: -14.8044, P-value: 1.4182407520e-49


In [52]:
# Interpret the result
alpha = 0.05
if p_value < alpha:
    print("The difference in completion rates between Control and Test groups is statistically significant.")
else:
    print("The difference in completion rates between Control and Test groups is not statistically significant.")

The difference in completion rates between Control and Test groups is statistically significant.
