In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the four comma-delimited source files into DataFrames, one per file,
# in the same order as the names on the left-hand side.
(
    df_final_demo,
    df_final_experiment_clients,
    df_final_web_data_pt_1,
    df_final_web_data_pt_2,
) = (
    pd.read_csv(source_path, delimiter=',')
    for source_path in (
        'df_final_demo.txt',
        'df_final_experiment_clients.txt',
        'df_final_web_data_pt_1.txt',
        'df_final_web_data_pt_2.txt',
    )
)

In [None]:
# Report how many values are missing in each column before any cleaning.
print(df_final_demo.isnull().sum())

# Impute the gaps that have a sensible default: the median for age and an
# explicit 'Unknown' label for gender.
median_age = df_final_demo['clnt_age'].median()
df_final_demo['clnt_age'] = df_final_demo['clnt_age'].fillna(median_age)
df_final_demo['gendr'] = df_final_demo['gendr'].fillna('Unknown')

# Negative balances are treated as invalid and blanked out. Series.mask does
# this in one vectorized pass (NaN wherever the condition is True) instead of
# calling a Python lambda once per row; the resulting values are the same.
df_final_demo['bal'] = df_final_demo['bal'].mask(df_final_demo['bal'] < 0)

# NOTE: dropna() without arguments removes every row that still holds a NaN in
# *any* column, not only the rows whose balance was blanked above.
df_final_demo = df_final_demo.dropna()

In [None]:
# Quick visual check: show the first five rows of the experiment table.
print(df_final_experiment_clients.head(n=5))

In [None]:
# Sanity check on the experiment table: report the problem when the
# 'Variation' column is absent, otherwise list the distinct labels it holds.
if 'Variation' not in df_final_experiment_clients.columns:
    print("'Variation' column not found. Cannot proceed with analysis.")
else:
    print(df_final_experiment_clients['Variation'].unique())


In [None]:
# Show the distinct labels present before filtering.
print(df_final_experiment_clients['Variation'].unique())

# Keep only rows assigned to one of the two recognised arms. isin() is False
# for anything outside the list (missing values included), so those rows are
# dropped.
valid_variations = ['Test', 'Control']
has_valid_variation = df_final_experiment_clients['Variation'].isin(valid_variations)
df_final_experiment_clients = df_final_experiment_clients[has_valid_variation]

In [None]:
# Stack both parts of the web data under a fresh 0..n-1 index, parse the
# `date_time` strings into datetimes, then remove rows that are exact
# duplicates across all columns.
df_final_web_data = (
    pd.concat([df_final_web_data_pt_1, df_final_web_data_pt_2], ignore_index=True)
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .drop_duplicates()
)

# List the distinct `process_step` values so unexpected ones can be spotted.
print(df_final_web_data['process_step'].unique())


In [None]:
# Join the three tables on client_id. Both joins are inner joins, so only
# clients present in the demographic, experiment AND web tables survive.
merged_data = (
    df_final_demo
    .merge(df_final_experiment_clients, on='client_id', how='inner')
    .merge(df_final_web_data, on='client_id', how='inner')
)

In [None]:
# Persist the merged table; index=False keeps the row index out of the file.
merged_data.to_csv(path_or_buf='cleaned_data.csv', index=False)


In [None]:
# Reload the persisted dataset, forcing 'Variation' to be read as text, and
# show the dtype pandas assigned to every column.
df_all_clean = pd.read_csv(
    "cleaned_data.csv",
    dtype={'Variation': 'str'},
    low_memory=False,
)
print(df_all_clean.dtypes)

In [None]:
# Check the structure of the data.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare: wrapping it in print() appends a stray "None" line.
df_all_clean.info()
print(df_all_clean.head())

In [None]:
# Summary statistics of the numeric columns, followed by the number of rows
# per gender label.
print(
    df_all_clean.describe(),
    df_all_clean['gendr'].value_counts(),
    sep='\n',
)

In [None]:
#Age distribution visualization

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Palette constant defined by this cell; none of the plotting calls below read it.
pastel_color = '#AEC6CF'

# Histogram of client age in 20 bins, drawn on an explicitly created axes.
# Bars, title and axis labels each get their own hex colour.
fig, ax = plt.subplots()
df_all_clean['clnt_age'].hist(bins=20, color='#5F9EA0', edgecolor='white', ax=ax)
ax.set_title('Age Distribution', color='#00CED1')
ax.set_xlabel('Age', color='#008B8B')
ax.set_ylabel('Frequency', color='#20B2AA')
plt.show()

In [None]:
# Scatter of balance against number of accounts; alpha=0.5 keeps overlapping
# points distinguishable.
scatter_ax = df_all_clean.plot.scatter(x='num_accts', y='bal', alpha=0.5)
scatter_ax.set_title('Balance vs. Number of Accounts')
plt.show()

In [None]:
# Mean age and mean balance for each experiment arm, computed over the rows
# of df_all_clean.
by_variation = df_all_clean.groupby('Variation')
print(
    by_variation['clnt_age'].mean(),
    by_variation['bal'].mean(),
    sep='\n',
)


In [None]:
# Completion rate per experiment arm: rows whose process_step is 'confirm'
# divided by all rows of that arm. Note that this is a share of the rows of
# df_all_clean, not a share of distinct clients.
completion_data = df_all_clean[df_all_clean['process_step'] == 'confirm']

# Denominators: every row of each arm.
total_test = len(df_all_clean[df_all_clean['Variation'] == 'Test'])
total_control = len(df_all_clean[df_all_clean['Variation'] == 'Control'])

# Numerators: must be counted on the confirm-only subset. Counting them on the
# full frame makes numerator == denominator, i.e. a rate that is always 1.0.
confirm_test = len(completion_data[completion_data['Variation'] == 'Test'])
confirm_control = len(completion_data[completion_data['Variation'] == 'Control'])

# An arm with no rows yields a rate of 0 instead of a ZeroDivisionError.
test_completion_rate = confirm_test / total_test if total_test > 0 else 0
control_completion_rate = confirm_control / total_control if total_control > 0 else 0

print(f"Test Completion Rate: {test_completion_rate}")
print(f"Control Completion Rate: {control_completion_rate}")


In [None]:
# Parse the timestamps, order every client's rows chronologically, and store
# the gap to that client's previous row in `time_diff` (NaT on each client's
# first row).
df_all_clean = (
    df_all_clean
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .sort_values(by=['client_id', 'date_time'])
    .assign(time_diff=lambda frame: frame.groupby('client_id')['date_time'].diff())
)

# Mean gap, grouped by the process_step of the row the gap ends on.
time_per_step = df_all_clean.groupby('process_step')['time_diff'].mean()
print(time_per_step)


In [None]:
# Calculate error rates with division by zero handling
test_error_rate = error_test / total_steps_test if total_steps_test > 0 else 0
control_error_rate = error_control / total_steps_control if total_steps_control > 0 else 0

print(f"Test Error Rate: {test_error_rate}")
print(f"Control Error Rate: {control_error_rate}")


In [None]:
'''
# Define mapping for process_step
step_mapping = {
    'start': 1,
    'step_1': 2,
    'step_2': 3,
    'step_3': 4,
    'confirm': 5
}

# Map process_step to numeric values
df_all_clean['process_step'] = df_all_clean['process_step'].map(step_mapping)

# Drop rows with NaN in process_step (if any invalid values exist)
df_all_clean = df_all_clean.dropna(subset=['process_step'])

# Calculate step differences
df_all_clean['step_diff'] = df_all_clean.groupby('client_id')['process_step'].diff()

# Identify backward steps
backward_steps = df_all_clean[df_all_clean['step_diff'] < 0]

# Calculate total steps for each group
total_steps_test = len(df_all_clean[df_all_clean['Variation'] == 'Test'])
total_steps_control = len(df_all_clean[df_all_clean['Variation'] == 'Control'])

# Count backward steps for each group
error_test = len(backward_steps[backward_steps['Variation'] == 'Test'])
error_control = len(backward_steps[backward_steps['Variation'] == 'Control'])

# Calculate error rates with zero division handling
test_error_rate = error_test / total_steps_test if total_steps_test > 0 else 0
control_error_rate = error_control / total_steps_control if total_steps_control > 0 else 0

print(f"Test Error Rate: {test_error_rate}")
print(f"Control Error Rate: {control_error_rate}")


In [None]:
# Split the rows by experiment arm.
test_group = df_all_clean.loc[df_all_clean['Variation'] == 'Test']
control_group = df_all_clean.loc[df_all_clean['Variation'] == 'Control']

# For each arm, print a heading followed by the number of rows per process_step.
for heading, group in (
    ("Test group process_step value counts:", test_group),
    ("Control group process_step value counts:", control_group),
):
    print(heading)
    print(group['process_step'].value_counts())

In [None]:
# Split the rows by experiment arm.
is_test = df_all_clean['Variation'] == 'Test'
is_control = df_all_clean['Variation'] == 'Control'
test_group = df_all_clean[is_test]
control_group = df_all_clean[is_control]

# Number of rows in each arm whose process_step is 'confirm'.
test_confirm_count = (test_group['process_step'] == 'confirm').sum()
control_confirm_count = (control_group['process_step'] == 'confirm').sum()

# Share of 'confirm' rows among all rows of the arm; an empty arm gets 0.
if len(test_group) > 0:
    test_completion_rate = test_confirm_count / len(test_group)
else:
    test_completion_rate = 0

if len(control_group) > 0:
    control_completion_rate = control_confirm_count / len(control_group)
else:
    control_completion_rate = 0

print(f"Test Completion Rate: {test_completion_rate}")
print(f"Control Completion Rate: {control_completion_rate}")


In [None]:
# Grouped bar chart comparing the completion rate and the error rate of the
# Test and Control arms.
import matplotlib.pyplot as plt

rates = ['Completion Rate', 'Error Rate']
test_rates = [test_completion_rate, test_error_rate]
control_rates = [control_completion_rate, control_error_rate]

x = range(len(rates))
bar_width = 0.4

# Place the two bars of each pair on either side of the tick. Drawing one
# series centred on x and the other with its left edge on x (both 0.4 wide)
# makes the bars overlap by half a bar and hides part of the data.
test_positions = [pos - bar_width / 2 for pos in x]
control_positions = [pos + bar_width / 2 for pos in x]

plt.bar(test_positions, test_rates, width=bar_width, label='Test', align='center')
plt.bar(control_positions, control_rates, width=bar_width, label='Control', align='center')
plt.xticks(x, rates)
plt.ylabel('Rate')
plt.legend()
plt.title('Test vs Control Rates')
plt.show()

In [None]:
# Show the process_step column (pandas abbreviates long Series when printing).
print(df_all_clean.loc[:, 'process_step'])

In [None]:
# Time between consecutive rows of the same visitor (this cell measures time
# gaps; it does not compute error rates).
# Rows are re-sorted by visitor_id and timestamp, and `time_diff` is
# recomputed per visitor, replacing any `time_diff` column already present.
df_all_clean = (
    df_all_clean
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .sort_values(['visitor_id', 'date_time'])
)
gap_to_previous_row = df_all_clean.groupby('visitor_id')['date_time'].diff()
df_all_clean['time_diff'] = gap_to_previous_row

# Mean gap, grouped by the process_step of the row the gap ends on.
avg_time_per_step = df_all_clean.groupby('process_step')['time_diff'].mean()
print(avg_time_per_step)


In [None]:
from statsmodels.stats.proportion import proportions_ztest

# Two-sample proportions z-test on the share of 'confirm' rows per arm.
# NOTE(review): counts are rows of df_all_clean, and one client can contribute
# several rows -- confirm that row-level counts are the intended unit.
test_group = df_all_clean.loc[df_all_clean['Variation'] == 'Test']
control_group = df_all_clean.loc[df_all_clean['Variation'] == 'Control']

if 'confirm' not in df_all_clean['process_step'].unique():
    # Without any 'confirm' row there is nothing to compare.
    print("The 'confirm' step is missing from process_step data.")
else:
    # Successes = 'confirm' rows per arm; samples = all rows per arm
    # (order: Test, Control).
    successes = [
        len(group[group['process_step'] == 'confirm'])
        for group in (test_group, control_group)
    ]
    samples = [len(test_group), len(control_group)]

    # Only run the test when both arms have rows and at least one completion.
    if min(successes) <= 0 or min(samples) <= 0:
        print("Invalid data for z-test: No completions or zero users in groups.")
    else:
        z_stat, p_value = proportions_ztest(successes, samples)
        print(f"Z-Statistic: {z_stat}, P-Value: {p_value}")
