In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the four raw exports. Every file is comma-separated text with a .txt
# extension: client demographics, the experiment roster, and the web log,
# which is delivered in two parts.
df_final_demo = pd.read_csv('df_final_demo.txt', sep=',')
df_final_experiment_clients = pd.read_csv('df_final_experiment_clients.txt', sep=',')
df_final_web_data_pt_1 = pd.read_csv('df_final_web_data_pt_1.txt', sep=',')
df_final_web_data_pt_2 = pd.read_csv('df_final_web_data_pt_2.txt', sep=',')

In [None]:
# Report how many values are missing in each demographic column.
print(df_final_demo.isna().sum())

# Impute the two columns we want to keep complete: a missing age becomes the
# median age, a missing gender becomes the explicit label 'Unknown'.
median_age = df_final_demo['clnt_age'].median()
df_final_demo['clnt_age'] = df_final_demo['clnt_age'].fillna(median_age)
df_final_demo['gendr'] = df_final_demo['gendr'].fillna('Unknown')

# A negative balance is treated as invalid and blanked out to NaN.
negative_balance = df_final_demo['bal'] < 0
df_final_demo['bal'] = df_final_demo['bal'].mask(negative_balance)

# NOTE: dropna() without arguments removes every row that still has a NaN in
# ANY column, not only the rows whose balance was just invalidated.
df_final_demo = df_final_demo.dropna()

In [None]:
# Sanity check: show the first five rows of the experiment roster.
print(df_final_experiment_clients.head(n=5))

In [None]:
# List the distinct experiment labels, or say why that is not possible when
# the roster has no 'Variation' column.
if 'Variation' not in df_final_experiment_clients.columns:
    print("'Variation' column not found. Cannot proceed with analysis.")
else:
    print(df_final_experiment_clients['Variation'].unique())


In [None]:
# Show which labels are present before filtering.
print(df_final_experiment_clients['Variation'].unique())

# Keep only clients assigned to one of the two experiment arms. Any other
# label, including a missing one, fails the isin() check and is dropped.
valid_variations = ['Test', 'Control']
has_valid_variation = df_final_experiment_clients['Variation'].isin(valid_variations)
df_final_experiment_clients = df_final_experiment_clients.loc[has_valid_variation]

In [None]:
# Build the full web log in one pass: stack both parts under a fresh 0..n-1
# index, parse `date_time` into real timestamps, then remove rows that are
# exact duplicates across all columns.
web_log_parts = [df_final_web_data_pt_1, df_final_web_data_pt_2]
df_final_web_data = (
    pd.concat(web_log_parts, ignore_index=True)
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .drop_duplicates()
)

# Eyeball the distinct `process_step` labels to confirm they are all known.
print(df_final_web_data['process_step'].unique())


In [None]:
# Join demographics -> experiment roster -> web log on client_id. Both joins
# are inner joins, so a client missing from any of the three tables is
# dropped from the result.
merged_data = (
    df_final_demo
    .merge(df_final_experiment_clients, on='client_id', how='inner')
    .merge(df_final_web_data, on='client_id', how='inner')
)

In [None]:
# Persist the merged table (without the index column) so later cells can
# reload it from disk.
cleaned_csv_path = 'cleaned_data.csv'
merged_data.to_csv(cleaned_csv_path, index=False)


In [None]:
# Reload the cleaned table from disk and list the dtype of every column.
# CSV stores timestamps as plain text, so `date_time` is parsed again here;
# without parse_dates it would silently come back as strings even though it
# was converted to datetime before the file was written.
df_all_clean = pd.read_csv(
    "cleaned_data.csv",
    dtype={'Variation': 'str'},
    parse_dates=['date_time'],
    low_memory=False,
)
print(df_all_clean.dtypes)

In [None]:
# Check the structure of the data.
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None".
df_all_clean.info()
print(df_all_clean.head())

In [None]:
# Descriptive statistics for the numeric columns of the cleaned table.
numeric_summary = df_all_clean.describe()
print(numeric_summary)

# Number of rows per gender label.
gender_counts = df_all_clean['gendr'].value_counts()
print(gender_counts)

In [None]:
# Age distribution of the clients in the merged data set.

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Pastel blue; defined here but not used by the plot below.
pastel_color = '#AEC6CF'

# merged_data was joined with the web log, so a client appears once per
# logged event. Reduce to one row per client first; otherwise each client's
# age is counted as many times as they have events and the histogram is
# skewed towards the most active clients.
client_ages = merged_data.drop_duplicates(subset='client_id')['clnt_age']

# Cadet-blue bars with white edges; title and axis labels in teal shades.
ax = client_ages.hist(bins=20, color='#5F9EA0', edgecolor='white')
ax.set_title('Age Distribution', color='#00CED1')
ax.set_xlabel('Age', color='#008B8B')
ax.set_ylabel('Frequency', color='#20B2AA')
plt.show()

In [None]:
# Balance against number of accounts, one point per client.
# merged_data holds one row per web event, so plotting it directly stacks
# identical points for every event of the same client; that defeats the
# alpha blending and hides how many distinct clients sit at each spot.
client_profiles = merged_data.drop_duplicates(subset='client_id')
client_profiles.plot.scatter(x='num_accts', y='bal', alpha=0.5)
plt.title('Balance vs. Number of Accounts')
plt.show()


In [None]:
# Mean age and mean balance per experiment arm, computed over clients.
# merged_data has one row per web event, so averaging it directly would
# weight every client by their number of events; keep one row per client.
client_profiles = merged_data.drop_duplicates(subset='client_id')
print(client_profiles.groupby('Variation')['clnt_age'].mean())
print(client_profiles.groupby('Variation')['bal'].mean())


In [None]:
# Rows in which a client reached the final 'confirm' step.
completion_data = merged_data[merged_data['process_step'] == 'confirm']

# Completion rate = clients who confirmed / clients in the group.
# merged_data has one row per web event, so counting rows would instead give
# the share of 'confirm' events among all logged events. Distinct client_id
# values are counted on both sides of the ratio.
total_test = merged_data.loc[merged_data['Variation'] == 'Test', 'client_id'].nunique()
total_control = merged_data.loc[merged_data['Variation'] == 'Control', 'client_id'].nunique()
confirm_test = completion_data.loc[completion_data['Variation'] == 'Test', 'client_id'].nunique()
confirm_control = completion_data.loc[completion_data['Variation'] == 'Control', 'client_id'].nunique()

# An empty group yields NaN instead of raising ZeroDivisionError.
test_completion_rate = confirm_test / total_test if total_test else float('nan')
control_completion_rate = confirm_control / total_control if total_control else float('nan')

print(f"Test Completion Rate: {test_completion_rate}")
print(f"Control Completion Rate: {control_completion_rate}")


In [None]:
# Make sure the timestamps are real datetimes (a no-op if they already are).
merged_data['date_time'] = pd.to_datetime(merged_data['date_time'])

# Order every client's events chronologically.
merged_data = merged_data.sort_values(by=['client_id', 'date_time'])

# Time elapsed since the same client's previous event (NaT on their first).
merged_data['time_diff'] = merged_data.groupby('client_id')['date_time'].diff()

# Time spent ON a step is the gap until the client's NEXT event, not the gap
# since the previous one. Averaging `time_diff` by `process_step` would
# credit each step with the time spent on the step before it.
# NOTE(review): gaps are taken per client, so the pause between two separate
# sessions of the same client is included - confirm whether a per-visit
# grouping is wanted instead.
next_event_time = merged_data.groupby('client_id')['date_time'].shift(-1)
merged_data['time_on_step'] = next_event_time - merged_data['date_time']

# Average time spent per step (a client's last event has no next event and
# is skipped by mean()).
time_per_step = merged_data.groupby('process_step')['time_on_step'].mean()
print(time_per_step)


In [None]:
# Split the cleaned data by experiment arm. The slices are copied so that
# columns can later be assigned on them without SettingWithCopyWarning.
test_group = df_all_clean[df_all_clean['Variation'] == 'Test'].copy()
control_group = df_all_clean[df_all_clean['Variation'] == 'Control'].copy()


def _client_completion_rate(group_events):
    """Return the share of distinct clients in `group_events` that confirmed.

    `group_events` holds one row per web event, so distinct `client_id`
    values are counted instead of rows; a row-based ratio would measure the
    share of 'confirm' events among all events. Returns NaN for an empty
    group and 0.0 when nobody confirmed, rather than raising.
    """
    total_clients = group_events['client_id'].nunique()
    if total_clients == 0:
        return float('nan')
    reached_confirm = group_events['process_step'] == 'confirm'
    confirmed_clients = group_events.loc[reached_confirm, 'client_id'].nunique()
    return confirmed_clients / total_clients


# Calculate completion rates
test_completion_rate = _client_completion_rate(test_group)
control_completion_rate = _client_completion_rate(control_group)

print(f"Test Completion Rate: {test_completion_rate}")
print(f"Control Completion Rate: {control_completion_rate}")



In [None]:
# Detect backward steps: events where a visitor moves to an earlier step of
# the process than the one they were on just before.

# Map process_step labels to their position in the funnel. The mapping is
# applied only while the column still holds labels; mapping an already
# numeric column would turn every value into NaN when this cell is re-run.
step_mapping = {'start': 1, 'step_1': 2, 'step_2': 3, 'step_3': 4, 'confirm': 5}
if not pd.api.types.is_numeric_dtype(df_final_web_data['process_step']):
    df_final_web_data['process_step'] = df_final_web_data['process_step'].map(step_mapping)

# Ensure visitor_id columns are of the same type. The groups are rebuilt
# with assign() instead of being written into, which avoids
# SettingWithCopyWarning when they are slices of another frame.
df_final_web_data['visitor_id'] = df_final_web_data['visitor_id'].astype(str)
test_group = test_group.assign(visitor_id=test_group['visitor_id'].astype(str))
control_group = control_group.assign(visitor_id=control_group['visitor_id'].astype(str))

# Step differences are only meaningful between chronologically consecutive
# events, so the diff is taken on a copy sorted by visitor and time. The
# result is aligned back by index, leaving the row order of
# df_final_web_data untouched (this relies on its index being unique).
chronological = df_final_web_data.sort_values(['visitor_id', 'date_time'])
df_final_web_data['step_diff'] = chronological.groupby('visitor_id')['process_step'].diff()

# A negative difference means the visitor went back to an earlier step.
backward_steps = df_final_web_data[df_final_web_data['step_diff'] < 0]

# Debug backward steps
print("Total backward steps:", len(backward_steps))

# Flag which backward steps belong to visitors of each experiment arm.
test_visitors_in_backward_steps = backward_steps['visitor_id'].isin(test_group['visitor_id'])
control_visitors_in_backward_steps = backward_steps['visitor_id'].isin(control_group['visitor_id'])

print("Backward steps for Test group:", test_visitors_in_backward_steps.sum())
print("Backward steps for Control group:", control_visitors_in_backward_steps.sum())

# Ensure groups are not empty to avoid dividing by zero.
if len(test_group) == 0 or len(control_group) == 0:
    print("One of the groups is empty. Cannot calculate error rates.")
else:
    # Error rate = backward steps per row of the group's data.
    test_error_rate = len(backward_steps[test_visitors_in_backward_steps]) / len(test_group)
    control_error_rate = len(backward_steps[control_visitors_in_backward_steps]) / len(control_group)

    print(f"Test Error Rate: {test_error_rate}")
    print(f"Control Error Rate: {control_error_rate}")


In [None]:
import matplotlib.pyplot as plt

# Count the backward steps made by visitors of each experiment arm.
made_by_test = backward_steps['visitor_id'].isin(test_group['visitor_id'])
made_by_control = backward_steps['visitor_id'].isin(control_group['visitor_id'])
test_backward_steps = int(made_by_test.sum())
control_backward_steps = int(made_by_control.sum())

if len(test_group) > 0 and len(control_group) > 0:
    # Error rate = backward steps per row of the group's data.
    test_error_rate = test_backward_steps / len(test_group)
    control_error_rate = control_backward_steps / len(control_group)

    # Data for visualization
    groups = ['Test', 'Control']
    backward_steps_counts = [test_backward_steps, control_backward_steps]
    error_rates = [test_error_rate, control_error_rate]

    # One bar chart per metric: (bar heights, legend label, colour, y-axis
    # label, title). Raw counts are drawn first, then the rates.
    chart_specs = [
        (backward_steps_counts, 'Backward Steps', 'blue',
         'Number of Backward Steps',
         'Backward Steps Comparison: Test vs. Control Groups'),
        (error_rates, 'Error Rate', 'green',
         'Error Rate',
         'Error Rate Comparison: Test vs. Control Groups'),
    ]
    for bar_heights, legend_label, bar_color, y_label, chart_title in chart_specs:
        plt.figure(figsize=(10, 6))
        plt.bar(groups, bar_heights, alpha=0.7, label=legend_label, color=bar_color)
        plt.ylabel(y_label)
        plt.title(chart_title)
        plt.legend()
        plt.show()
else:
    # Without rows in both groups the rates would divide by zero.
    print("One of the groups is empty. Cannot visualize error rates.")
