# 3. Success indicators

- **3.1. Test group:**
  - **3.1.1. Completion rate:** The proportion of users who reach the final ‘confirm’ step.
  - **3.1.2. Time spent on each step:** The average duration users spend on each step.
  - **3.1.3. Error rates:** Steps where users go back to a previous step.

\
- **3.2. Control group:**
  - **3.2.1. Completion rate**
  - **3.2.2. Time spent on each step**
  - **3.2.3. Error rates**

\
- **3.3. Redesign outcome:** Given the 3 KPIs, how the new design’s performance compare to the old one?

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML

%matplotlib inline

In [2]:
# Load the dataset, concatenate the 2-part digital_footprint file.
df_digital_footprint_1 = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_web_data_pt_1.txt")
df_digital_footprint_2 = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_web_data_pt_2.txt")
df_digital_footprint = pd.concat([df_digital_footprint_1, df_digital_footprint_2], axis=0, ignore_index=True)

df_experiment_roster = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_experiment_clients.txt")

In [3]:
df_digital_footprint

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [4]:
df_experiment_roster

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


# Dataset exploration

In [5]:
df_digital_footprint.describe(include='object')

Unnamed: 0,visitor_id,visit_id,process_step,date_time
count,755405,755405,755405,755405
unique,130236,158095,5,629363
top,722943003_3441581446,875138661_34710212496_881092,start,2017-05-02 10:07:41
freq,104,104,243945,24


In [6]:
df_digital_footprint.nunique()

client_id       120157
visitor_id      130236
visit_id        158095
process_step         5
date_time       629363
dtype: int64

In [7]:
# 0 null values.
df_digital_footprint.isnull().sum()

client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

In [8]:
# Experiment group is unknown for 20109 users. Drop those.
df_experiment_roster.isnull().sum()
df_experiment_roster = df_experiment_roster.dropna(thresh=2)

In [9]:
# 10764 duplicated rows. Will not drop them.
display(df_digital_footprint.duplicated().any())
display(df_digital_footprint.duplicated().sum())

True

10764

In [None]:
# Cast 'date_time' as datetime type.
df_digital_footprint.info()

In [None]:
df_digital_footprint['date_time'] = pd.to_datetime(df_digital_footprint['date_time'])

In [None]:
# Merge df_experiment_roster with df_digital_footprint to distinguish the groups.
df_digital_footprint = pd.merge(df_digital_footprint, df_experiment_roster, on='client_id')

In [None]:
# The merge introduces change in duplicates.
df_digital_footprint.duplicated().sum()

In [None]:
# Rename 'Variation' to 'experiment_group' for consistency.
df_digital_footprint = df_digital_footprint.rename(columns= {'Variation': 'experiment_group'})

# 3.1. Test group: success indicators

## 3.1.1. completion rate

In [None]:
# Filter for the Test group to measure the proportion of users who reach the final 'confirm' step
df_test_group = df_digital_footprint[df_digital_footprint['experiment_group'] == 'Test']

df_test_confirm = (df_test_group['process_step'] == 'confirm').sum() / len(df_test_group)
print("The proportion of Test group reaching the final 'confirm' step is: ", f"{df_test_confirm:.2%}")

In [None]:
# How this compares to other steps?
steps = df_digital_footprint['process_step'].unique()
steps

for step in steps:
    proportion = (df_test_group['process_step'] == step).sum() / len(df_test_group)
    print(f"{step}: {proportion:.2%}")

## 3.1.2. time spent on each step

In [None]:
# Initialize a dictionary to store mean duration strings for the Test group
test_durations = {}

# Define pairs of steps to calculate the transition duration between
step_pairs = [
    ('start', 'step_1'),
    ('step_1', 'step_2'),
    ('step_2', 'step_3'),
    ('step_3', 'confirm'),
]

# Loop over each pair of steps for the Test group
for start_step, end_step in step_pairs:
    # Filter for relevant steps
    relevant_steps = df_test_group[df_test_group['process_step'].isin([start_step, end_step])]
    
    # Sort by visit_id and date_time to ensure chronological order
    relevant_steps_sorted = relevant_steps.sort_values(by=['visit_id', 'date_time'])
    
    # Calculate the time difference between steps for each visit_id
    relevant_steps_sorted['time_diff'] = relevant_steps_sorted.groupby('visit_id')['date_time'].diff()
    
    # Select end_step rows to use the calculated time differences as durations from start_step to end_step
    end_step_durations = relevant_steps_sorted[relevant_steps_sorted['process_step'] == end_step]
    
    # Calculate the mean duration from start_step to end_step
    test_mean_duration = end_step_durations['time_diff'].mean()
    
    # Format the mean duration string to include days, hours, minutes, and seconds
    test_mean_duration_str = f"{test_mean_duration.days}d {test_mean_duration.components.hours}h {test_mean_duration.components.minutes}m {test_mean_duration.components.seconds}s"
    
    # Store the result in the dictionary keyed by the step pair
    test_durations[(start_step, end_step)] = test_mean_duration_str

## 3.1.3. error rates

In [None]:
# Sort by visit_id and date_time to ensure chronological order
df_test_group = df_test_group.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Assign step orders
step_order = {'start': 0,
               'step_1': 1,
               'step_2': 2,
               'step_3': 3,
               'confirm': 4}


# Detect backward movements
df_test_group['step_order'] = df_test_group['process_step'].map(step_order)

# Calculate the difference in step order to identify backward movements
df_test_group['step_diff'] = df_test_group.groupby(['client_id', 'visit_id'])['step_order'].diff()

# A negative step_diff indicates a backward movement
df_test_group['is_backward'] = df_test_group['step_diff'] < 0

# Proportion of sessions with at least one backward movement
error_sessions = df_test_group[df_test_group['is_backward']].groupby(['client_id', 'visit_id']).ngroups
total_sessions = df_test_group.groupby(['client_id', 'visit_id']).ngroups
test_error_rate = error_sessions / total_sessions

print(f"Proportion of sessions with errors: {test_error_rate:.2%}")

# Count of backward movements by step
errors_by_step = df_test_group[df_test_group['is_backward']]['process_step'].value_counts()
errors_by_step

# 3.2. Control group: success indicators

## 3.2.1. completion rate

In [None]:
# Filter for the Control group to measure the proportion of users who reach the final 'confirm' step
df_control_group = df_digital_footprint[df_digital_footprint['experiment_group'] == 'Control']

df_control_confirm = (df_control_group['process_step'] == 'confirm').sum() / len(df_control_group)
print("The proportion of Control group reaching the final 'confirm' step is: ", f"{df_control_confirm:.2%}")

## 3.2.2. time spent on each step

In [None]:
# Initialize a dictionary to store mean duration strings for the Control group
control_durations = {}

# Define pairs of steps to calculate the transition duration between, as before
step_pairs = [
    ('start', 'step_1'),
    ('step_1', 'step_2'),
    ('step_2', 'step_3'),
    ('step_3', 'confirm'),
]

# Loop over each pair of steps for the Control group
for start_step, end_step in step_pairs:
    # Filter for relevant steps
    relevant_steps = df_control_group[df_control_group['process_step'].isin([start_step, end_step])]
    
    # Sort by visit_id and date_time to ensure chronological order
    relevant_steps_sorted = relevant_steps.sort_values(by=['visit_id', 'date_time'])
    
    # Calculate the time difference between steps for each visit_id
    relevant_steps_sorted['time_diff'] = relevant_steps_sorted.groupby('visit_id')['date_time'].diff()
    
    # Select end_step rows to use the calculated time differences as durations from start_step to end_step
    end_step_durations = relevant_steps_sorted[relevant_steps_sorted['process_step'] == end_step]
    
    # Calculate the mean duration from start_step to end_step
    control_mean_duration = end_step_durations['time_diff'].mean()
    
    # Format the mean duration string to include days, hours, minutes, and seconds
    control_mean_duration_str = f"{control_mean_duration.days}d {control_mean_duration.components.hours}h {control_mean_duration.components.minutes}m {control_mean_duration.components.seconds}s"
    
    # Store the result in the dictionary keyed by the step pair
    control_durations[(start_step, end_step)] = control_mean_duration_str

## 3.2.3. error rates

In [None]:
# Sort by visit_id and date_time to ensure chronological order
df_control_group = df_control_group.sort_values(by=['client_id', 'visit_id', 'date_time'])

# Assign step orders
step_order = {'start': 0,
               'step_1': 1,
               'step_2': 2,
               'step_3': 3,
               'confirm': 4}


# Detect backward movements
df_control_group['step_order'] = df_control_group['process_step'].map(step_order)

# Calculate the difference in step order to identify backward movements
df_control_group['step_diff'] = df_control_group.groupby(['client_id', 'visit_id'])['step_order'].diff()

# A negative step_diff indicates a backward movement
df_control_group['is_backward'] = df_control_group['step_diff'] < 0

# Proportion of sessions with at least one backward movement
error_sessions = df_control_group[df_control_group['is_backward']].groupby(['client_id', 'visit_id']).ngroups
total_sessions = df_control_group.groupby(['client_id', 'visit_id']).ngroups
control_error_rate = error_sessions / total_sessions

print(f"Proportion of sessions with errors: {control_error_rate:.2%}")

# Count of backward movements by step
errors_by_step = df_control_group[df_control_group['is_backward']]['process_step'].value_counts()
errors_by_step

# 3.3. Redesign outcome

In [None]:
# Test group has higher completion rate
print("The proportion of Test group reaching the final 'confirm' step: ", f"{df_test_confirm:.2%}")
print("The proportion of Control group reaching the 'confirm' step is: ", f"{df_control_confirm:.2%}\n")

# Control group has lower average time spent between steps
for start_step, end_step in step_pairs:
    test_str = test_durations.get((start_step, end_step), "No data")
    control_str = control_durations.get((start_step, end_step), "No data")
    print(f"Transition from '{start_step}' to '{end_step}':")
    print(f"  Test group: {test_str}")
    print(f"  Control group: {control_str}\n")

# Control group has lower proportion of sessions with errors.
print(f"Proportion of sessions with errors: {test_error_rate:.2%}")
print(f"Proportion of sessions with errors: {control_error_rate:.2%}")

dropna, inner join, results:

- The proportion of Test group reaching the final 'confirm' step:  14.46%
- The proportion of Control group reaching the 'confirm' step is:  12.20%

- Transition from 'start' to 'step_1':
  - Test group: 0d 0h 0m 49s
  - Control group: 0d 0h 0m 51s

- Transition from 'step_1' to 'step_2':
  - Test group: 0d 0h 0m 58s
  - Control group: 0d 0h 0m 47s

- Transition from 'step_2' to 'step_3':
  - Test group: 0d 0h 1m 39s
  - Control group: 0d 0h 1m 33s

- Transition from 'step_3' to 'confirm':
  - Test group: 0d 0h 2m 9s
  - Control group: 0d 0h 2m 8s

- Proportion of sessions with errors: 26.81%
- Proportion of sessions with errors: 20.30%