In [1]:
# Load the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the input datasets.
# All four CSVs live in the same folder, so the folder is named once here;
# change DATA_DIR to point the notebook at another copy of the data.
DATA_DIR = "/Users/oscargonzalezsanchez/Documents/ironhack/projects/datasets/2"

df_final = pd.read_csv(f"{DATA_DIR}/df_final.csv")
exp = pd.read_csv(f"{DATA_DIR}/exp.csv")
data_1 = pd.read_csv(f"{DATA_DIR}/data_1.csv")
data_2 = pd.read_csv(f"{DATA_DIR}/data_2.csv")

In [3]:
# Average time spent between consecutive funnel steps, per experiment group.
#
# Inputs (from the loading cell): data_1, data_2 (web events) and exp
# (client_id -> experiment group). Outputs left in the namespace:
# df_web_data, df_merged (consecutive-step transitions only) and
# avg_time_diff_per_step (one row per group/step, value in seconds).

# Rename the 'Variation' column to 'variation_group'.
# Rebinding instead of inplace=True avoids mutating the loaded frame in place.
exp = exp.rename(columns={'Variation': 'variation_group'})

# Stack the two web-data extracts into a single event table
df_web_data = pd.concat([data_1, data_2], ignore_index=True)

# Left join keeps every web event; events whose client_id is absent from exp
# end up with a missing variation_group.
df_merged = pd.merge(df_web_data, exp, on='client_id', how='left')

# Parse timestamps, then order each client's events chronologically so that
# "the next row of the same client" is the next event in time.
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])
df_merged = df_merged.sort_values(by=['client_id', 'date_time'])

# Look one event ahead within each client (last event of a client gets NaN/NaT).
# NOTE(review): events are paired per client_id only; if a client has several
# sessions, a gap may span two sessions and inflate the averages - confirm
# this is intended.
next_event = df_merged.groupby('client_id')[['process_step', 'date_time']].shift(-1)
df_merged['next_process_step'] = next_event['process_step']
df_merged['next_date_time'] = next_event['date_time']
df_merged['step_time_diff'] = df_merged['next_date_time'] - df_merged['date_time']

# Keep only rows where both the current and the next step belong to the sequence.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the previous frame (SettingWithCopyWarning).
valid_steps = ['step_1', 'step_2', 'step_3', 'confirm']
is_valid_pair = (
    df_merged['process_step'].isin(valid_steps)
    & df_merged['next_process_step'].isin(valid_steps)
)
df_merged = df_merged[is_valid_pair].copy()

# Map the process steps to their position in the funnel
step_order = {'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}
df_merged['process_step_order'] = df_merged['process_step'].map(step_order)
df_merged['next_process_step_order'] = df_merged['next_process_step'].map(step_order)

# Keep only forward moves to the immediately following step
# (drops repeats, back-steps and skipped steps).
is_consecutive = df_merged['next_process_step_order'] == df_merged['process_step_order'] + 1
df_merged = df_merged[is_consecutive].copy()

# Mean transition time per (group, step), expressed in seconds.
# The intermediate timedelta column is never materialised in the result,
# so there is nothing to drop afterwards.
avg_time_diff_per_step = (
    df_merged
    .groupby(['variation_group', 'process_step'])['step_time_diff']
    .mean()
    .dt.total_seconds()
    .rename('avg_time_diff_to_next_step')
    .reset_index()
)

In [4]:
# Display the average time to the next step (in seconds) per group and step
avg_time_diff_per_step

Unnamed: 0,variation_group,process_step,avg_time_diff_to_next_step
0,Control,step_1,34.024979
1,Control,step_2,87.173273
2,Control,step_3,1504.859957
3,Test,step_1,48.221973
4,Test,step_2,462.49011
5,Test,step_3,1504.506992


In [6]:
# Write the per-group step-timing summary to CSV, leaving out the row index
file_path = '/Users/oscargonzalezsanchez/Documents/ironhack/projects/datasets/avg_time_diff_per_step.csv'
avg_time_diff_per_step.to_csv(path_or_buf=file_path, index=False)