In [None]:
import pandas as pd
import os
import glob
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis

In [None]:
# Load the cleaned click-stream logs for both experiment arms.
# `date_time` is parsed into datetime64 while reading: the time-difference
# cell further down subtracts consecutive timestamps, which is not defined
# for the plain strings a default `read_csv` would produce.
web_data_merged_control = pd.read_csv(
    "../data_files/clean/web_data_filtered_control.csv",
    parse_dates=["date_time"],
)
web_data_merged_test = pd.read_csv(
    "../data_files/clean/web_data_filtered_test.csv",
    parse_dates=["date_time"],
)

In [None]:
def remove_consecutive_starts(group, step='start'):
    """Collapse every run of repeated `step` rows down to its last row.

    A row is removed when both it and the row immediately after it have
    ``process_step == step``; all other rows are kept in their original
    order with their original index labels.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to clean (here: the rows of one visit). Must contain a
        ``process_step`` column.
    step : str, default 'start'
        The step value whose consecutive repeats are collapsed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant rows; `group` is not modified.
    """
    is_step = group['process_step'].eq(step)
    # True where the *next* row is also `step`; the last row has no successor.
    followed_by_step = is_step.shift(-1, fill_value=False)
    keep = ~(is_step & followed_by_step)
    # Filter by position (plain boolean array) rather than by label, so rows
    # that happen to share an index label are never dropped together.
    return group[keep.to_numpy()]

# Run the cleanup visit by visit on both experiment arms.
# The frame's columns are selected explicitly after `groupby` so that
# `visit_id` is still part of each group passed to the function and of the
# result: letting `apply` operate on the grouping column implicitly is
# deprecated since pandas 2.2, and later cells need `visit_id`.
web_data_merged_control = (
    web_data_merged_control
    .groupby('visit_id', group_keys=False)[web_data_merged_control.columns.tolist()]
    .apply(remove_consecutive_starts)
)
web_data_merged_test = (
    web_data_merged_test
    .groupby('visit_id', group_keys=False)[web_data_merged_test.columns.tolist()]
    .apply(remove_consecutive_starts)
)

In [None]:
def remove_consecutive_confirm(group, step='confirm'):
    """Collapse every run of repeated `step` rows down to its last row.

    A row is removed when both it and the row immediately after it have
    ``process_step == step``; all other rows are kept in their original
    order with their original index labels.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to clean (here: the rows of one visit). Must contain a
        ``process_step`` column.
    step : str, default 'confirm'
        The step value whose consecutive repeats are collapsed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant rows; `group` is not modified.
    """
    is_step = group['process_step'].eq(step)
    # True where the *next* row is also `step`; the last row has no successor.
    followed_by_step = is_step.shift(-1, fill_value=False)
    keep = ~(is_step & followed_by_step)
    # Filter by position (plain boolean array) rather than by label, so rows
    # that happen to share an index label are never dropped together.
    return group[keep.to_numpy()]

# Run the cleanup visit by visit on both experiment arms.
# The frame's columns are selected explicitly after `groupby` so that
# `visit_id` is still part of each group passed to the function and of the
# result: letting `apply` operate on the grouping column implicitly is
# deprecated since pandas 2.2, and later cells need `visit_id`.
web_data_merged_control = (
    web_data_merged_control
    .groupby('visit_id', group_keys=False)[web_data_merged_control.columns.tolist()]
    .apply(remove_consecutive_confirm)
)
web_data_merged_test = (
    web_data_merged_test
    .groupby('visit_id', group_keys=False)[web_data_merged_test.columns.tolist()]
    .apply(remove_consecutive_confirm)
)

In [None]:
# Flag each row whose `visit_id` equals the `visit_id` of the row directly
# above it. The flag is False whenever the preceding row has a different
# `visit_id`, and for the very first row (nothing to compare against).
web_data_merged_control['visit_id_check'] = web_data_merged_control['visit_id'].eq(
    web_data_merged_control['visit_id'].shift()
)
web_data_merged_test['visit_id_check'] = web_data_merged_test['visit_id'].eq(
    web_data_merged_test['visit_id'].shift()
)

In [None]:
# Time between each row and the row directly above it.
# `date_time` is converted with `pd.to_datetime` first: subtracting raw CSV
# strings raises a TypeError, and the conversion is a no-op when the column
# is already datetime64.
# NOTE(review): positive gaps assume rows are in ascending time order within
# a visit; nothing in this notebook sorts them, so confirm the input order.
web_data_merged_control['time_diff'] = pd.to_datetime(web_data_merged_control['date_time']).diff()
web_data_merged_test['time_diff'] = pd.to_datetime(web_data_merged_test['date_time']).diff()

# Seconds since the previous step of the same visit. Rows whose predecessor
# belongs to a different visit (`visit_id_check` is False) get 0, so gaps
# between two visits never count towards a visit's duration. Vectorised
# replacement for a row-wise `apply(axis=1)`.
web_data_merged_control['time_elapsed'] = (
    web_data_merged_control['time_diff']
    .dt.total_seconds()
    .where(web_data_merged_control['visit_id_check'], 0.0)
)
web_data_merged_test['time_elapsed'] = (
    web_data_merged_test['time_diff']
    .dt.total_seconds()
    .where(web_data_merged_test['visit_id_check'], 0.0)
)

In [None]:
# Total duration of each visit in minutes: sum the per-step seconds within a
# visit, then divide by 60.
duration_visit_control = (
    web_data_merged_control
    .groupby('visit_id')['time_elapsed']
    .sum()
    .div(60)
)
duration_visit_test = (
    web_data_merged_test
    .groupby('visit_id')['time_elapsed']
    .sum()
    .div(60)
)

In [None]:
# Descriptive statistics of the per-visit durations (minutes), control arm.
(
    mean_time_control,
    median_time_control,
    std_time_control,
    min_time_control,
    max_time_control,
) = (
    duration_visit_control.mean(),
    duration_visit_control.median(),
    duration_visit_control.std(),
    duration_visit_control.min(),
    duration_visit_control.max(),
)

# Same statistics for the test arm.
(
    mean_time_test,
    median_time_test,
    std_time_test,
    min_time_test,
    max_time_test,
) = (
    duration_visit_test.mean(),
    duration_visit_test.median(),
    duration_visit_test.std(),
    duration_visit_test.min(),
    duration_visit_test.max(),
)

In [None]:
# Side-by-side summary table: one row per statistic, one column per arm.
stat_labels = ['Mean', 'Median', 'Standard Deviation', 'Minimum', 'Maximum']
control_stats = [
    mean_time_control,
    median_time_control,
    std_time_control,
    min_time_control,
    max_time_control,
]
test_stats = [
    mean_time_test,
    median_time_test,
    std_time_test,
    min_time_test,
    max_time_test,
]
statistics_df = pd.DataFrame(
    {'Control': control_stats, 'Test': test_stats},
    index=stat_labels,
)
statistics_df

In [None]:
# Reshape the two duration series into long form (one row per visit/group
# pair) so seaborn can draw one box per group.
data = pd.DataFrame({'Control': duration_visit_control,'Test': duration_visit_test})
melted_data = data.melt(var_name='Group', value_name='Duration')

# Restrict the y-axis to the pooled 5th-95th percentile range so extreme
# durations do not flatten the boxes.
lower_bound = melted_data['Duration'].quantile(0.05)
upper_bound = melted_data['Duration'].quantile(0.95)

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Group', y='Duration', data=melted_data, hue='Group', dodge=False, palette='pastel', ax=ax)
ax.set_title('Box Plot of Visit Durations by Group')
ax.set_xlabel('Group')
ax.set_ylabel('Duration (minutes)')
ax.set_ylim(lower_bound, upper_bound)
plt.show()