In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
client_data = "../data/clean/clean_client_data.csv"
analysis_data = "../data/clean/analysis_data.csv"


In [109]:
client_df = pd.read_csv(client_data)
analysis_df = pd.read_csv(analysis_data)


In [110]:
analysis_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,visit_final_step,step_duration,step_proceeds,step_error
0,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test,False,7.0,True,False
1,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test,False,32.0,True,False
2,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test,False,99.0,True,False
3,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test,False,20.0,True,False
4,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test,True,,False,False


In [111]:
analysis_df.pivot_table(index='process_step',
                        columns='Variation',
                        values=['visit_final_step','step_proceeds','step_error','step_duration'],
                        aggfunc='mean').round(2)

Unnamed: 0_level_0,step_duration,step_duration,step_error,step_error,step_proceeds,step_proceeds,visit_final_step,visit_final_step
Variation,Control,Test,Control,Test,Control,Test,Control,Test
process_step,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
confirm,221.45,192.46,0.0,0.0,0.0,0.0,0.96,0.99
start,39.41,32.38,0.0,0.0,0.68,0.71,0.31,0.29
step_1,55.39,72.09,0.07,0.13,0.75,0.71,0.12,0.08
step_2,94.31,91.99,0.06,0.11,0.82,0.78,0.06,0.04
step_3,146.74,165.05,0.1,0.09,0.67,0.7,0.1,0.07


In [112]:
analysis_df[analysis_df['step_proceeds']==True].pivot_table(index='process_step',
                                                            columns='Variation',
                                                            values=['step_duration'],
                                                            aggfunc='mean').round(2)

Unnamed: 0_level_0,step_duration,step_duration
Variation,Control,Test
process_step,Unnamed: 1_level_2,Unnamed: 2_level_2
start,37.97,31.69
step_1,33.82,37.14
step_2,87.01,86.51
step_3,135.64,143.07


In [113]:
def successful_visit_col(df=analysis_df):
    analysis_df = df.copy()
    start_visit_ids = list(analysis_df[(analysis_df['process_step'] == 'start') & (analysis_df['step_proceeds'] == True)]['visit_id'])
    step_1_visit_ids = list(analysis_df[(analysis_df['process_step'] == 'step_1') & (analysis_df['step_proceeds'] == True)]['visit_id'])
    step_2_visit_ids = list(analysis_df[(analysis_df['process_step'] == 'step_2') & (analysis_df['step_proceeds'] == True)]['visit_id'])
    step_3_visit_ids = list(analysis_df[(analysis_df['process_step'] == 'step_3') & (analysis_df['step_proceeds'] == True)]['visit_id'])
    confirm_visit_ids = list(analysis_df[analysis_df['process_step']=='confirm']['visit_id'])
    successful_visits = list(set(start_visit_ids) & set(step_1_visit_ids) & set(step_2_visit_ids) & set(step_3_visit_ids) & set(confirm_visit_ids))
    analysis_df['successful_visit'] = analysis_df['visit_id'].isin(successful_visits)
    return analysis_df

In [249]:
successful_visit_col(analysis_df)[['visit_id','process_step','date_time','successful_visit']].head(20)

Unnamed: 0,visit_id,process_step,date_time,successful_visit
0,637149525_38041617439_716659,start,2017-04-15 12:57:56,True
1,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,True
2,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,True
3,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,True
4,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,True
5,40369564_40101682850_311847,start,2017-04-12 15:41:28,True
6,40369564_40101682850_311847,step_1,2017-04-12 15:41:35,True
7,40369564_40101682850_311847,step_2,2017-04-12 15:41:53,True
8,40369564_40101682850_311847,step_3,2017-04-12 15:45:02,True
9,40369564_40101682850_311847,confirm,2017-04-12 15:47:45,True


In [251]:
def successful_visit_durations(df=analysis_df):
    experiment_df = successful_visit_col(df)
    success_df = experiment_df[experiment_df['successful_visit']==True]
    duration_df = success_df[(success_df['process_step']=='start') | (success_df['process_step']=='confirm')]
    duration_df = duration_df[['visit_id', 'Variation', 'process_step', 'date_time']]

    duration_df_pivot = duration_df.pivot_table(index=['Variation', 'visit_id'],
                                            columns='process_step',
                                            values='date_time',
                                            aggfunc='first')

    duration_df_pivot['confirm'] = pd.to_datetime(duration_df_pivot['confirm'])
    duration_df_pivot['start'] = pd.to_datetime(duration_df_pivot['start'])
    duration_df_pivot['duration'] = (duration_df_pivot['confirm'] - duration_df_pivot['start']).dt.total_seconds()

    success_durations = duration_df_pivot.reset_index()[['Variation', 'visit_id', 'duration']]

    results = success_durations.pivot_table(index=['Variation'],
                                            values='duration',
                                            aggfunc='mean').round(2)
    return(results)

In [283]:
def success_rate(df=analysis_df):
    success_col_df = successful_visit_col(df)
    success_pivot = success_col_df.pivot_table(index='Variation',
                                                values='successful_visit',
                                                aggfunc='mean').round(4)*100
    return(success_pivot)

In [285]:
successful_visit_durations(analysis_df)

process_step,duration
Variation,Unnamed: 1_level_1
Control,317.33
Test,318.31


In [287]:
success_rate(analysis_df)

Unnamed: 0_level_0,successful_visit
Variation,Unnamed: 1_level_1
Control,66.54
Test,65.87


In [257]:
experiment_df = successful_visit_col(analysis_df)
success_df = experiment_df[experiment_df['successful_visit']==True]
duration_df = success_df[(success_df['process_step']=='start') | (success_df['process_step']=='confirm')]
duration_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,visit_final_step,step_duration,step_proceeds,step_error,successful_visit
0,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test,False,7.0,True,False,True
4,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test,True,,False,False,True
5,647,66758770_53988066587,40369564_40101682850_311847,start,2017-04-12 15:41:28,Test,False,7.0,True,False,True
9,647,66758770_53988066587,40369564_40101682850_311847,confirm,2017-04-12 15:47:45,Test,True,,False,False,True
26,1195,766842522_69992551638,393817425_39015278493_996341,start,2017-04-05 20:15:26,Control,False,33.0,True,False,True


In [259]:
duration_df = duration_df[['visit_id', 'Variation', 'process_step', 'date_time']]

duration_df_pivot = duration_df.pivot_table(index=['Variation', 'visit_id'],
                                        columns='process_step',
                                        values='date_time',
                                        aggfunc='first')

duration_df_pivot

Unnamed: 0_level_0,process_step,confirm,start
Variation,visit_id,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,10006594_66157970412_679648,2017-04-13 11:56:12,2017-04-13 11:50:18
Control,10007589_47780784567_391490,2017-05-18 08:03:33,2017-05-18 07:51:32
Control,100254180_47139859079_984581,2017-04-05 21:47:43,2017-04-05 21:42:02
Control,100309269_21684743336_936307,2017-04-16 06:23:54,2017-04-16 06:19:54
Control,100471971_3065983298_584030,2017-05-04 07:53:11,2017-05-04 07:50:37
...,...,...,...
Test,999954858_74676709104_879685,2017-04-05 11:15:41,2017-04-05 11:13:34
Test,999958344_67534252886_39917,2017-04-15 00:36:59,2017-04-15 00:34:32
Test,999971096_28827267783_236076,2017-04-13 10:34:08,2017-04-13 10:31:49
Test,999976049_95772503197_182554,2017-04-04 13:02:18,2017-04-04 12:50:10


In [261]:
duration_df_pivot['confirm'] = pd.to_datetime(duration_df_pivot['confirm'])
duration_df_pivot['start'] = pd.to_datetime(duration_df_pivot['start'])
duration_df_pivot['duration'] = (duration_df_pivot['confirm'] - duration_df_pivot['start']).dt.total_seconds()

duration_df_pivot

Unnamed: 0_level_0,process_step,confirm,start,duration
Variation,visit_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,10006594_66157970412_679648,2017-04-13 11:56:12,2017-04-13 11:50:18,354.0
Control,10007589_47780784567_391490,2017-05-18 08:03:33,2017-05-18 07:51:32,721.0
Control,100254180_47139859079_984581,2017-04-05 21:47:43,2017-04-05 21:42:02,341.0
Control,100309269_21684743336_936307,2017-04-16 06:23:54,2017-04-16 06:19:54,240.0
Control,100471971_3065983298_584030,2017-05-04 07:53:11,2017-05-04 07:50:37,154.0
...,...,...,...,...
Test,999954858_74676709104_879685,2017-04-05 11:15:41,2017-04-05 11:13:34,127.0
Test,999958344_67534252886_39917,2017-04-15 00:36:59,2017-04-15 00:34:32,147.0
Test,999971096_28827267783_236076,2017-04-13 10:34:08,2017-04-13 10:31:49,139.0
Test,999976049_95772503197_182554,2017-04-04 13:02:18,2017-04-04 12:50:10,728.0


In [265]:
success_durations = duration_df_pivot.reset_index()[['Variation', 'visit_id', 'duration']]

results = success_durations.pivot_table(index=['Variation'],
                                        values='duration',
                                        aggfunc='mean').round(2)

results

process_step,duration
Variation,Unnamed: 1_level_1
Control,317.33
Test,318.31
