# Question 1
Based on this data, what can we conclude at this point from the A/B test (in which we tried initiating the background check earlier in the hiring process for the treatment shoppers)? And how confident should we be in this conclusion?


## Exploratory data analysis

In [None]:
# import package
import pandas as pd

In [None]:
# Load the application data from the CSV (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='application.csv')
df.head(n=5)

In [None]:
# Change data types.
# applicant_id is an identifier, not a quantity, so keep it as a generic object column.
df['applicant_id'] = df['applicant_id'].astype('object')
# Parse event dates with pd.to_datetime: casting to the unit-less 'datetime64'
# dtype via astype() is rejected by recent pandas releases, whereas
# to_datetime works across versions and yields datetime64[ns].
df['event_date'] = pd.to_datetime(df['event_date'])

In [None]:
# Distinct applicants in each experiment group (returned as a DataFrame).
df.groupby('group')[['applicant_id']].nunique()

In [None]:
# Non-null applicant_id rows (i.e. event records) in each experiment group (returned as a DataFrame).
df.groupby('group')[['applicant_id']].count()

In [None]:
# Check the conversion funnel and its step order for each group:
# distinct applicants per event, largest step first.
for group_name in ('control', 'treatment'):
    group_events = df[df['group'] == group_name]
    funnel = group_events.groupby(['group', 'event']).applicant_id.nunique()
    print(funnel.sort_values(ascending=False))

In [None]:
## numerator: distinct applicants reaching each event, per group
applicants_per_event = df.groupby(['group', 'event']).applicant_id.nunique().sort_values(ascending=False)
cvr = applicants_per_event.to_frame()
cvr = cvr.sort_values(['group', 'applicant_id'], ascending=False).reset_index()

## denominator: the largest step of each group's funnel
demoninator = cvr.groupby('group').applicant_id.max().to_frame().reset_index()

## rate: each step relative to the largest step of its own group
# Both frames carry an 'applicant_id' column, so the merge suffixes them:
# _x is the per-event count, _y is the per-group maximum.
cvr1 = cvr.merge(demoninator, left_on='group', right_on='group').reset_index(drop=True)
cvr1['rate'] = cvr1['applicant_id_x'].div(cvr1['applicant_id_y'])
cvr1

In [None]:
# Check the experiment duration: earliest and latest event dates, then their difference.
experiment_start = df['event_date'].min()
experiment_end = df['event_date'].max()
print(experiment_start)
print(experiment_end)

experiment_end - experiment_start

In [None]:
# Reshape to one row per (group, applicant) with one column per event;
# each cell holds the latest date recorded for that event.
df_transpose = pd.pivot_table(
    df,
    values='event_date',
    index=['group', 'applicant_id'],
    columns='event',
    aggfunc='max',
)
# Display with the index flattened into columns (df_transpose itself keeps its MultiIndex).
df_transpose.reset_index()

In [None]:
# What is the distribution of the number of days between application_date
# and first_batch_completed_date?
import seaborn as sns
duration = (df_transpose['first_batch_completed_date'] - df_transpose['application_date'])
# Applicants missing either date give a missing duration; drop them before
# converting to whole days.
completed_days = duration.dropna().dt.days
# Pass real booleans for the cumulative flags: the string 'True' only worked
# because any non-empty string is truthy (so 'False' would have enabled it too).
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases —
# consider histplot/ecdfplot once the environment's seaborn version is confirmed.
sns.distplot(completed_days, hist_kws={'cumulative': True}, kde_kws={'cumulative': True})

## Sample Size

In [None]:
# Define the conversion window.
# application date range from [2018-10-01 00:00:00, 2018-10-31 00:00:00]  80% -> duration 11 days

# Applicants qualify when they have an application_date event on or before the cutoff.
is_application = df['event'] == 'application_date'
before_cutoff = df['event_date'] <= '2018-10-31'
qualified_ids = df.loc[is_application & before_cutoff, 'applicant_id'].unique()

qualified_applicant_id = pd.DataFrame(qualified_ids)
qualified_applicant_id.columns = ['qualified_applicant_id']

In [None]:
# Filter out applicants who are not qualified for this experiment.
# The inner merge keeps only events whose applicant_id appears in the
# qualified list; the helper key column is then dropped by re-selecting columns.
kept_columns = ['applicant_id', 'channel', 'group', 'city', 'event', 'event_date']
df = (
    df.merge(qualified_applicant_id, how='inner',
             left_on='applicant_id', right_on='qualified_applicant_id')
      .reset_index(drop=True)
      .loc[:, kept_columns]
)
df.head()

In [None]:
# Check the conversion funnel again on the filtered data:
# distinct applicants per event within each group, largest step first.
for group_name in ('control', 'treatment'):
    group_events = df[df['group'] == group_name]
    funnel = group_events.groupby(['group', 'event']).applicant_id.nunique()
    print(funnel.sort_values(ascending=False))

# Checking the invariant metrics

In [None]:
# sanity check - sample size
# define invariate check function

def invariant_check(test_sample_size, control_sample_size, p_pool, z_score):
    """Sanity-check the split of applicants between the two groups.

    Builds a confidence interval around the expected proportion ``p_pool``
    using the binomial standard error for the combined sample, and reports
    the observed share of the control group for comparison.

    Parameters
    ----------
    test_sample_size : number of units in the treatment group.
    control_sample_size : number of units in the control group.
    p_pool : expected proportion the interval is centred on.
    z_score : critical value that scales the standard error into a margin.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound, control_proportion)``.
    """
    import numpy as np

    total_sample_size = test_sample_size + control_sample_size
    standard_error = np.sqrt(p_pool * (1 - p_pool) / total_sample_size)
    half_width = standard_error * z_score
    observed_control_share = control_sample_size / total_sample_size
    return p_pool - half_width, p_pool + half_width, observed_control_share

# Count distinct applicants per group once, then look up each arm.
applicants_per_group = df.groupby('group').agg({'applicant_id': 'nunique'})['applicant_id']
test_sample = applicants_per_group.loc['treatment']
control_sample = applicants_per_group.loc['control']

# NOTE(review): 0.5 assumes the experiment was designed as an even split — confirm.
lower, upper, control_prop = invariant_check(test_sample, control_sample, 0.5, 1.96)

report_lines = (
    ('confidence interval lower bounds: {}', lower),
    ('confidence interval upper bounds: {}', upper),
    ('control proportion: {}', control_prop),
)
for template, value in report_lines:
    print (template.format(value))

# Checking the evaluation metric

In [None]:
import scipy.stats as stats

# choose metrics
# conversion rate = complete first batch / application 
# calculate z score

# define z_score function
def z_score_pool(test_success, control_success, test_sample_size, control_sample_size):
    """Pooled two-proportion z statistic for treatment versus control.

    Parameters
    ----------
    test_success, control_success : number of successes in each group.
    test_sample_size, control_sample_size : number of units in each group.

    Returns
    -------
    The difference in success rates (treatment minus control) divided by the
    standard error computed from the pooled success rate.
    """
    import numpy as np

    test_rate = test_success / test_sample_size
    control_rate = control_success / control_sample_size
    pooled_rate = (test_success + control_success) / (test_sample_size + control_sample_size)
    pooled_se = np.sqrt(
        pooled_rate * (1 - pooled_rate) * (1 / test_sample_size + 1 / control_sample_size)
    )
    # The null hypothesis is a zero difference, so nothing is subtracted from the numerator.
    return (test_rate - control_rate) / pooled_se

# Observed counts for the evaluation metric (first batch completed / applicants).
# NOTE(review): the control sample size here (10024) differs from the 8582 used
# in the Question 2 cost calculation — confirm which figure is intended.
test_complete = 2115
contrl_complete = 2678
test_sample_size = 4958
control_sample_size = 10024
alpha = 0.05
dmin = 0  # no practical significance threshold provided, so assume dmin = 0

z = z_score_pool(test_complete, contrl_complete, test_sample_size, control_sample_size)

# One-sided test (H1: treatment converts better than control). The survival
# function is used instead of 1 - cdf(z), which rounds to exactly 0 for large z.
p_value = stats.norm.sf(z)
z_critical = stats.norm.ppf(1 - alpha)

# Report the conclusion that the data actually support, rather than asserting
# rejection regardless of the computed statistic.
if p_value < alpha:
    print ('z score is %s > %.3f (p-value = %.3g < alpha = %s), so we can reject null hypothesis, '
           'and accept alternative hypothesis' % (z, z_critical, p_value, alpha))
else:
    print ('z score is %s <= %.3f (p-value = %.3g >= alpha = %s), so we fail to reject null hypothesis'
           % (z, z_critical, p_value, alpha))

# Question 2 
- The background check costs us $30 to complete!  
- We'd like to know if this change is cost-effective. How should we think about the cost-effectiveness of this change? Please be as specific as you can here

In [None]:
cost = 30  # dollars per background check

# formula: cost per success = (cost per check * checks) / successes
# NOTE(review): the counts below are presumably the number of background checks
# and of first-batch completions per group; the control figure (8582) differs
# from the control sample size of 10024 used in the z-test — confirm.
control_checks, control_success = 8582, 2678
test_checks, test_success = 4958, 2115

# Use the `cost` constant instead of repeating the literal, so changing the
# price in one place updates both groups.
control = (cost * control_checks) / control_success
test = (cost * test_checks) / test_success
print (control)
print (test)

# Question 3

In [None]:
# df_transpose is indexed by (group, applicant_id) only, so it has no 'channel'
# key and grouping it by channel raises a KeyError. Re-pivot the event data
# with channel kept in the index: one row per (group, channel, applicant),
# one column per event.
# NOTE(review): this uses `df` as it stands at this point in the notebook,
# i.e. after the qualified-applicant filter — confirm that is the intended population.
events_by_applicant_channel = df.pivot_table(index=['group', 'channel', 'applicant_id'],
                                             columns='event', values='event_date', aggfunc='max')

# Count applicants with a non-null date for each step, per acquisition channel.
channel_cvr = events_by_applicant_channel.groupby(level='channel').agg(
    {'application_date': 'count', 'first_batch_completed_date': 'count'})
channel_cvr['cvr'] = channel_cvr['first_batch_completed_date']/channel_cvr['application_date']
channel_cvr.head()

In [None]:
# df_transpose is indexed by (group, applicant_id) only, so 'channel' is not
# available as a grouping key and the original groupby raises a KeyError.
# Re-pivot the event data with channel kept in the index: one row per
# (group, channel, applicant), one column per event.
# NOTE(review): this uses `df` as it stands at this point in the notebook,
# i.e. after the qualified-applicant filter — confirm that is the intended population.
events_by_applicant_channel = df.pivot_table(index=['group', 'channel', 'applicant_id'],
                                             columns='event', values='event_date', aggfunc='max')

# Count applicants with a non-null date for each step, per (group, channel).
channel_cvr_group = events_by_applicant_channel.groupby(level=['group', 'channel']).agg(
    {'application_date': 'count', 'first_batch_completed_date': 'count'})
channel_cvr_group['cvr'] = channel_cvr_group['first_batch_completed_date']/channel_cvr_group['application_date']
channel_cvr_group