In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
import time
from sklearn.metrics import auc, roc_curve, classification_report
from scipy import stats

#import h2o
#from h2o.frame import H2OFrame
#from h2o.estimators.random_forest import H2ORandomForestEstimator
#from h2o.grid.grid_search import H2OGridSearch

%matplotlib inline
sns.set(style="white",context="talk")

# Recap
1. Can you estimate the impact the program had on the site?
2. Based on the data, what would you suggest to do as a next step?

# 1. Data Preview and Processing

In [2]:
data = pd.read_csv("referral.csv",sep = ",")

In [3]:
def view(df):
    # Exploring the data types, number of unique values and missing values
    cols = df.columns
    print(f"The dataset consists of {df.shape[0]} rows and {df.shape[1]} columns")
    print(f"The columns are: {df.columns.tolist()}")
    return pd.concat([pd.DataFrame({"data_types":df.dtypes, 
              "value_counts": df.nunique(),
             "null_counts": df.isnull().sum()}).T,df.iloc[0:3,:]],axis = 0)

In [4]:
data = data.rename(columns ={"date":"day"})
view(data)

The dataset consists of 97341 rows and 6 columns
The columns are: ['user_id', 'day', 'country', 'money_spent', 'is_referral', 'device_id']


Unnamed: 0,user_id,day,country,money_spent,is_referral,device_id
data_types,int64,object,object,int64,int64,object
value_counts,18809,56,9,165,2,17887
null_counts,0,0,0,0,0,0
0,2,2015-10-03,FR,65,0,EVDCJTZMVMJDG
1,3,2015-10-03,CA,54,0,WUBZFTVKXGQQX
2,6,2015-10-03,FR,35,0,CBAPCJRTFNUJG


In [5]:
def add_date_factors(df, ts_col, is_datetime):
    '''
    Assumption: the currect ts_col is a timestamp datatype
    
    Input:
    df (dataframe, target dataframe to be transformed)
    ts_col(string, name of the timestamp column)
    is_datetime(boolean, whether ts_col is datetime object or not)
    
    Output:
    This function generates hour, hour_of_day, date, weekday, week_of_month, months, year
    '''
    if is_datetime:
        pass
    else:
        df[ts_col] = list(map(lambda x: datetime.fromisoformat(x),df[ts_col].replace(":60",":59", regex=True)))
        
    # Date
    df["date"] =  list(map(lambda day:day.date(), df[ts_col]))

    # Weekday
    wkday_dict = {1:"Mon",2:"Tue",3:"Wed",4:"Thu",5:"Fri",6:"Sat",7:"Sun"}
    df["weekday_index"] = list(map(lambda x: x.date().weekday()+1, df[ts_col]))
    df['weekday'] = list(map(lambda x: wkday_dict[x],df["weekday_index"]))

    # Week_of_month (assuming 4 weeks in a month)
    def get_week(day):
        if 1<= day.day < 8:
            return ("first_week",1)
        elif 8<= day.day < 15:
            return ("second_week",2)
        elif 15<= day.day < 22:
            return ("third_week",3)
        else:
            return ("last_week",4)
    df["week"] = list(map(lambda day: get_week(day)[0], df[ts_col]))
    df["week_index"] = list(map(lambda day: get_week(day)[1], df[ts_col]))

    # Month
    month_dict = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
    df["month_index"] = list(map(lambda day: day.month, df[ts_col]))
    df["month"] = list(map(lambda m: month_dict[m], df["month_index"]))

    # Year
    df["year"] = list(map(lambda x:x.year,df[ts_col]))
    return df

In [6]:
data.day = pd.to_datetime(data.day)
data = add_date_factors(data, "day",True)

In [7]:
# How many days in operation and total number of pick-ups each month
def get_opr_status(df,agg_frame,id_col,date_col,is_frame_time=True):
    '''
    Input:
      df(dataframe)
      add_frame(normally ingle string, but might be a list if it is a time factor, so we could sort the time)
      is_frame_time(bool)s

      id_col(string)
      date_col(string)
    '''
    if is_frame_time:
        opr = df.groupby(by = agg_frame).agg({id_col:"count",date_col:"nunique"}).sort_index()
    else:
        opr = df.groupby(by = agg_frame).agg({id_col:"count",date_col:"nunique"})
        opr.columns = ["num_obs","num_days"]
    return opr

In [8]:
get_opr_status(data, ["year","month_index","month"],"user_id","date",is_frame_time=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,user_id,date
year,month_index,month,Unnamed: 3_level_1,Unnamed: 4_level_1
2015,10,Oct,50574,29
2015,11,Nov,46767,27


# 2. Experiment

### Add the test identifier

In [9]:
data["is_test"] = 1
data.loc[data.month =="Oct","is_test"] = 0
data[data.date == datetime(2015,10,31)] = 1

### 2-1. Picking the evaluation metric

**North Star Metric-1:** Total revenue per day net of referral expenditure

In [10]:
data["revenue"] = data.money_spent
data.loc[data.is_referral == 1,"revenue"] = data.revenue - 10
temp1 = data.groupby(by = ["date","is_test"]).revenue.sum().reset_index()

In [11]:
def levene_t_test(data1, data2):
    # Step1: Check normal distribution assumption
    print("Data1: KS-Test",stats.kstest(data1, 'norm'))
    print("Data1: KS-Test",stats.kstest(data2, 'norm'),"\n")
    
    # Step2: Check equality of variance (not required in paired t-test), only required in independent t-test
    t,p = stats.levene(data1,data2)
    print(f"Levene variance test: t={round(t,3)}, p={round(p,3)}")
    if p<0.05:
        print("----variance of the two group doesn't equal, reject the null hypothesis----")
        t,p = stats.ttest_ind(data1,data2, equal_var = False)
    else:
        t,p = stats.ttest_ind(data1,data2)
    print(f"Independent t-test: t={round(t,3)}, p={round(p,3)}")

In [12]:
levene_t_test(temp1.loc[temp1.is_test == 1,"revenue"],
              temp1.loc[temp1.is_test == 0,"revenue"].sample(
                  len(temp1.loc[temp1.is_test == 0,"revenue"])))

Data1: KS-Test KstestResult(statistic=1.0, pvalue=0.0)
Data1: KS-Test KstestResult(statistic=1.0, pvalue=0.0) 

Levene variance test: t=0.154, p=0.696
Independent t-test: t=-0.227, p=0.821


**North Star Metric-2:** Revenue per customer per day net of referral expenditure

In [13]:
temp2 = data.groupby(by = ["date","is_test"]).agg({"revenue":"sum","user_id":"count"}).reset_index()
temp2["rev_per_order"] = temp2.revenue/temp2.user_id


In [14]:
levene_t_test(temp2.loc[temp2.is_test == 1,"rev_per_order"],
              temp2.loc[temp2.is_test == 0,"rev_per_order"].sample(
                  len(temp2.loc[temp2.is_test == 0,"rev_per_order"])))

Data1: KS-Test KstestResult(statistic=1.0, pvalue=0.0)
Data1: KS-Test KstestResult(statistic=1.0, pvalue=0.0) 

Levene variance test: t=0.235, p=0.629
Independent t-test: t=-5.934, p=0.0


Questions: are they price-sensitive customers? => How many orders does the referree bought

In [15]:
data[data.is_referral == 1].groupby("user_id").is_referral.count().sort_values(ascending = False).describe()

count    12715.000000
mean         2.203460
std          1.215192
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         10.000000
Name: is_referral, dtype: float64