In [38]:
import datetime as dt

import bootstrapped.bootstrap as bs
import bootstrapped.compare_functions as bs_compare
import bootstrapped.stats_functions as bs_stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from termcolor import colored, cprint

In [2]:
# Load the experiment log; every later cell reads from `df`.
data_path = 'project_data.csv'
df = pd.read_csv(data_path)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,user,adid,week,expid,if_click,real_like_cnt,category,brand_effect,experience,status,user_age,friend_age,user_gender,friend_gender,user_city,friend_city,user_degree,friend_degree,user_sns_like_cnt,user_sns_comment_cnt
0,297353,0b0fa14b56d3741178196daaa92e6a1e,w_2,1,0,3,Car,1,1,1,48,37,female,male,level_1,level_1,495,436,162,138
1,98719,ea875acb76e0a806a7837174528f62d9,w_3,1,0,3,Car,1,1,1,29,34,female,female,level_3,level_2,479,3911,212,312
2,205144,1f1fe825014d9e9a0881233d9950bd43,w_2,1,0,1,Jewelry,0,1,1,24,24,female,male,level_2,level_1,682,470,95,336
3,559194,1f1fe825014d9e9a0881233d9950bd43,w_2,1,0,5,Jewelry,0,1,1,26,41,male,male,level_1,level_1,340,602,35,138
4,11888,0d82161270febc99f7defb653e339113,w_3,0,0,1,Cosmetrics,0,1,1,40,34,female,female,level_1,level_2,95,836,50,38


### 1. Statistical tests

#### 1.1 All data

In [4]:
# Click indicator for each experiment arm (expid == 0 vs expid == 1).
is_arm_0 = df['expid'] == 0
is_arm_1 = df['expid'] == 1
expid_0 = df[is_arm_0]['if_click']
expid_1 = df[is_arm_1]['if_click']

# Two-sample comparison object shared by the t-test and z-test cells below.
stats_arm_0 = sms.DescrStatsW(expid_0)
stats_arm_1 = sms.DescrStatsW(expid_1)
cm_expid = sms.CompareMeans(stats_arm_0, stats_arm_1)

In [5]:
# Two-sided t-test with pooled variance on the full data set.
# The printed tuple is whatever `ttest_ind` returns.
t_test_all = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
print(t_test_all)

(-15.430590532876387, 1.0340030648053255e-53, 999998.0)


In [6]:
# Two-sided z-test with pooled variance on the full data set.
# The printed tuple is whatever `ztest_ind` returns.
z_test_all = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')
print(z_test_all)

(-15.430590532876387, 1.0193323875329505e-53)


In [7]:
# bootstrap

# Seed NumPy's global generator so the resampled interval is the same on every
# Restart & Run All. NOTE(review): this presumes `bootstrapped` draws from the
# global np.random state -- confirm against the library source.
np.random.seed(0)

expid_0_numpy = expid_0.to_numpy()
expid_1_numpy = expid_1.to_numpy()

# Difference in mean click rate (expid == 1 minus expid == 0) together with
# the interval bootstrap_ab reports for alpha=0.05.
print(bs.bootstrap_ab(
    expid_1_numpy,
    expid_0_numpy,
    stat_func=bs_stats.mean,
    compare_func=bs_compare.difference,
    alpha=0.05))

0.007902000000000006    (0.006918000000000021, 0.008902050000000007)


#### 1.2 Across different categories

In [8]:
# Significance level: the loops below label a test 'different' when its
# p-value is strictly below this threshold.
p_threshold = 0.05

In [9]:
def bs_mean(expid_0, expid_1):
    """Report whether a bootstrap comparison finds the two group means different.

    Both arguments are converted with ``.to_numpy()`` and passed to
    ``bs.bootstrap_ab`` as (test=expid_1, control=expid_0), comparing means by
    their difference at ``alpha=0.05``.

    Parameters
    ----------
    expid_0 : object exposing ``.to_numpy()`` (the callers pass pandas Series)
        Observations of the expid == 0 group.
    expid_1 : object exposing ``.to_numpy()``
        Observations of the expid == 1 group.

    Returns
    -------
    bool
        True when the interval returned by ``bootstrap_ab`` lies entirely
        above or entirely below zero, False when it contains zero.
    """
    m = bs.bootstrap_ab(
        expid_1.to_numpy(),
        expid_0.to_numpy(),
        stat_func=bs_stats.mean,
        compare_func=bs_compare.difference,
        alpha=0.05)

    # Two-sided decision: an interval entirely below zero is just as much a
    # difference as one entirely above it. Checking only the lower bound would
    # report a significant negative effect as "not different", which disagrees
    # with the two-sided t/z tests this result is tabulated next to.
    return bool(m.lower_bound > 0 or m.upper_bound < 0)

In [10]:
# expid_0 = df[(df['expid'] == 0) & (df['category'] == 'Car')]['if_click']
# expid_1 = df[(df['expid'] == 1) & (df['category'] == 'Car')]['if_click']
# bs_mean(expid_0, expid_1)

##### Category

In [11]:
# Empty verdict table: one row per test, one column per product category.
test_rows = ['t-test', 'z-test', 'bootstrap']
category_columns = df['category'].unique()
difference_category = pd.DataFrame(index=test_rows, columns=category_columns)
difference_category

Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,,,,,
z-test,,,,,
bootstrap,,,,,


In [12]:
# Run the three tests within each product category and record the verdicts.
for category in df['category'].unique():
    in_category = df['category'] == category
    expid_0 = df[(df['expid'] == 0) & in_category]['if_click']
    expid_1 = df[(df['expid'] == 1) & in_category]['if_click']
    cm_expid = sms.CompareMeans(sms.DescrStatsW(expid_0), sms.DescrStatsW(expid_1))

    t_test = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
    z_test = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')

    # Element [1] of each result is compared against the significance level.
    difference_category.loc['t-test', category] = (
        'different' if t_test[1] < p_threshold else 'not different')
    difference_category.loc['z-test', category] = (
        'different' if z_test[1] < p_threshold else 'not different')
    difference_category.loc['bootstrap', category] = (
        'different' if bs_mean(expid_0, expid_1) else 'not different')

In [13]:
difference_category

Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,different,different,different,different,different
z-test,different,different,different,different,different
bootstrap,different,different,different,different,different


##### Status

In [14]:
# Inspect the second distinct value found in the `status` column.
status_values = df['status'].unique()
status_values[1]

0

In [15]:
# Empty verdict table for the status split: one row per test.
col = ['status goods', 'not status goods']
test_rows = ['t-test', 'z-test', 'bootstrap']
difference_status = pd.DataFrame(index=test_rows, columns=col)
difference_status

Unnamed: 0,status goods,not status goods
t-test,,
z-test,,
bootstrap,,


In [16]:
# Map each flag value to its column explicitly. Indexing the label list by the
# flag value would file status == 0 under 'status goods' (the first label),
# i.e. the two columns would be swapped.
# Assumes the usual encoding status == 1 -> status goods -- TODO confirm
# against the data dictionary.
status_labels = {1: 'status goods', 0: 'not status goods'}

for status in df['status'].unique():
    label = status_labels[int(status)]
    in_segment = df['status'] == status
    expid_0 = df[(df['expid'] == 0) & in_segment]['if_click']
    expid_1 = df[(df['expid'] == 1) & in_segment]['if_click']
    cm_expid = sms.CompareMeans(sms.DescrStatsW(expid_0), sms.DescrStatsW(expid_1))

    t_test = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
    z_test = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')

    # Element [1] of each result is compared against the significance level.
    difference_status.loc['t-test', label] = (
        'different' if t_test[1] < p_threshold else 'not different')
    difference_status.loc['z-test', label] = (
        'different' if z_test[1] < p_threshold else 'not different')
    difference_status.loc['bootstrap', label] = (
        'different' if bs_mean(expid_0, expid_1) else 'not different')

In [17]:
difference_status

Unnamed: 0,status goods,not status goods
t-test,different,different
z-test,different,different
bootstrap,different,different


##### Experience

In [18]:
# Empty verdict table for the experience split: one row per test.
col = ['experience goods', 'not experience goods']
test_rows = ['t-test', 'z-test', 'bootstrap']
difference_experience = pd.DataFrame(index=test_rows, columns=col)
difference_experience

Unnamed: 0,experience goods,not experience goods
t-test,,
z-test,,
bootstrap,,


In [19]:
# Map each flag value to its column explicitly. Indexing the label list by the
# flag value would file experience == 0 under 'experience goods' (the first
# label), i.e. the two columns would be swapped.
# Assumes the usual encoding experience == 1 -> experience goods -- TODO
# confirm against the data dictionary.
experience_labels = {1: 'experience goods', 0: 'not experience goods'}

for experience in df['experience'].unique():
    label = experience_labels[int(experience)]
    in_segment = df['experience'] == experience
    expid_0 = df[(df['expid'] == 0) & in_segment]['if_click']
    expid_1 = df[(df['expid'] == 1) & in_segment]['if_click']
    cm_expid = sms.CompareMeans(sms.DescrStatsW(expid_0), sms.DescrStatsW(expid_1))

    t_test = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
    z_test = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')

    # Element [1] of each result is compared against the significance level.
    difference_experience.loc['t-test', label] = (
        'different' if t_test[1] < p_threshold else 'not different')
    difference_experience.loc['z-test', label] = (
        'different' if z_test[1] < p_threshold else 'not different')
    difference_experience.loc['bootstrap', label] = (
        'different' if bs_mean(expid_0, expid_1) else 'not different')

In [20]:
difference_experience

Unnamed: 0,experience goods,not experience goods
t-test,different,different
z-test,different,different
bootstrap,different,different


##### Brand

In [21]:
# Empty verdict table for the brand split: one row per test.
col = ['brand', 'not brand']
test_rows = ['t-test', 'z-test', 'bootstrap']
difference_brand = pd.DataFrame(index=test_rows, columns=col)
difference_brand

Unnamed: 0,brand,not brand
t-test,,
z-test,,
bootstrap,,


In [22]:
# Map each flag value to its column explicitly. Indexing the label list by the
# flag value would file brand_effect == 0 under 'brand' (the first label),
# i.e. the two columns would be swapped.
# Assumes the usual encoding brand_effect == 1 -> brand -- TODO confirm
# against the data dictionary.
brand_labels = {1: 'brand', 0: 'not brand'}

for brand_effect in df['brand_effect'].unique():
    label = brand_labels[int(brand_effect)]
    in_segment = df['brand_effect'] == brand_effect
    expid_0 = df[(df['expid'] == 0) & in_segment]['if_click']
    expid_1 = df[(df['expid'] == 1) & in_segment]['if_click']
    cm_expid = sms.CompareMeans(sms.DescrStatsW(expid_0), sms.DescrStatsW(expid_1))

    t_test = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
    z_test = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')

    # Element [1] of each result is compared against the significance level.
    difference_brand.loc['t-test', label] = (
        'different' if t_test[1] < p_threshold else 'not different')
    difference_brand.loc['z-test', label] = (
        'different' if z_test[1] < p_threshold else 'not different')
    difference_brand.loc['bootstrap', label] = (
        'different' if bs_mean(expid_0, expid_1) else 'not different')

In [23]:
difference_brand

Unnamed: 0,brand,not brand
t-test,different,different
z-test,different,different
bootstrap,different,different


##### Category Interaction

In [24]:
# Empty verdict table reused for every (attribute, flag) slice below:
# one row per test, one column per product category.
test_rows = ['t-test', 'z-test', 'bootstrap']
category_columns = df['category'].unique()
difference = pd.DataFrame(index=test_rows, columns=category_columns)
difference

Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,,,,,
z-test,,,,,
bootstrap,,,,,


In [26]:
# For every attribute flag (status / experience / brand_effect, each 0 or 1),
# rerun the three tests within each product category and display one verdict
# table per (attribute, flag) slice. The same `difference` frame is reused;
# every cell is overwritten on each pass.
types = ['status', 'experience', 'brand_effect']

for type_ in types:
    for flag in range(2):
        print(type_, '==', flag)

        for category in df['category'].unique():
            in_segment = (df['category'] == category) & (df[type_] == flag)
            expid_0 = df[(df['expid'] == 0) & in_segment]['if_click']
            expid_1 = df[(df['expid'] == 1) & in_segment]['if_click']

            # `or`, not `|`: the bitwise operator binds tighter than `==`, so
            # `len(a)==0 | len(b)==0` is the chained comparison
            # `len(a) == (0 | len(b)) == 0`, true only when BOTH groups are
            # empty. A slice with a single empty group must be skipped too.
            if len(expid_0) == 0 or len(expid_1) == 0:
                difference.loc['t-test', category] = 'not available'
                difference.loc['z-test', category] = 'not available'
                difference.loc['bootstrap', category] = 'not available'
                continue

            cm_expid = sms.CompareMeans(sms.DescrStatsW(expid_0), sms.DescrStatsW(expid_1))

            t_test = cm_expid.ttest_ind(alternative='two-sided', usevar='pooled')
            z_test = cm_expid.ztest_ind(alternative='two-sided', usevar='pooled')

            # Element [1] of each result is compared against the significance level.
            difference.loc['t-test', category] = (
                'different' if t_test[1] < p_threshold else 'not different')
            difference.loc['z-test', category] = (
                'different' if z_test[1] < p_threshold else 'not different')
            difference.loc['bootstrap', category] = (
                'different' if bs_mean(expid_0, expid_1) else 'not different')

        display(difference)

status == 0


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,not available,not available,different,different,not available
z-test,not available,not available,different,different,not available
bootstrap,not available,not available,different,different,not available


status == 1


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,different,different,different,not available,different
z-test,different,different,different,not available,different
bootstrap,different,different,different,not available,different


experience == 0


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,different,not available,not available,not available,not available
z-test,different,not available,not available,not available,not available
bootstrap,different,not available,not available,not available,not available


experience == 1


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,different,different,different,different,different
z-test,different,different,different,different,different
bootstrap,different,different,different,different,different


brand_effect == 0


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,not different,different,different,different,not available
z-test,not different,different,different,different,not available
bootstrap,not different,different,different,different,not available


brand_effect == 1


Unnamed: 0,Car,Jewelry,Cosmetrics,Beverage,Clothes
t-test,different,not available,not available,not available,different
z-test,different,not available,not available,not available,different
bootstrap,different,not available,not available,not available,different


------

### **2. Lift Effect**

In [42]:
# Click indicator for each experiment arm.
d_0 = df[df['expid'] == 0]['if_click']
d_1 = df[df['expid'] == 1]['if_click']

# Total number of observations, taken from the data instead of a hard-coded
# constant so the analysis stays correct if the input file changes.
n = len(d_0) + len(d_1)

# Lift = mean click rate of expid == 1 relative to expid == 0.
m0 = np.mean(d_0)
m1 = np.mean(d_1)
lift = m1/m0
print(lift)

1.1186415230316498


In [43]:
# Sample variance (ddof=1) of the click indicator in each arm.
var0 = np.var(d_0, ddof=1)
var1 = np.var(d_1, ddof=1)

# Variance of each arm's mean: sample variance divided by that arm's actual
# size, rather than a hard-coded 500000 that is only right for an exactly
# even split of one million rows.
var_m0 = var0/len(d_0)
var_m1 = var1/len(d_1)

In [44]:
# Variance of the ratio m1/m0, built from two terms: one carrying the variance
# of the numerator mean, one carrying the variance of the denominator mean.
numerator_term = 1/(m0**2)*var_m1
denominator_term = (m1**2)/(m0**4)*var_m0
var_lift = numerator_term + denominator_term

# Standard error of the lift.
se_lift = np.sqrt(var_lift)
print(var_lift,se_lift)

6.61616051913808e-05 0.008133978435635344


In [45]:
# Interval of +/- 1.96 standard errors around the lift.
half_width = 1.96*se_lift
CI = [lift - half_width, lift + half_width]
print(CI)

[1.1026989252978046, 1.134584120765495]


In [46]:
# Under the null hypothesis both arms have the same click-through rate,
# so the lift equals 1.
lift_under_null = 1
t_stat = (lift-lift_under_null)/se_lift
print(t_stat)

14.585915609497517


In [47]:
# Two-sided p-value: Prob(|T| > |t_stat|) under a Student t distribution.
# Degrees of freedom come from the observed sample size (total rows minus
# one) rather than a hard-coded constant.
dof = len(d_0) + len(d_1) - 1
pval = stats.t.sf(np.abs(t_stat), dof)*2

# %.3g keeps a very small p-value visible instead of rounding it to 0.0000.
print('t-statistic = %6.3f pvalue = %.3g' % (t_stat, pval))

t-statistic = 14.586 pvalue = 0.0000
