In [1]:
import graphviz
import pandas as pd
from graphviz import Source
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
# Widen pandas' console rendering so the printed frames below are not truncated/wrapped:
# show up to 20 columns and allow 350 characters per line.
pd.options.display.max_columns = 20
pd.options.display.width = 350

In [2]:
# Load the A/B test export (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="randomization.csv")
# Preview the first five rows to check the columns that were read.
data.head(n=5)

Unnamed: 0,user_id,source,device,browser_language,browser,sex,age,country,test,conversion
0,1,SEO,Web,EN,Chrome,M,38,Chile,0,0
1,2,SEO,Mobile,ES,Android_App,M,27,Colombia,0,0
2,3,SEO,Mobile,ES,Iphone_App,M,18,Guatemala,1,0
3,5,Ads,Web,ES,Chrome,M,22,Argentina,1,0
4,8,Ads,Mobile,ES,Android_App,M,19,Venezuela,1,0


In [3]:
# For each traffic source, count the users in each experiment arm (test == 0 and test == 1).
# These are raw counts; they are turned into relative frequencies when printed.
data_grouped_source = data.groupby("source")["test"].agg(
    frequency_test_0=lambda arm: (arm == 0).sum(),
    frequency_test_1=lambda arm: (arm == 1).sum(),
)
  


In [4]:
# Divide each column by its own total so every column sums to 1:
# the share of each source within the test == 0 group and within the test == 1 group.
print(data_grouped_source.div(data_grouped_source.sum(), axis="columns"))

        frequency_test_0  frequency_test_1
source                                    
Ads             0.401228          0.400641
Direct          0.200949          0.199500
SEO             0.397823          0.399858


### Check A/B Test Randomization

In [5]:
#Randomization check: fit a tree that tries to predict the `test` flag from the user features.
#Any split the tree finds points at a variable whose distribution differs between the two groups.

#drop user_id, not needed. `data` is reassigned here, so errors='ignore' keeps the cell
#re-runnable (without it a second run raises KeyError because user_id is already gone)
data = data.drop(columns=['user_id'], errors='ignore')
#make dummy vars. Don't drop one level here, keep them all. You don't want to risk dropping the one level that actually creates problems with the randomization
data_dummy = pd.get_dummies(data)
#model features, test is the label and conversion is not needed here
train_cols = data_dummy.drop(['test', 'conversion'], axis=1)

tree=DecisionTreeClassifier(
    #re-weight the classes so both test groups carry the same total weight. It makes it easier to look at the tree output
    class_weight="balanced",
    #only split if it's worthwhile. The default value of 0 means always split no matter what if you can increase overall performance, which creates tons of noisy and irrelevant splits
    min_impurity_decrease = 0.001,
    #fix the seed: scikit-learn permutes the features at every split, so without it two runs can produce different trees
    random_state=42
    )
tree.fit(train_cols,data_dummy['test'])

#write the fitted tree to a .dot file, then render it and open it in the default viewer
export_graphviz(tree, out_file="tree_test.dot", feature_names=train_cols.columns, proportion=True, rotate=True)
s = Source.from_file("tree_test.dot")
s.view()

'tree_test.dot.pdf'

In [6]:
# Share of Argentina and Uruguay users inside each experiment arm:
# the mean of a 0/1 dummy column is the proportion of rows where it is set.
print(
    data_dummy[["test", "country_Argentina", "country_Uruguay"]]
    .groupby("test")
    .mean()
)

      country_Argentina  country_Uruguay
test                                    
0              0.050488         0.002239
1              0.173223         0.017236


In [7]:
#Compare conversion between test and control with a two-sample t-test that does not
#assume equal variances (equal_var=False), on the full data and again without Argentina/Uruguay.

#row masks, all built from data_dummy itself so the mask and the frame it indexes always share one index
is_test = data_dummy['test'] == 1
is_control = data_dummy['test'] == 0
#rows that belong to neither Argentina nor Uruguay
not_AR_UR = (data_dummy['country_Argentina'] == 0) & (data_dummy['country_Uruguay'] == 0)

#this is the test results using the original dataset
original_data = stats.ttest_ind(data_dummy.loc[is_test, 'conversion'],
                                data_dummy.loc[is_control, 'conversion'],
                                equal_var=False)

#this is after removing Argentina and Uruguay
data_no_AR_UR = stats.ttest_ind(data_dummy.loc[is_test & not_AR_UR, 'conversion'],
                                data_dummy.loc[is_control & not_AR_UR, 'conversion'],
                                equal_var=False)

#side-by-side summary of both tests
print(pd.DataFrame( {"data_type" : ["Full", "Removed_Argentina_Uruguay"],
                         "p_value" : [original_data.pvalue, data_no_AR_UR.pvalue],
                         "t_statistic" : [original_data.statistic, data_no_AR_UR.statistic]
                         }))



                   data_type       p_value  t_statistic
0                       Full  1.928918e-13    -7.353895
1  Removed_Argentina_Uruguay  7.200849e-01     0.358346
