In [1]:
import pandas as pd
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source
import random
from tqdm import tqdm
from scipy import stats
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 350)

In [2]:
data = pd.read_csv("randomization.csv")
data.head()

Unnamed: 0,user_id,source,device,browser_language,browser,sex,age,country,test,conversion
0,1,SEO,Web,EN,Chrome,M,38,Chile,0,0
1,2,SEO,Mobile,ES,Android_App,M,27,Colombia,0,0
2,3,SEO,Mobile,ES,Iphone_App,M,18,Guatemala,1,0
3,5,Ads,Web,ES,Chrome,M,22,Argentina,1,0
4,8,Ads,Mobile,ES,Android_App,M,19,Venezuela,1,0


In [3]:
#drop user_id, not needed
data = data.drop(['user_id'], axis=1)
data_dummy = pd.get_dummies(data)
data_dummy.groupby("test")[["country_Argentina", "country_Uruguay"]].mean()

Unnamed: 0_level_0,country_Argentina,country_Uruguay
test,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.050488,0.002239
1,0.173223,0.017236


### Figure out how many new rows I need for test == 0

In [4]:
# how many users in each test group
data_dummy.groupby("test")[["country_Argentina", "country_Uruguay"]].sum()

Unnamed: 0_level_0,country_Argentina,country_Uruguay
test,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9356.0,415.0
1,37377.0,3719.0


In order to ensure the relative frequences between the two test groups balance, I need to add the following number of rows:

- Argentina: 37377 - 9356 = 28021
- Uruguay: 3719 - 415 = 3304

#### Argentina

In [5]:
req_ARG_users = 37377 - 9356
req_ARG_users

28021

#### Uruguay

In [6]:
req_URU_users = 3719 - 415
req_URU_users

3304

### Randomly select rows from the the original dataset

In [7]:
data_copy = data

In [8]:
data_copy.shape

(401085, 9)

#### Argentina

In [9]:
# create a list of index rows for users who are from ARG and in the control group
ARG_test_0_users = list(data[(data.country == 'Argentina') & (data.test == 0)].index)

In [10]:
# randomly select users from the ARG control group and add them to the dataset
for _ in tqdm(range(req_ARG_users)):
    idx = ARG_test_0_users[random.randint(0, len(ARG_test_0_users) - 1)]
    data_copy = pd.concat([data_copy, data.loc[[idx]]], ignore_index=True)

100%|██████████| 28021/28021 [26:27<00:00, 17.65it/s]


#### Uruguay

In [11]:
# create a list of index rows for users who are from URU and in the control group
URU_test_0_users = list(data[(data.country == 'Uruguay') & (data.test == 0)].index)

In [12]:
# randomly select users from the URU control group and add them to the dataset
for _ in tqdm(range(req_URU_users)):
    idx = URU_test_0_users[random.randint(0, len(URU_test_0_users) - 1)]
    data_copy = pd.concat([data_copy, data.loc[[idx]]], ignore_index=True)

100%|██████████| 3304/3304 [03:24<00:00, 16.17it/s]


#### Check class weights

In [13]:
data_dummy = pd.get_dummies(data_copy)
data_dummy.groupby("test")[["country_Argentina", "country_Uruguay"]].mean()

Unnamed: 0_level_0,country_Argentina,country_Uruguay
test,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.172534,0.017167
1,0.173223,0.017236


In [15]:
# generate t-test results
results = stats.ttest_ind(data_dummy.loc[data_copy['test'] == 1]['conversion'], 
                                data_dummy.loc[data_copy['test'] == 0]['conversion'], 
                                equal_var=False)

In [16]:
results.pvalue

0.961855629002422

In [17]:
results.statistic

0.047825132144838044

The results are not significant as the p-value is greater than 0.05.