# Multiple Comparisons

## Import libraries

In [1]:
#!pip install scipy

In [2]:
#!pip install statsmodels

In [3]:
#!pip install scipy statsmodels matplotlib seaborn

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

In [5]:
import os

from clickhouse_driver import Client


# Read the credentials from the environment so real secrets never have to be
# saved inside the notebook. The placeholder literals are only fallbacks used
# when the variables are not set.
user_name = os.environ.get('CLICKHOUSE_USER', 'user-name')
pwd = os.environ.get('CLICKHOUSE_PASSWORD', 'password')

# Create the ClickHouse connection, with `hardda` as the default database
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# Check the connection by fetching a single row
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# Show the fetched row
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Enter the average ARPU per user for the entire dataset in the field below with one decimal place. Use a period to separate the integer and fractional parts.**

First, let's check the column names of the tables we have in our database.

In [6]:
# Schema lookup for the adverts table
query = (
    '\n'
    'DESCRIBE TABLE live_adverts\n'
)

In [7]:
# Run the DESCRIBE query; the bare `result` on the last line displays the
# returned rows as the cell output
result = client.execute(query)
result

[('execution_date', 'Date32', '', '', '', 'DoubleDelta', ''),
 ('advert_id', 'UInt64', '', '', '', 'T64', ''),
 ('created_at', 'DateTime', '', '', '', 'DoubleDelta', ''),
 ('price', 'UInt64', '', '', '', 'Gorilla', ''),
 ('region', 'LowCardinality(String)', '', '', '', '', ''),
 ('user_id', 'UInt32', '', '', '', 'T64', ''),
 ('platform', 'LowCardinality(String)', '', '', '', '', ''),
 ('auto_brand', 'LowCardinality(String)', '', '', '', '', ''),
 ('auto_model', 'LowCardinality(String)', '', '', '', '', ''),
 ('passport_id', 'UInt32', '', '', '', 'T64', ''),
 ('year', 'LowCardinality(String)', '', '', '', '', ''),
 ('userType', 'UInt8', '', '', '', 'T64', '')]

In [8]:
# Schema lookup for the transactions table
query = (
    '\n'
    'DESCRIBE TABLE user_transactions\n'
)

In [9]:
# Run the DESCRIBE query; the bare `result` on the last line displays the
# returned rows as the cell output
result = client.execute(query)
result

[('payment_date', 'Date', '', '', '', '', ''),
 ('id', 'Int32', '', '', '', '', ''),
 ('type', 'String', '', '', '', '', ''),
 ('sign', 'Int32', '', '', '', '', ''),
 ('amount', 'Int32', '', '', '', '', ''),
 ('tstamp', 'DateTime', '', '', '', '', ''),
 ('advert_id', 'Int32', '', '', '', '', ''),
 ('passport_id', 'Int32', '', '', '', '', ''),
 ('balance_consumption', 'Int32', '', '', '', '', '')]

Now that we know the structure of the tables, we can join the data to get the dataframe we need to solve the task.

In [10]:
# Per-passport spending for adverts with execution_date between 2022-01-01
# and 2022-02-28 (inclusive).
# - LEFT JOIN keeps every passport from live_adverts, with or without
#   matching transactions.
# - Transactions are matched on advert id and on the same calendar date.
#   The ids are compared as strings because the two advert_id columns have
#   different integer types (UInt64 vs Int32 in the DESCRIBE output).
# - `money` sums balance_consumption only for rows with sign = -1 and
#   amount < 0. The CASE has no ELSE branch, so passports without such rows
#   end up with a missing value (shown as NaN in the sampled output).
query = '''
SELECT
  CAST(la.passport_id AS String) AS passport_id,
  SUM(CASE WHEN ut.sign = -1 AND ut.amount < 0 THEN ut.balance_consumption END) AS money
FROM
  live_adverts la
LEFT JOIN
  user_transactions ut
    ON CAST(ut.advert_id AS String) = CAST(la.advert_id AS String)
      AND la.execution_date = ut.payment_date
WHERE
  la.execution_date BETWEEN '2022-01-01' AND '2022-02-28'
GROUP BY
  CAST(la.passport_id AS String)
'''

In [11]:
# Execute the spending query and keep the raw rows
result = client.execute(query)

In [12]:
# Wrap the raw query rows in a DataFrame with explicit column names
df = pd.DataFrame(
    data=result,
    columns=['passport_id', 'money'],
)

In [13]:
# Peek at five random rows
df.sample(5)

Unnamed: 0,passport_id,money
3370,136731253,
319540,123575787,0.0
88033,126660219,540.0
266168,135001961,
225917,127661697,


Let's replace the NaNs of clients with no spending with 0.

In [14]:
# Clients with no spending have a missing `money` value; count them as 0.
# Reassign instead of using inplace=True so the cell is idempotent and does
# not mutate the frame in place.
df = df.fillna(0)

In [15]:
# Number of rows in the spending frame
df.shape[0]

398500

In [16]:
# Number of distinct passport ids (compare with the row count to confirm
# there is one row per client)
df.passport_id.nunique()

398500

In [17]:
# Peek at five random rows after filling the missing values
df.sample(5)

Unnamed: 0,passport_id,money
180272,142792726,0.0
44749,143034838,185.0
330683,133415057,1340.0
144167,135840569,0.0
309571,133288259,0.0


In [18]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398500 entries, 0 to 398499
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   passport_id  398500 non-null  object 
 1   money        398500 non-null  float64
dtypes: float64(1), object(1)
memory usage: 6.1+ MB


Load the experiment group assignment data.

In [19]:
# Load the experiment group assignments; passport_id is read with the pandas
# string dtype so the ids are kept as text
groups_df = pd.read_csv(
    'user_groups.csv',
    dtype={'passport_id': 'string'},
)

In [20]:
# Number of distinct passport ids in the group assignment file
groups_df.passport_id.nunique()

398500

In [21]:
# Peek at five random group assignments
groups_df.sample(5)

Unnamed: 0.1,Unnamed: 0,passport_id,group
227558,227558,136707285,D
30809,30809,124206893,B
9559,9559,123634308,C
378753,378753,142992908,A
269960,269960,138547068,A


In [22]:
# Column dtypes and non-null counts
groups_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398500 entries, 0 to 398499
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   398500 non-null  int64 
 1   passport_id  398500 non-null  string
 2   group        398500 non-null  object
dtypes: int64(1), object(1), string(1)
memory usage: 9.1+ MB


Merge the client spending data with the experiment group assignment data.

In [23]:
# Attach each client's experiment group to their spending.
# validate='one_to_one' makes pandas raise MergeError if passport_id is
# duplicated on either side; without it, duplicates would silently multiply
# rows and bias the per-user averages computed from this frame.
data_df = pd.merge(
    df,
    groups_df[['passport_id', 'group']],
    how='inner',
    on='passport_id',
    validate='one_to_one',
)

In [24]:
# Display the merged frame (pandas truncates the rendering of large frames)
data_df

Unnamed: 0,passport_id,money,group
0,135655985,510.0,C
1,141737421,0.0,C
2,131999151,1650.0,A
3,134776715,175.0,A
4,129001315,0.0,D
...,...,...,...
398495,143024622,3971.0,D
398496,126984117,0.0,B
398497,138809936,330.0,D
398498,137173421,430.0,D


In [25]:
# Column dtypes and non-null counts of the merged frame
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398500 entries, 0 to 398499
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   passport_id  398500 non-null  object 
 1   money        398500 non-null  float64
 2   group        398500 non-null  object 
dtypes: float64(1), object(2)
memory usage: 12.2+ MB


Getting an answer to our question.

In [26]:
# Overall ARPU: mean `money` per client, rounded to one decimal place
print('ARPU: {}'.format(round(data_df['money'].mean(), 1)))

ARPU: 568.3


### Task 2.  

**Find ARPU in different groups.** 

In [27]:
# ARPU per experiment group, sorted from lowest to highest
(
    data_df
    .groupby('group', as_index=False)
    .agg({'money': 'mean'})
    .sort_values('money')
)

Unnamed: 0,group,money
1,B,531.225178
2,C,558.728036
3,D,590.977113
0,A,592.619862


### Task 3. 

**Check p-value using t-test for A-B, A-C, A-D, B-C, B-D, C-D.**

In [28]:
# One `money` Series per experiment group, in the order A, B, C, D
grp_a, grp_b, grp_c, grp_d = (
    data_df.loc[data_df['group'] == label, 'money']
    for label in ('A', 'B', 'C', 'D')
)

In [29]:
# Independent two-sample t-test, A vs B (scipy default settings)
ttest_ind(grp_a, grp_b) 

Ttest_indResult(statistic=2.6387293912277445, pvalue=0.008322386049800307)

In [30]:
# Independent two-sample t-test, A vs C (scipy default settings)
ttest_ind(grp_a, grp_c) 

Ttest_indResult(statistic=1.411934949345719, pvalue=0.1579706762765919)

In [31]:
# Independent two-sample t-test, A vs D (scipy default settings)
ttest_ind(grp_a, grp_d) 

Ttest_indResult(statistic=0.05802430431988292, pvalue=0.9537293274300052)

In [32]:
# Independent two-sample t-test, B vs C (scipy default settings)
ttest_ind(grp_b, grp_c) 

Ttest_indResult(statistic=-1.6808553238389767, pvalue=0.09279258286629669)

In [33]:
# Independent two-sample t-test, B vs D (scipy default settings)
ttest_ind(grp_b, grp_d) 

Ttest_indResult(statistic=-2.698887140811339, pvalue=0.006957763472281115)

In [34]:
# Independent two-sample t-test, C vs D (scipy default settings)
ttest_ind(grp_c, grp_d) 

Ttest_indResult(statistic=-1.4072129691661825, pvalue=0.15936580440774298)

### Task 4. 

**Apply Sidak multiple comparison correction.**

In [35]:
# p-values of the six pairwise t-tests, in the order
# A-B, A-C, A-D, B-C, B-D, C-D
pvals_lst = [
    ttest_ind(left, right).pvalue
    for left, right in (
        (grp_a, grp_b),
        (grp_a, grp_c),
        (grp_a, grp_d),
        (grp_b, grp_c),
        (grp_b, grp_d),
        (grp_c, grp_d),
    )
]

In [36]:
# multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf)
ans = multipletests(pvals_lst, alpha=0.05, method='sidak')

# Index 2 holds the Sidak-corrected per-test alpha
print(f'Sidak corrected alpha: {ans[2]}')

Sidak corrected alpha: 0.008512444610847103


### Task 5. 

**What will be the new Holm-Bonferroni corrected alphas?**

In [37]:
# Holm-Bonferroni step-down thresholds: the k-th smallest p-value
# (k = 1..n) is compared against alpha / (n - k + 1)
alpha = 0.05
sorted_p_vals = sorted(pvals_lst)

n = len(sorted_p_vals)
rank = range(1, n + 1)
alphas_hb = list(map(lambda r: alpha / (n - r + 1), rank))

alphas_hb

[0.008333333333333333, 0.01, 0.0125, 0.016666666666666666, 0.025, 0.05]

### Task 6. 

**Using simulation with the Holm-Bonferroni correction (FWER 0.05), calculate the sample size at which the power to detect every 20% relative effect in the pairs A-B, A-C, A-D is at least 0.8. Use group A as the base for the simulation.
The number of simulations per sample size is 200.
The increment step of the sample is 1000.
Use np.random.seed(1) before the first initialization.**

tbc..