# Multiple Comparisons

## Import libraries

In [1]:
import pandas as pd

In [2]:
import os
from getpass import getpass

from clickhouse_driver import Client


# Credentials must not live in the notebook: read them from the environment
# and fall back to an interactive prompt when a variable is unset or empty.
# NOTE(review): the password that used to be hard-coded in this cell has been
# exposed and should be rotated.
user_name = os.environ.get('CLICKHOUSE_USER') or input('ClickHouse user: ')
pwd = os.environ.get('CLICKHOUSE_PASSWORD') or getpass('ClickHouse password: ')

# creating connection ClickHouse
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# checking connection
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# showing the result
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Enter the average ARPU per user for the entire dataset in the field below with one decimal place. Use a period to separate the integer and fractional parts.**

First, let's check the column names of the tables we have in our database.

In [3]:
# Schema inspection: list the columns and types of the live_adverts table.
query = '''
DESCRIBE TABLE live_adverts
'''

In [4]:
# Run the query held in `query` and show the returned rows as the cell output.
result = client.execute(query)
result

[('execution_date', 'Date32', '', '', '', 'DoubleDelta', ''),
 ('advert_id', 'UInt64', '', '', '', 'T64', ''),
 ('created_at', 'DateTime', '', '', '', 'DoubleDelta', ''),
 ('price', 'UInt64', '', '', '', 'Gorilla', ''),
 ('region', 'LowCardinality(String)', '', '', '', '', ''),
 ('user_id', 'UInt32', '', '', '', 'T64', ''),
 ('platform', 'LowCardinality(String)', '', '', '', '', ''),
 ('auto_brand', 'LowCardinality(String)', '', '', '', '', ''),
 ('auto_model', 'LowCardinality(String)', '', '', '', '', ''),
 ('passport_id', 'UInt32', '', '', '', 'T64', ''),
 ('year', 'LowCardinality(String)', '', '', '', '', ''),
 ('userType', 'UInt8', '', '', '', 'T64', '')]

In [5]:
# Schema inspection: list the columns and types of the user_transactions table.
query = '''
DESCRIBE TABLE user_transactions
'''

In [6]:
# Run the query held in `query` and show the returned rows as the cell output.
result = client.execute(query)
result

[('payment_date', 'Date', '', '', '', '', ''),
 ('id', 'Int32', '', '', '', '', ''),
 ('type', 'String', '', '', '', '', ''),
 ('sign', 'Int32', '', '', '', '', ''),
 ('amount', 'Int32', '', '', '', '', ''),
 ('tstamp', 'DateTime', '', '', '', '', ''),
 ('advert_id', 'Int32', '', '', '', '', ''),
 ('passport_id', 'Int32', '', '', '', '', ''),
 ('balance_consumption', 'Int32', '', '', '', '', '')]

Now that we know the structure of the tables, we can join the data to get the dataframe we need to solve the task.

In [7]:
# Spend per client: one row per passport_id taken from live_adverts rows with
# execution_date between 2022-01-01 and 2022-02-28, summing
# balance_consumption over joined transactions that have sign = -1 and a
# negative amount.
# - The join matches on advert_id (cast to String on both sides) and on
#   execution_date = payment_date.
# - LEFT JOIN keeps clients that have no matching transaction; the CASE has
#   no ELSE branch, so clients without a qualifying transaction come back
#   with a missing `money` value rather than 0.
query = '''
SELECT
  CAST(la.passport_id AS String) AS passport_id,
  SUM(CASE WHEN ut.sign = -1 AND ut.amount < 0 THEN ut.balance_consumption END) AS money
FROM
  live_adverts la
LEFT JOIN
  user_transactions ut
    ON CAST(ut.advert_id AS String) = CAST(la.advert_id AS String)
      AND la.execution_date = ut.payment_date
WHERE
  la.execution_date BETWEEN '2022-01-01' AND '2022-02-28'
GROUP BY
  CAST(la.passport_id AS String)
'''

In [8]:
# Execute the spend-per-client query; the rows are kept in `result` for the next cell.
result = client.execute(query)

In [9]:
# Wrap the returned rows in a DataFrame, naming the two selected columns.
df = pd.DataFrame(result, columns=['passport_id', 'money'])

In [10]:
# Preview five random rows; the fixed seed keeps the preview identical on every run.
df.sample(5, random_state=42)

Unnamed: 0,passport_id,money
359467,133533745,300.0
271567,138652538,
248774,125427304,175.0
342832,136610359,
2786,140540955,


Clients without any spending have NaN in `money`; let's replace those NaNs with 0.

In [11]:
# Treat a missing `money` value as zero spend.
# Assign the result instead of using `inplace=True`: in-place mutation brings
# no performance benefit and prevents method chaining.
df = df.fillna(0)

In [12]:
# Number of rows in the spend frame.
len(df)

398500

In [13]:
# Number of distinct clients; equal to the row count when each client appears once.
df['passport_id'].nunique()

398500

In [14]:
# Preview five random rows after the fill; the fixed seed keeps the preview identical on every run.
df.sample(5, random_state=42)

Unnamed: 0,passport_id,money
371629,143082644,0.0
46689,133807891,0.0
338929,138348788,175.0
290563,142416614,0.0
170722,140369663,0.0


In [15]:
# Summarise column dtypes and non-null counts of the spend frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398500 entries, 0 to 398499
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   passport_id  398500 non-null  object 
 1   money        398500 non-null  float64
dtypes: float64(1), object(1)
memory usage: 6.1+ MB


Load the experiment group assignment data.

In [16]:
# Read the group assignments; passport_id is loaded as a string so it can be
# joined with the String-cast passport_id returned by the SQL query.
groups_df = pd.read_csv('user_groups.csv', dtype={'passport_id': 'string'})

In [17]:
# Number of distinct clients in the group assignment file.
groups_df['passport_id'].nunique()

398500

In [18]:
# Preview five random assignments; the fixed seed keeps the preview identical on every run.
groups_df.sample(5, random_state=42)

Unnamed: 0.1,Unnamed: 0,passport_id,group
36291,36291,124526673,C
91713,91713,132173885,B
102957,102957,132672722,C
123092,123092,133194927,B
169896,169896,134473823,D


In [19]:
# Summarise column dtypes and non-null counts of the group assignment frame.
groups_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398500 entries, 0 to 398499
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   398500 non-null  int64 
 1   passport_id  398500 non-null  string
 2   group        398500 non-null  object
dtypes: int64(1), object(1), string(1)
memory usage: 9.1+ MB


Merge the clients' spending data with the experiment group assignment data.

In [20]:
# Attach each client's experiment group to their spend.
# `validate='one_to_one'` makes pandas raise a MergeError if passport_id is
# duplicated on either side; without it, duplicate keys would silently
# multiply rows and bias the ARPU computed from this frame.
data_df = pd.merge(
    df,
    groups_df[['passport_id', 'group']],
    how='inner',
    on='passport_id',
    validate='one_to_one',
)

In [21]:
# Display the merged frame (the rendered output is truncated to the first and last rows).
data_df

Unnamed: 0,passport_id,money,group
0,135655985,510.0,C
1,141737421,0.0,C
2,131999151,1650.0,A
3,134776715,175.0,A
4,129001315,0.0,D
...,...,...,...
398495,143024622,3971.0,D
398496,126984117,0.0,B
398497,138809936,330.0,D
398498,137173421,430.0,D


In [22]:
# Summarise column dtypes and non-null counts of the merged frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398500 entries, 0 to 398499
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   passport_id  398500 non-null  object 
 1   money        398500 non-null  float64
 2   group        398500 non-null  object 
dtypes: float64(1), object(2)
memory usage: 12.2+ MB


Getting an answer to our question.

In [23]:
# Mean spend per client over the merged frame, rounded to one decimal place.
print(f"ARPU: {round(data_df['money'].mean(), 1)}")

ARPU: 568.3


### Task 2.  

**TODO:** to be continued.