# MDE, Power, Effect Size

## Import libraries

In [1]:
#!pip install statsmodels

In [2]:
import pandas as pd
from statsmodels.stats.power import tt_ind_solve_power

In [3]:
import os

from clickhouse_driver import Client


# Credentials are read from the environment so that real secrets never have to
# be saved inside the notebook. The original placeholder values remain the
# fallback when the variables are not set.
user_name = os.environ.get('CLICKHOUSE_USER', 'user_name')
pwd = os.environ.get('CLICKHOUSE_PASSWORD', 'password')

# create the ClickHouse client
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# check the connection by fetching a single row
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# show the fetched row
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Prepare an SQL query for customers who paid in Moscow in the period from October 11 to 16, 2022 for the "premium" service. What is the length of the resulting table?**

In [4]:
# Task 1 query: one row per (type, passport_id, region) from user_transactions,
# joined to dp_live_adverts on advert_id (cast to UInt32) and on
# execution_date = payment_date.
# Filters: payment_date within 2022-10-11..2022-10-16 (BETWEEN includes both
# endpoints), type 'premium', sign = -1, non-zero amount, region 'Москва'.
# revenue is abs(sum(sign * amount)); because only sign = -1 rows are kept,
# abs() flips the negated sum back to a positive figure
# (assumes amount is stored as a positive number — TODO confirm).
# NOTE(review): region is unqualified; presumably it comes from
# dp_live_adverts, in which case the WHERE filter on it discards unmatched rows
# and the LEFT JOIN behaves like an INNER JOIN — confirm against the schema.
query = '''
SELECT
  ut.type,
  ut.passport_id,
  region,
  abs(sum(sign * amount)) AS revenue
FROM
  user_transactions AS ut
LEFT JOIN
  dp_live_adverts AS la ON CAST(la.advert_id, 'UInt32') = ut.advert_id AND la.execution_date = ut.payment_date
WHERE
  payment_date BETWEEN '2022-10-11' and '2022-10-16'
    AND type = 'premium'
      AND sign = -1
        AND amount != 0
          AND region = 'Москва'
GROUP BY
  ut.type,
  ut.passport_id,
  region
'''

In [5]:
# Run the Task 1 query; the returned rows are kept in `result` for the later cells.
result = client.execute(query)

In [6]:
# Number of rows returned by the query — the answer to Task 1.
print(len(result))

1053


### Task 2.  

**Find the average revenue for these users after it is increased by 8%.**

In [7]:
# Load the query rows into a DataFrame, giving the four selected columns
# readable labels.
column_names = ['service', 'user_id', 'region', 'revenue']
df = pd.DataFrame(data=result, columns=column_names)

df

Unnamed: 0,service,user_id,region,revenue
0,premium,133570315,Москва,2210
1,premium,125526618,Москва,1105
2,premium,128205675,Москва,2210
3,premium,123640858,Москва,1100
4,premium,135304607,Москва,1105
...,...,...,...,...
1048,premium,133595511,Москва,985
1049,premium,134870217,Москва,1305
1050,premium,139986262,Москва,1305
1051,premium,133461081,Москва,1105


In [8]:
# Mean revenue per user after applying the 8% uplift from the task statement.
df['revenue'].mean() * 1.08

1754.8769230769233

### Task 3.

**Find the value of Cohen's d (standardized effect size) for our experiment.**

In [9]:
# Cohen's d: the absolute change in mean revenue produced by an 8% uplift,
# expressed in units of the revenue standard deviation.
baseline_mean = df['revenue'].mean()
uplifted_mean = baseline_mean * 1.08
coh_d = (uplifted_mean - baseline_mean) / df['revenue'].std()

print(coh_d)

0.08142227460914778


### Task 4. 

**Using the calculated values, find the minimum sample size required to detect the effect with statistical significance (alpha=0.05, power=0.8).**

In [10]:
# nobs1 is left as None, so the solver returns it: the number of observations
# needed in the first group. With ratio=1 the second group has the same size.
sample_size = tt_ind_solve_power(
    effect_size=coh_d,
    nobs1=None,
    alpha=0.05,
    power=0.8,
    ratio=1,
    alternative="two-sided",
)

print('Required sample size =', round(sample_size))

Required sample size = 2369


### Task 5. 

**Find the minimum sample size required to detect a 6% effect with statistical significance, given alpha=0.05 and power=0.9.**

In [11]:
# Standardized effect size for a 6% uplift of the mean revenue.
baseline_mean = df['revenue'].mean()
coh_d_new = (baseline_mean * 1.06 - baseline_mean) / df['revenue'].std()

# Solve for nobs1 (passed as None) at the stricter power of 0.9; ratio=1 keeps
# both groups the same size.
sample_size = tt_ind_solve_power(
    effect_size=coh_d_new,
    nobs1=None,
    alpha=0.05,
    power=0.9,
    ratio=1,
    alternative="two-sided",
)

print('Required sample size =', round(sample_size))

Required sample size = 5636


### Task 6. 

**There is an unbalanced sample of 100,000 users, where the ratio between the groups is 80:20 (the innovation is applied to 20% of users and not to the other 80%: ratio = n2/n1 = 0.25). The test design assumes a 5% probability of a Type I error and a 20% probability of a Type II error. Determine the effect size that we can detect with this experiment design.**

In [12]:
# 100,000 users split 80:20: 80,000 in the first group and, with
# ratio = nobs2 / nobs1 = 0.25, 20,000 in the second.
# effect_size is passed as None, so it is the quantity being solved for.
effect_size = tt_ind_solve_power(effect_size=None,
                                 nobs1=80000,
                                 ratio=0.25,
                                 alpha=0.05,
                                 power=0.8,
                                 alternative="two-sided")

print(f'Effect size we can determine with a statistical significance: {effect_size:.3f}')

Effect size we can determine with a statistical significance: 0.022


### Task 7. 

**The maximum total size of the two samples available to you is 100,000 users. Determine at what ratio between the control and test groups (n2/n1) we can detect the smallest effect size (Cohen's d) at a significance level of alpha = 0.05 and a test power of 80%, using a t-test as the statistical criterion.**

In [13]:
# Candidate ratios n2 / n1 between the two groups, and the total number of
# users that can be split across them.
ratios = [0.8, 1.0, 1.2, 2.5, 3.0]
population = 100000
min_effect = float('inf')
# None (rather than False, which equals 0 in Python) marks "no ratio evaluated
# yet", so the sentinel cannot be mistaken for a numeric ratio.
best_ratio = None

for ratio in ratios:
    # n1 + n2 = population and n2 = ratio * n1  =>  n1 = population / (ratio + 1)
    sample_1 = population / (ratio + 1)
    # effect_size=None makes the effect size the quantity being solved for.
    effect_size = tt_ind_solve_power(
        effect_size=None,
        nobs1=sample_1,
        ratio=ratio,
        alpha=0.05,
        power=0.8,
        alternative="two-sided")
    # Keep the ratio with the smallest detectable effect seen so far.
    if effect_size < min_effect:
        min_effect = effect_size
        best_ratio = ratio

print(best_ratio, round(min_effect, 3))

1.0 0.018
