# Stratification

## Import libraries

In [1]:
#!pip install scipy statsmodels

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, norm
from statsmodels.stats.power import tt_ind_solve_power

## Importing data

In [3]:
df = pd.read_csv('stratification_homework_2.csv')

In [4]:
df

Unnamed: 0,ARPU,is_capital,is_pro,group
0,250.5,region,1,1
1,182.0,capital,0,1
2,75.0,region,0,0
3,532.5,capital,1,0
4,88.0,region,0,1
...,...,...,...,...
9695,109.0,region,0,1
9696,130.0,region,0,0
9697,3.0,region,0,1
9698,179.0,capital,0,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9700 entries, 0 to 9699
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ARPU        9700 non-null   float64
 1   is_capital  9700 non-null   object 
 2   is_pro      9700 non-null   int64  
 3   group       9700 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 303.2+ KB


In [6]:
df.group.value_counts()

1    4895
0    4805
Name: group, dtype: int64

In [7]:
df.is_pro.value_counts()

0    9197
1     503
Name: is_pro, dtype: int64

In [8]:
df.is_capital.value_counts()

region     6571
capital    3129
Name: is_capital, dtype: int64

## Tasks

### Task 1.

 **Do a basic t-test and assess the results.**

In [9]:
control_data = df[df['group'] == 0].ARPU
test_data = df[df['group'] == 1].ARPU

In [10]:
control_data.mean()

114.83007284079085

In [11]:
test_data.mean()

118.07221654749745

In [12]:
ttest_ind(control_data, test_data)

Ttest_indResult(statistic=-1.8461206634143887, pvalue=0.06490514155787332)

In [13]:
alpha = 0.05
pval = ttest_ind(control_data, test_data)[1]

if pval < alpha:
    print('We can reject H0!')
else:
    print('We are not able to reject H0.')

We are not able to reject H0.


### Task 2.  

**Calculate the value of the stratified variance for the stratum `is_capital`.** 

Firstly, we should create a function that will calculate a stratified variance. 

In [14]:
def calculate_strat_var(data, strata_name, gen_pop_weights, target_value):
    '''Calculates stratified variance.'''
    
    strat_vars = data.groupby(strata_name)[target_value].var()
    
    data_vars_weights = pd.merge(
        pd.Series(strat_vars, name='value_vars'),
        pd.Series(gen_pop_weights, name='weight'),
        how='inner',
        left_index=True,
        right_index=True)   
 
    var_strat = (data_vars_weights['weight'] * data_vars_weights['value_vars']).sum()
    
    return var_strat

Now we'll create a dictionary with weights of each stratum.

In [15]:
df.is_capital.value_counts(normalize=True)

region     0.677423
capital    0.322577
Name: is_capital, dtype: float64

In [16]:
is_capital_weights = dict()

is_capital_weights[df.is_capital.value_counts(normalize=True).index[0]] = df.is_capital.value_counts(normalize=True)[0]
is_capital_weights[df.is_capital.value_counts(normalize=True).index[1]] = df.is_capital.value_counts(normalize=True)[1]

is_capital_weights

{'region': 0.6774226804123711, 'capital': 0.32257731958762886}

And finally we will calculate the value of a stratified variance for `is_capital` feature. 

In [17]:
calculate_strat_var(df, 'is_capital', is_capital_weights, 'ARPU')

5789.948335065556

### Task 3. 

**Calculate the value of the stratified variance for the stratum `is_pro`.** 

In [18]:
df.is_pro.value_counts(normalize=True)

0    0.948144
1    0.051856
Name: is_pro, dtype: float64

In [19]:
is_pro_weights = dict()

is_pro_weights[df.is_pro.value_counts(normalize=True).index[0]] = df.is_pro.value_counts(normalize=True)[0]
is_pro_weights[df.is_pro.value_counts(normalize=True).index[1]] = df.is_pro.value_counts(normalize=True)[1]

is_pro_weights

{0: 0.9481443298969072, 1: 0.051855670103092784}

In [20]:
calculate_strat_var(df, 'is_pro', is_pro_weights, 'ARPU')

6193.935098081299

### Task 4. 

**Using the best stratum, apply poststratification to the data and recalculate the results of the t-test.**

In [21]:
def calculate_strat_mean(data, strata_name, gen_pop_weights, target_value):
    '''Calculates stratified mean.'''
    strats_means = data.groupby(strata_name)[target_value].mean()

    data_means_weights = pd.merge(
        pd.Series(strats_means, name='value_means'),
        pd.Series(gen_pop_weights, name='weight'),
        how='inner',
        left_index=True,
        right_index=True)

    mean_strat = (data_means_weights['weight'] * data_means_weights['value_means']).sum()
    
    return mean_strat

In [22]:
def get_strat_ttest(df_A, df_B, strata_name, target_value, gen_pop_weights):
    '''Performs t-test for stratified data.'''
    mean_strat_A =  calculate_strat_mean(df_A, strata_name, gen_pop_weights, target_value)
    mean_strat_B =  calculate_strat_mean(df_B, strata_name, gen_pop_weights, target_value)
    
    var_strat_A = calculate_strat_var(df_A, strata_name, gen_pop_weights, target_value)
    var_strat_B = calculate_strat_var(df_B, strata_name, gen_pop_weights, target_value)

    delta_mean_strat = mean_strat_A - mean_strat_B
    std_mean_strat = np.sqrt(var_strat_A / len(df_A) + var_strat_B / len(df_B))
    t_stat_strat = delta_mean_strat / std_mean_strat
    p_value = 2 * (1 - norm.cdf(np.abs(t_stat_strat)))
    
    inference = {'t_stat': t_stat_strat, 'p_value':p_value}
    
    return inference

In [23]:
control_df = df[df['group'] == 0]
test_df = df[df['group'] == 1]

In [24]:
get_strat_ttest(control_df, test_df, 'is_capital', 'ARPU', is_capital_weights)

{'t_stat': -2.357821319048786, 'p_value': 0.018382538725737696}

In [25]:
alpha = 0.05
pval = get_strat_ttest(control_df, test_df, 'is_capital', 'ARPU', is_capital_weights)['p_value']

if pval < alpha:
    print('We can reject H0!')
else:
    print('We are not able to reject H0.')

We can reject H0!


### Task 5. 

**Take the control group data (group = 0) as a basis and calculate the required sample size to find the absolute MDE equal to the observed mean difference between the test and control groups.
Experiment parameters: alpha = 0.05, beta = 0.2, two-tailed test, 50/50 split. What will be the group size for a simple t-test?.**

In [26]:
control_df.sample(5)

Unnamed: 0,ARPU,is_capital,is_pro,group
786,0.0,region,0,0
6560,89.0,region,0,0
7961,220.0,capital,0,0
5616,0.0,capital,0,0
1805,95.0,region,0,0


In [27]:
uplift = test_data.mean() - control_data.mean()
data_std = control_df.ARPU.std()

basic_cohen_D_effect_size = uplift / data_std

basic_cohen_D_effect_size

0.03716095483581302

In [28]:
basic_sample_size = int(tt_ind_solve_power(effect_size = basic_cohen_D_effect_size,
                                           alpha=0.05,
                                           power=0.8,
                                           nobs1=None,
                                           ratio=1)
                       )

basic_sample_size

11368

### Task 6. 

**What will be required sample size for stratified data?**

In [29]:
control_df.is_capital.value_counts(normalize=True)

region     0.67513
capital    0.32487
Name: is_capital, dtype: float64

In [30]:
is_capital_weights = dict()

is_capital_weights[control_df.is_capital.value_counts(normalize=True).index[0]] = control_df.is_capital.value_counts(normalize=True)[0]
is_capital_weights[control_df.is_capital.value_counts(normalize=True).index[1]] = control_df.is_capital.value_counts(normalize=True)[1]

is_capital_weights

{'region': 0.6751300728407909, 'capital': 0.32486992715920915}

In [31]:
uplift = test_data.mean() - control_data.mean()
data_std = calculate_strat_var(control_df, 'is_capital', is_capital_weights, 'ARPU') ** 0.5

strat_cohen_D_effect_size = uplift / data_std

strat_cohen_D_effect_size 

0.042577425346549014

In [32]:
strat_sample_size = int(tt_ind_solve_power(effect_size = strat_cohen_D_effect_size,
                                           alpha=0.05,
                                           power=0.8,
                                           nobs1=None,
                                           ratio=1)
                       )

strat_sample_size

8660