In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw A/B-test export; the path is relative to the notebook's working directory.
csv_path = r'../data/cookie_cats.csv'
data = pd.read_csv(csv_path)

In [3]:
# Sanity-check the load: row count, column dtypes, non-null counts and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB


In [5]:
# One row per experiment version: number of users plus the totals of both
# retention flags and of the game rounds played.
metrics_summary = (
    data
    .groupby('version')
    .agg(
        userid=('userid', 'count'),
        retention_1=('retention_1', 'sum'),
        retention_7=('retention_7', 'sum'),
        sum_gamerounds=('sum_gamerounds', 'sum'),
    )
)

In [8]:
# Turn each per-version total into a per-user average by dividing by the
# number of users in that version.
users_per_version = metrics_summary['userid']
for total_col, avg_col in (
    ('retention_1', 'retention_avg_1'),
    ('retention_7', 'retention_avg_7'),
    ('sum_gamerounds', 'sum_gamerounds_avg'),
):
    metrics_summary[avg_col] = metrics_summary[total_col].div(users_per_version)
metrics_summary

Unnamed: 0_level_0,userid,retention_1,retention_7,sum_gamerounds,retention_avg_1,retention_avg_7,sum_gamerounds_avg
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gate_30,44700,20034,8502,2344795,0.448188,0.190201,52.456264
gate_40,45489,20119,8279,2333530,0.442283,0.182,51.298776


### retention_1 и retention_7

#### Проверка на достаточность данных для теста

In [31]:
from statsmodels.stats.power import NormalIndPower

# Size of the smaller experiment group; sufficiency is checked against it.
group_sample_size = min(metrics_summary.loc['gate_30', 'userid'], metrics_summary.loc['gate_40', 'userid'])

# Expected change in retention, as a fraction of the control-group rate (+10%).
EXPECTED_RELATIVE_UPLIFT = 0.10

p1_retention_1 = metrics_summary.loc['gate_30', 'retention_avg_1']    # control group, retention_1
p2_retention_1 = metrics_summary.loc['gate_40', 'retention_avg_1']    # test group, retention_1
# Absolute change in retention_1 that corresponds to the +10% relative uplift.
# Derived from the control rate instead of a hand-rounded literal so it stays
# in sync with the data.
effect_size_retention_1 = p1_retention_1 * EXPECTED_RELATIVE_UPLIFT

p1_retention_7 = metrics_summary.loc['gate_30', 'retention_avg_7']    # control group, retention_7
p2_retention_7 = metrics_summary.loc['gate_40', 'retention_avg_7']    # test group, retention_7
# Absolute change in retention_7 that corresponds to the +10% relative uplift.
effect_size_retention_7 = p1_retention_7 * EXPECTED_RELATIVE_UPLIFT

def sample_sufficiency_test(p1,p2,effect_size, group_sample_size):
    """Print whether each group is large enough to detect the expected uplift.

    The required per-group sample size is solved for a one-sided
    (``alternative='larger'``) two-sample z-test with power 0.8 and
    alpha 0.05, then compared with ``group_sample_size``.

    Parameters
    ----------
    p1 : float
        Baseline (control group) proportion, in [0, 1].
    p2 : float
        Observed test-group proportion. Kept for backward compatibility;
        it is not used in the calculation.
    effect_size : float
        Expected absolute increase of the proportion over ``p1``
        (e.g. 0.02 for a change from 0.19 to 0.21). Must be positive.
    group_sample_size : int
        Number of observations available in each group.

    Raises
    ------
    ValueError
        If ``effect_size`` is not positive, or if ``p1`` or
        ``p1 + effect_size`` is outside [0, 1].
    """
    target_rate = p1 + effect_size
    if effect_size <= 0:
        raise ValueError("effect_size must be a positive absolute difference in proportions")
    if not (0 <= p1 <= 1 and 0 <= target_rate <= 1):
        raise ValueError("p1 and p1 + effect_size must both lie in [0, 1]")

    # solve_power expects a *standardized* effect size. For two proportions
    # that is Cohen's h (difference of arcsine-transformed rates), not the raw
    # difference in proportions.
    cohens_h = 2 * (np.arcsin(np.sqrt(target_rate)) - np.arcsin(np.sqrt(p1)))

    power_analysis = NormalIndPower()
    sample_size = power_analysis.solve_power(effect_size=cohens_h, power=0.8, alpha=0.05, alternative='larger')

    print(f"Данных в каждой группе достаточно для теста - {group_sample_size>sample_size}")

##### retention_1

In [32]:
# Sample-size sufficiency check for retention_1.
sample_sufficiency_test(
    p1=p1_retention_1,
    p2=p2_retention_1,
    effect_size=effect_size_retention_1,
    group_sample_size=group_sample_size,
)

Данных в каждой группе достаточно для теста - True


##### retention_7

In [33]:
# Sample-size sufficiency check for retention_7.
sample_sufficiency_test(
    p1=p1_retention_7,
    p2=p2_retention_7,
    effect_size=effect_size_retention_7,
    group_sample_size=group_sample_size,
)

Данных в каждой группе достаточно для теста - True


#### Z-тест пропорций

##### retention_1

In [43]:
from statsmodels.stats.proportion import proportions_ztest

# Group order matters: gate_40 is listed first, so alternative='larger'
# tests H1 "retention_1 of gate_40 > retention_1 of gate_30".
group_order = ('gate_40', 'gate_30')
success = [metrics_summary.loc[group, 'retention_1'] for group in group_order]
nobs = [metrics_summary.loc[group, 'userid'] for group in group_order]

z_stat, p_value = proportions_ztest(success, nobs, alternative='larger')

# Report the statistic, the p-value and the decision at the 0.05 level.
print(
    f"Z-статистика: {z_stat:.3f}",
    f"P-value: {p_value:.4f}",
    f'Нулевая гипотеза отвергнута - {p_value < 0.05}',
    sep="\n",
)

Z-статистика: -1.784
P-value: 0.9628
Нулевая гипотеза отвергнута - False


##### retention_7

In [44]:
from statsmodels.stats.proportion import proportions_ztest

# Group order matters: gate_40 is listed first, so alternative='larger'
# tests H1 "retention_7 of gate_40 > retention_7 of gate_30".
group_order = ('gate_40', 'gate_30')
success = [metrics_summary.loc[group, 'retention_7'] for group in group_order]
nobs = [metrics_summary.loc[group, 'userid'] for group in group_order]

z_stat, p_value = proportions_ztest(success, nobs, alternative='larger')

# Report the statistic, the p-value and the decision at the 0.05 level.
print(
    f"Z-статистика: {z_stat:.3f}",
    f"P-value: {p_value:.4f}",
    f'Нулевая гипотеза отвергнута - {p_value < 0.05}',
    sep="\n",
)

Z-статистика: -3.164
P-value: 0.9992
Нулевая гипотеза отвергнута - False
