In [99]:
import pandas as pd
import os
from datetime import datetime
from scipy.stats import norm, ttest_ind
import numpy as np
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [100]:
URL_BASE = 'https://raw.githubusercontent.com/pashkovsn/pashkov.sergei/refs/heads/main/Study/karpov.courses/lesson1/'


def read_database(file_name):
    """Load a CSV file located under ``URL_BASE`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file relative to ``URL_BASE``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for the resulting URL.
    """
    # URLs always use '/' as the separator. os.path.join would use the OS
    # path separator (a backslash on Windows) and would drop the base
    # entirely if file_name started with '/', so join explicitly instead.
    url = f"{URL_BASE.rstrip('/')}/{file_name.lstrip('/')}"
    return pd.read_csv(url)

In [101]:
# Load the sales table. Reading from a URL fails with urllib's
# HTTPError/URLError rather than FileNotFoundError; all of these are
# OSError subclasses, so catch the common base class.
try:
    df_sales = read_database('2022-04-01T12_df_sales.csv')
except OSError as e:
    print(f"File not found: {e}")
    # Re-raise: without the data df_sales is undefined, so continuing would
    # only produce a confusing NameError further down.
    raise

print(df_sales.head())

# Parse the timestamp column so it can be compared against datetime bounds.
df_sales['date'] = pd.to_datetime(df_sales['date'])

   sale_id                 date  count_pizza  count_drink  price user_id
0  1000001  2022-02-04 10:00:24            1            0    720  1c1543
1  1000002  2022-02-04 10:02:28            1            1    930  a9a6e8
2  1000003  2022-02-04 10:02:35            3            1   1980  23420a
3  1000004  2022-02-04 10:03:06            1            1    750  3e8ed5
4  1000005  2022-02-04 10:03:23            1            1    870  cbc468


In [102]:
# Total spend per user for sales dated from 2022-03-21 (inclusive)
# up to 2022-03-29 (exclusive); one row per user_id.
df = (
    df_sales
    .loc[lambda sales: (sales['date'] >= datetime(2022, 3, 21))
                       & (sales['date'] < datetime(2022, 3, 29))]
    .groupby('user_id')['price']
    .sum()
    .reset_index()
)
df.shape

(28718, 2)

In [103]:
# Replace any missing values in the aggregated table with 0.
df = df.fillna(0)

In [104]:
def check_ttest(a, b, alpha=0.05):
    """Student's t-test for two independent samples.

    Returns 1 if the difference between ``a`` and ``b`` is significant at
    level ``alpha`` (p-value strictly below ``alpha``), otherwise 0.
    """
    test_result = ttest_ind(a, b)
    if test_result.pvalue < alpha:
        return 1
    return 0

In [105]:
# Mean and standard deviation of per-user spend; mu is used by the
# simulations below to size the injected effect.
mu, std = df['price'].mean(), df['price'].std()
print(mu, std, sep='\n')

1222.656173828261
804.0468730096542


Добавление константы ко всем значениям

In [112]:
# Estimate the type II error rate when a constant (10% of the mean)
# is added to every value in group B.
alpha = 0.05
users = df['user_id'].unique()
sample_size = 1000  # users per group; loop-invariant, so set once

result_ab = []
for _ in tqdm(range(10000)):
    # Shuffle in place, then take two disjoint, equally sized groups.
    np.random.shuffle(users)
    # Group B starts at index sample_size. The previous sample_size+1
    # skipped one user and left group B with only 999 users.
    a, b = users[:sample_size], users[sample_size:sample_size*2]
    group_a = df[df['user_id'].isin(a)]['price']
    group_b = df[df['user_id'].isin(b)]['price']+mu*0.1
    result_ab.append(check_ttest(group_a, group_b, alpha))
# An effect was injected in every run, so each non-significant result
# (0) is a missed detection; their share is the type II error rate.
second_errors = (1 - np.mean(result_ab))
print(second_errors)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.08099999999999996


Умножение на константу

In [122]:
# Estimate the type II error rate when every value in group B
# is multiplied by a constant (1.1).
alpha = 0.05
users = df['user_id'].unique()
sample_size = 1000  # users per group; loop-invariant, so set once

result_ab = []
for _ in tqdm(range(10000)):
    # Shuffle in place, then take two disjoint, equally sized groups.
    np.random.shuffle(users)
    # Group B starts at index sample_size. The previous sample_size+1
    # skipped one user and left group B with only 999 users.
    a, b = users[:sample_size], users[sample_size:sample_size*2]
    group_a = df[df['user_id'].isin(a)]['price']
    group_b = df[df['user_id'].isin(b)]['price']*(1.1)
    result_ab.append(check_ttest(group_a, group_b, alpha))
# An effect was injected in every run, so each non-significant result
# (0) is a missed detection; their share is the type II error rate.
second_errors = (1 - np.mean(result_ab))
print(second_errors)

  0%|          | 0/10000 [00:00<?, ?it/s]

0.09819999999999995


Добавление константы к 2.5% значений

In [121]:
# Estimate the type II error rate when the effect is concentrated in
# 1/40 (2.5%) of group B: those users get 40 times the per-user constant.
alpha = 0.05
users = df['user_id'].unique()

result_ab = []
for _ in tqdm(range(10000)):
    np.random.shuffle(users)
    sample_size = 1000
    a = users[:sample_size]
    b = users[sample_size:sample_size*2]
    group_a = df.loc[df['user_id'].isin(a), 'price']
    group_b = df.loc[df['user_id'].isin(b), 'price']

    # Shift only the first 1/40 of group B's rows; leave the rest unchanged.
    n_affected = sample_size//40
    shifted_part = group_b.iloc[:n_affected]+mu*0.1*40
    untouched_part = group_b.iloc[n_affected:]
    group_b = pd.concat([shifted_part, untouched_part])

    result_ab.append(check_ttest(group_a, group_b, alpha))

# Share of runs in which the injected effect was not detected.
second_errors = 1 - np.mean(result_ab)
print(f'part errors = {second_errors:0.4f}')

  0%|          | 0/10000 [00:00<?, ?it/s]

part errors = 0.1483
