In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sps
from datetime import timedelta

# Load the raw event log from a gzip-compressed CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook cannot be re-run
# on another machine as-is; consider a configurable data directory.
data = pd.read_csv("/home/nullkatar/Downloads/axNSsVJtVBJbNUgD.gzip", compression='gzip')
# `timestamp` is parsed as seconds since the Unix epoch and then shifted by +3 hours
# (presumably a UTC -> local-time conversion; TODO confirm which time zone is intended).
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s') + timedelta(hours=3)
# `date` is parsed from 'YYYY-MM-DD' strings into datetimes.
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')

# Split the log by the `group` column: rows without a group label, group 'A' and group 'B'.
# NOTE(review): the unlabeled rows are presumably the pre-experiment period; verify.
pre_group = data[data['group'].isna()]
A = data[data['group'] == 'A']
B = data[data['group'] == 'B']

In [2]:
def duration(k, delta_effect, sigma_1, sigma_2, alpha=0.05, beta=0.2):
    """Evaluate the sample-size formula for a two-sided, two-sample comparison.

    Computes ``(k + 1) * z**2 * (sigma_1**2 + sigma_2**2 / k) / delta_effect**2``
    where ``z`` is the sum of the standard-normal quantiles at ``1 - alpha/2``
    and ``1 - beta``.

    Parameters
    ----------
    k : float
        Ratio used to weight the second variance term (must be non-zero).
    delta_effect : float
        Absolute effect size to detect (must be non-zero).
    sigma_1, sigma_2 : float
        Standard deviations of the two samples.
    alpha : float, default 0.05
        Two-sided significance level.
    beta : float, default 0.2
        Type II error rate (1 - power).

    Returns
    -------
    float
        The (unrounded) value of the formula above.
    """
    z_alpha = sps.norm.ppf(1 - alpha/2)
    z_beta = sps.norm.ppf(1-beta)
    z_total = z_alpha + z_beta

    variance_term = sigma_1 ** 2 + sigma_2 ** 2 / k
    squared_effect = delta_effect ** 2

    return (k + 1) * z_total ** 2 * variance_term / squared_effect

def metric(data, end):
    """Total ``sum_payment`` per user over the user's first seven days.

    For every ``id_user`` the window opens at the user's earliest ``timestamp``
    and closes exactly seven days later (both ends inclusive). Only users whose
    window closes on a calendar day strictly before ``end`` are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with ``id_user``, ``timestamp`` and ``sum_payment`` columns.
    end : datetime-like
        Cut-off compared against the (midnight-normalised) window close.

    Returns
    -------
    list
        One payment total per retained user, ordered by ``id_user``.
    """
    # One row per user: when the observation window opens and closes.
    windows = data.groupby('id_user').timestamp.min().reset_index(name='min_timestamp')
    windows['max_timestamp'] = windows['min_timestamp'] + timedelta(days=7)

    events = data.merge(windows, on='id_user')

    # Event lies inside its user's window (inclusive on both sides).
    inside_window = events['timestamp'].between(events['min_timestamp'], events['max_timestamp'])
    # Window closed on a day strictly earlier than the cut-off.
    window_closed = pd.to_datetime(events['max_timestamp']).dt.normalize() < end

    kept = events[inside_window & window_closed]
    per_user_total = kept.groupby('id_user')['sum_payment'].sum()
    return per_user_total.tolist()

def filter_date(data, target_n, start, end):
    """Find the first day on which enough users have a completed 7-day window.

    A user's window opens at their earliest ``timestamp`` and closes seven days
    later; it counts as completed on ``date`` when the calendar day of the close
    is ``<= date``. Candidate days start at ``start + 7 days`` and advance one
    day at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with ``id_user`` and ``timestamp`` columns.
    target_n : int
        Required number of users with a completed window.
    start : datetime-like
        First day of the observation period.
    end : datetime-like
        Last admissible day; the result never exceeds it.

    Returns
    -------
    datetime-like
        The earliest candidate day before ``end`` on which at least
        ``target_n`` users have a completed window, otherwise ``end``.
    """
    # The per-user window close does not depend on the candidate day, so it is
    # computed once instead of on every loop iteration. (The original also built
    # a date-filtered copy of `data` on each iteration but never used it.)
    first_seen = data.groupby('id_user').timestamp.min()
    window_close_day = pd.to_datetime(first_seen + timedelta(days=7)).dt.normalize()

    date = start + timedelta(days=7)
    # `<` rather than `!=`: if `start + 7 days` is already past `end`, or the two
    # are not aligned to whole days, an equality check would never fire and the
    # loop would not terminate.
    while date < end:
        # `window_close_day` has exactly one entry per user, so counting the
        # True values counts users.
        if (window_close_day <= date).sum() >= target_n:
            return date
        date = date + timedelta(days=1)

    return end
        
def remove_outliers(data, threshold = 0.001 ):
    """Return a copy of ``data`` with extreme ``sum_payment`` values capped.

    Values above the ``1 - threshold`` quantile of ``sum_payment`` are replaced
    by that quantile. The input frame is left untouched; callers must use the
    returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``sum_payment`` column.
    threshold : float, default 0.001
        Tail probability; the cap is the ``1 - threshold`` quantile.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``data``.
    """
    upper_bound = data["sum_payment"].quantile(q=1 - threshold)

    # Work on a copy: the callers pass boolean-indexed slices of the raw frame,
    # and assigning into those in place is the chained-assignment pattern pandas
    # warns about; it also silently altered the caller's object.
    capped = data.copy()

    # NOTE(review): the previous implementation also selected values below the
    # `threshold` quantile, but applied min(x, upper_bound) to them, which left
    # them unchanged. That effective behaviour (upper tail only) is kept here;
    # confirm whether two-sided capping was intended.
    capped["sum_payment"] = capped["sum_payment"].clip(upper=upper_bound)

    return capped

In [3]:
# Boundaries of the analysis period.
start = pd.to_datetime('2023-05-01', format='%Y-%m-%d')
end = pd.to_datetime('2023-05-31', format='%Y-%m-%d')

# Baseline statistics are taken from the rows without a group label, after capping outliers.
# metric() keeps users whose 7-day window closed on a day strictly before `start`.
pre_group = remove_outliers(pre_group)
metric_out = metric(pre_group, start)
# The same standard deviation is used for both groups (np.std defaults to ddof=0).
sigma_1 = sigma_2 = np.std(metric_out)

# Effect to detect: 10% of the baseline mean of the metric.
mean_base = np.mean(metric_out)
effect = 0.1 * mean_base

# Ratio of the number of distinct users in A to the number in B.
k = len(np.unique(A['id_user'].values)) / len(np.unique(B['id_user'].values))

# Half of the value returned by duration(), truncated towards zero by int().
# NOTE(review): duration() carries a (k + 1) factor, so halving gives a per-group size only
# when k == 1; int() also rounds down rather than up. Confirm both are intended.
target_n = int(duration(k, effect, sigma_1, sigma_2)/2)

print(f"Target size: {target_n}")

Target size: 3601


In [4]:
# For each group, the first day on which `target_n` users have a completed 7-day window
# (filter_date returns `end` when that never happens within the period).
a_date = filter_date(A, target_n, start, end)
b_date = filter_date(B, target_n, start, end)

# The experiment runs until the later of the two per-group dates.
# NOTE(review): this overwrites the `end` defined in the sizing cell, so re-running this cell
# without re-running that one feeds the already-shortened `end` back into filter_date.
end = max(a_date, b_date)

print(f"End date: {end}")

End date: 2023-05-31 00:00:00


In [8]:
# Keep only the events up to each group's own stopping date.
# NOTE(review): A and B are cut at a_date and b_date separately, while the comparison below
# runs up to max(a_date, b_date); if the two dates differ the groups cover different periods.
A = A[A['date'] <= a_date]
B = B[B['date'] <= b_date]

# Cap extreme payments; the quantile is computed separately within each group.
# NOTE(review): A and B are reassigned from themselves, so re-running this cell caps the
# already-capped data again using a freshly computed quantile.
A = remove_outliers(A)
B = remove_outliers(B)

In [11]:
def _compare_groups(values_a, values_b):
    """Summarise one A/B comparison as a row of the results table.

    Parameters
    ----------
    values_a, values_b : list of float
        Per-user metric values of group A and group B.

    Returns
    -------
    dict
        ``metric_a`` / ``metric_b`` (group means), ``effect`` (mean of B minus
        mean of A), ``ttest`` (t statistic) and ``p_value``. A mean is NaN for an
        empty sample; the test fields are NaN unless both samples have at least
        two observations.
    """
    # Guard empty samples explicitly: np.mean([]) emits "Mean of empty slice"
    # RuntimeWarnings and returns NaN anyway.
    mean_a = np.mean(values_a) if len(values_a) else np.nan
    mean_b = np.mean(values_b) if len(values_b) else np.nan

    if len(values_a) >= 2 and len(values_b) >= 2:
        # Welch's t-test throughout. Choosing between the pooled and Welch variants
        # from an absolute variance difference (<= 0.1) depends on the unit of the
        # metric and amounts to pre-testing the variances.
        t_stat, p_value = sps.ttest_ind(values_a, values_b, equal_var=False)
    else:
        t_stat, p_value = np.nan, np.nan

    return {
        "metric_a": mean_a,
        "metric_b": mean_b,
        "effect": mean_b - mean_a,
        "ttest": t_stat,
        "p_value": p_value,
    }


# One row per day from the first day a 7-day window can have closed up to `end`.
# For each day the metric is recomputed on the data available up to that day.
# Rows are collected first and the frame is built once, which gives float columns
# instead of object columns, and the comprehension does not overwrite the
# notebook-level `effect` (the minimum detectable effect used for sizing).
analysis_dates = pd.date_range(start + timedelta(days=7), end)
final_df = pd.DataFrame(
    [
        _compare_groups(
            metric(A[A["date"] <= day], day),
            metric(B[B["date"] <= day], day),
        )
        for day in analysis_dates
    ],
    columns=["metric_a", "metric_b", "effect", "ttest", "p_value"],
    index=analysis_dates,
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [12]:
# Show the daily A/B comparison table: one row per analysis date with the group means,
# their difference (B minus A), the t statistic and the p-value.
final_df

Unnamed: 0,metric_a,metric_b,effect,ttest,p_value
2023-05-08,,,,,
2023-05-09,11.5,3.333333,-8.166667,1.339292,0.206675
2023-05-10,12.586207,10.483871,-2.102336,0.504056,0.616169
2023-05-11,12.730159,10.84375,-1.886409,0.631576,0.52884
2023-05-12,11.954128,11.826923,-0.127205,0.054293,0.956757
2023-05-13,11.295181,12.802326,1.507145,-0.78469,0.433229
2023-05-14,10.048387,13.008,2.959613,-1.948365,0.051989
2023-05-15,10.279279,12.006006,1.726727,-1.362038,0.173675
2023-05-16,10.024554,11.704805,1.680252,-1.547276,0.12217
2023-05-17,10.120287,11.395221,1.274933,-1.334673,0.182271


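p_value > 0.05 on the last analysed date (0.182 on 2023-05-17) => no statistically significant difference between the groups was detected, so the A setup is preferable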