### The data was provided by Karpov courses platform (https://karpov.courses/)
#### 

In [1]:
import os
from scipy import stats 
import pandas as pd
import numpy as np

In [2]:
URL_BASE = 'https://raw.githubusercontent.com/ab-courses/simulator-ab-datasets/main/2022-04-01/'

def read_data(file_name):
    return pd.read_csv(os.path.join(URL_BASE, file_name))

In [3]:
df_sales = read_data('2022-04-01T12_df_sales.csv')

In [4]:
df_sales.sample(3)

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
180777,1180778,2022-03-25 17:20:43,1,1,810,3aa9bb
107618,1107619,2022-03-05 15:19:06,4,0,2460,98930d
12768,1012769,2022-02-07 12:50:03,7,3,5610,b1c5a8


In [5]:
df_sales.dtypes

sale_id         int64
date           object
count_pizza     int64
count_drink     int64
price           int64
user_id        object
dtype: object

In [6]:
df_sales['date'] = pd.to_datetime(df_sales['date'])

In [7]:
users = read_data('experiment_users.csv')

In [8]:
users.sample(3)

Unnamed: 0,user_id,pilot
1078,d4dff0,0
22355,12a3f7,1
18657,df7d01,1


### Experiment was conducted from 16-03-2022 to 23-03-2022
#### Metric to track is the sales per user

In [9]:
# Let's filter the data and define the metric - sales per user
EXPERIMENT_START = pd.to_datetime('2022-03-16')
EXPERIMENT_END = pd.to_datetime('2022-03-23')
sales = (df_sales[(df_sales['date'] >= EXPERIMENT_START) & (df_sales['date'] < EXPERIMENT_END)]
         [['user_id', 'price']]
         .groupby('user_id', as_index=False)
         .sum())

In [10]:
# Let's join and filter the data
df = users.merge(sales, on='user_id', how='left')

In [11]:
df.pilot.value_counts(normalize=True)

0    0.504393
1    0.495607
Name: pilot, dtype: float64

test and control groups are equal

In [12]:
# users in experiment might not have bought anything, so the metric for them should be 0
df['price'] = df['price'].fillna(0)

In [13]:
# Let's split the dataframe by two groups
control_group = df[df['pilot']==0].copy()['price']
experiment_group = df[df['pilot']==1].copy()['price']

In [14]:
stats.ttest_ind(control_group, experiment_group)

Ttest_indResult(statistic=-1.2837567415000515, pvalue=0.19923983306424942)

By the results of ttest, we can't reject the H0

###### There are a lot of zeros in the data. This means that most of the users who made a purchase during the experiment did not make purchases in the week before the experiment. Let's estimate the average time between purchases.

In [15]:
frequent_buyers = df_sales[['user_id', 'sale_id']].groupby('user_id', as_index=False).nunique()
frequent_buyers = frequent_buyers[frequent_buyers['sale_id']>1]['user_id'].unique()

In [16]:
users_frequent = (df_sales[df_sales['user_id'].isin(frequent_buyers)]
                  [['user_id', 'date']]
                  .sort_values(['user_id', 'date'])
                  .copy())

In [17]:
users_frequent['dates_diff'] = users_frequent['date'].diff()
mask = users_frequent.user_id != users_frequent.user_id.shift(1)
users_frequent['dates_diff'][mask] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_frequent['dates_diff'][mask] = np.nan


In [18]:
users_frequent.set_index('user_id', inplace=True)
users_frequent['dates_diff'].mean()

Timedelta('17 days 07:37:57.653431857')

The average time between the purchases is 17 days for the customers, but the experiment ran only for one week