# A/B Testing Course

## Lesson 2. Hypothesis Testing

### Homework

#### Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from scipy.stats import ttest_ind

#### Import Data

In [3]:
URL_BASE = 'https://raw.githubusercontent.com/ab-courses/simulator-ab-datasets/main/2022-04-01/'

def read_database(file_name):
    """Load one CSV dataset from the remote course repository.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, relative to ``URL_BASE``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    # Build the URL with explicit '/' joining instead of os.path.join:
    # os.path.join is a filesystem helper (OS-dependent separator, and a
    # leading '/' in file_name would silently discard the base URL).
    url = f"{URL_BASE.rstrip('/')}/{file_name.lstrip('/')}"
    return pd.read_csv(url)

In [4]:
# fetch both tables: user-to-group assignment and the order log
df_users = read_database('experiment_users.csv')
df_sales = read_database('2022-04-01T12_df_sales.csv')

# parse order timestamps into datetime so they can be compared against dates later
df_sales = df_sales.assign(date=pd.to_datetime(df_sales['date']))

#### Data Description

df_sales - information about purchases, one row represents one order:
- sale_id - purchase identifier;
- date - date of purchase;
- count_pizza - number of pizzas in the order;
- count_drink - number of drinks in the order;
- price - order price;
- user_id - user identifier.   

df_users - information about experiment groups:
- user_id - user identifier;
- pilot:
    - 0 --> control group;
    - 1 --> test group.

#### Checking Data

In [5]:
# preview the first five orders
df_sales.head(n=5)

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a
3,1000004,2022-02-04 10:03:06,1,1,750,3e8ed5
4,1000005,2022-02-04 10:03:23,1,1,870,cbc468


In [6]:
# column dtypes and non-null counts of the sales table
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203847 entries, 0 to 203846
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   sale_id      203847 non-null  int64         
 1   date         203847 non-null  datetime64[ns]
 2   count_pizza  203847 non-null  int64         
 3   count_drink  203847 non-null  int64         
 4   price        203847 non-null  int64         
 5   user_id      203847 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 9.3+ MB


In [7]:
# preview the first five user-to-group assignments
df_users.head(n=5)

Unnamed: 0,user_id,pilot
0,0ffc65,0
1,b962b9,0
2,7ea63f,0
3,7f9a61,0
4,459e55,0


In [8]:
# column dtypes and non-null counts of the group-assignment table
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23333 entries, 0 to 23332
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  23333 non-null  object
 1   pilot    23333 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 364.7+ KB


In [9]:
# number of users assigned to the control group (pilot == 0)
n_control_users = (df_users['pilot'] == 0).sum()
print(f'Users in control group: {n_control_users}')

Users in control group: 11769


In [10]:
# number of users assigned to the test group (pilot == 1)
n_test_users = (df_users['pilot'] == 1).sum()
print(f'Users in test group: {n_test_users}')

Users in test group: 11564


#### Task 1. 

Test the significance of the difference in average revenue per user between the control and test groups in the week before the experiment (from 2022.03.16 to 2022.03.23, end date not included).
Enter the p-value rounded to 3 decimal places as the answer.

In [11]:
# keep only orders from the pre-experiment week:
# 2022-03-16 inclusive up to 2022-03-23 exclusive
week_start = pd.Timestamp('2022-03-16')
week_end = pd.Timestamp('2022-03-23')
in_week = (df_sales['date'] >= week_start) & (df_sales['date'] < week_end)
sales_filt = df_sales[in_week]

In [12]:
# attach the filtered orders to the group assignment; the left join keeps
# every user from df_users, including those with no orders in the period
sales_with_groups = df_users.merge(sales_filt, how='left', on='user_id')

In [13]:
# users without purchases have NaN in the order columns after the left join;
# zero out only the numeric order measures, so that `date` keeps its datetime
# dtype and `sale_id` is not overwritten with a fake identifier 0
# (reassigning instead of inplace=True keeps the cell safe to re-run)
sales_with_groups = sales_with_groups.fillna(
    {'price': 0, 'count_pizza': 0, 'count_drink': 0}
)

In [14]:
# total revenue per user over the period.
# `pilot` is aggregated with 'max' so it stays the 0/1 group flag; summing it
# would turn it into the number of merged rows for test-group users.
sales_by_user = (
    sales_with_groups
    .groupby('user_id', as_index=False)
    .agg({'price': 'sum', 'pilot': 'max'})
    .rename(columns={'price': 'revenue'})
)

In [15]:
# per-user revenue samples for each experiment group
# (pilot == 0 is control, any positive pilot value is test)
is_control = sales_by_user['pilot'] == 0
is_test = sales_by_user['pilot'] > 0
control_data = sales_by_user.loc[is_control, 'revenue']
test_data = sales_by_user.loc[is_test, 'revenue']

In [16]:
# sanity check: number of users in each revenue sample
print(f'Users in control group: {len(control_data)}')
print(f'Users in test group: {len(test_data)}')

Users in control group: 11769
Users in test group: 11564


In [17]:
# two-sample t-test on per-user revenue; only the p-value is reported
ttest_result = ttest_ind(control_data, test_data)
p_val = ttest_result.pvalue

print(f'p_value: {round(p_val, 3):.3f}')

p_value: 0.199


#### Task 2. 

In [None]:
# TODO: Task 2 — to be continued