# Пример анализа результатов A/B-теста

In [1]:
%matplotlib inline

import datetime
import re

import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load(tablename, ext='csv'):
    """Load a table from file
    """
    filename = tablename + '.' + ext
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        df = pd.read_csv('https://code.s3.yandex.net/datasets/' + filename)
    correct_column_names(df)  #  переводит в snake_case
    survey(df)  # даёт обзор возможных проблем с таблицей
    return df

In [3]:
def survey(data):
    """Display summaries to detect data problems
    """
    display(data.sample(5))
    print(f'Полностью совпадающих строк: {data.duplicated().sum()}\n')
    data.info()
    display(get_nans(data))
    print(get_filled_rows_share(data))

In [4]:
def get_nans(data):
    nans = []
    for column in data.columns:
        missing = data[column].isna().sum()  # Подсчет количества отсутствующих значений
        part = round(missing * 100 / len(data), 2) # Подсчет доли отсутствующих значений
        uniques = data[column].sort_values().nunique()
        nans.append([column, missing, part, uniques])

    return pd.DataFrame(
        data=nans, columns=['column', 'na_count', 'na_%', 'unique_count']
    ).sort_values(['na_count', 'unique_count'], ascending=False).set_index('column')

In [5]:
def get_filled_rows_share(df):
    share = 1 - df.isna().any(axis=1).sum() / len(df)
    return f'Заполненных строк без пропусков: {share:.02%}'

In [6]:
def correct_column_names(df):
    df.columns = df.columns.str.strip()
    # df.columns = df.columns.str.lower()
    # df.columns = df.columns.str.replace(' ', '_')

def to_snake_case(name):
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    name = re.sub('__([A-Z])', r'_\1', name)
    name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', name)
    return name.lower()

In [7]:
date_pattern = (
    '^(?:\\d{4})-(?:\\d{2})-(?:\\d{2})'  # date
    '[ T](?:\\d{2}):(?:\\d{2}):(?:\\d{2}(?:\\.\\d*)?)'  # time
    '(?:(?:-(?:\\d{2}):(?:\\d{2})|Z)?)$'  # timezone
)

print(re.match(date_pattern, '2021-11-04 11:11:11') is not None)

# visits['session_start'] = pd.to_datetime(visits['session_start'])
# visits['session_end'] = pd.to_datetime(visits['session_start'])

True


In [8]:
data = load('lesson_data_3-3')

Unnamed: 0,date,group,visitors,orders
22,1/23/2019,A,1026,11
23,1/24/2019,A,1042,16
45,1/15/2019,B,1040,14
46,1/16/2019,B,957,12
3,1/4/2019,A,1006,12


Полностью совпадающих строк: 0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      62 non-null     object
 1   group     62 non-null     object
 2   visitors  62 non-null     int64 
 3   orders    62 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 2.1+ KB


Unnamed: 0_level_0,na_count,na_%,unique_count
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
visitors,0,0.0,44
date,0,0.0,31
orders,0,0.0,7
group,0,0.0,2


Заполненных строк без пропусков: 100.00%


In [9]:
data_new = (
    data.groupby('group', as_index=False)
    .agg({'visitors': 'sum', 'orders': 'sum'})
)

In [10]:
data_new

Unnamed: 0,group,visitors,orders
0,A,30996,385
1,B,31105,419


In [11]:
data_new['orders_to_visitors_ratio'] = data['orders'] * 100 / data['visitors']

In [12]:
data_new

Unnamed: 0,group,visitors,orders,orders_to_visitors_ratio
0,A,30996,385,0.987167
1,B,31105,419,1.341589


Видим, что количестов заказов на пользователя во втором случае значимо больше, чем в первом.