In [2]:
# importando as libs 
import pandas as pd
import numpy as np

from scipy import stats
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the raw experiment data from data/ab_data.csv into a DataFrame.
ab_df = pd.read_csv("data/ab_data.csv")

### Exploração inicial dos dados

In [6]:
# Inspect the column names, dtypes and non-null counts of the raw data.
ab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [None]:
# 


#### Informações sobre os campos:

In [12]:
# Preview the first three rows to see what each column holds.
ab_df.head(3)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0


In [16]:
# Count rows that are exact duplicates of an earlier row (every column equal).
ab_df.duplicated().sum()

0

In [17]:
# Count missing values in each column.
ab_df.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [88]:
# Mean of the `converted` column, overall and per experiment group,
# each rounded to four decimal places.
conversion_rate_geral = ab_df['converted'].mean().round(4)
conversion_rate_control = (
    ab_df.loc[ab_df['group'] == 'control', 'converted'].mean().round(4)
)
conversion_rate_treatment = (
    ab_df.loc[ab_df['group'] == 'treatment', 'converted'].mean().round(4)
)

# Report the three rates side by side for a quick comparison.
print("Conversion Rate Geral: ", conversion_rate_geral)
print("Conversion Rate Controle: ", conversion_rate_control)
print("Conversion Rate Tratamento: ", conversion_rate_treatment)


Conversion Rate Geral:  0.1197
Conversion Rate Controle:  0.1204
Conversion Rate Tratamento:  0.1189


### Checando distribuição entre os grupos de tratamento e controle

In [5]:
# Check for repeated users: count rows whose user_id already appeared in an earlier row.
ab_df['user_id'].duplicated().sum()

3894

In [57]:
# Identify the user_ids that occur in more than one row of ab_df.
# Nothing is dropped here: the result is only the Index of repeated ids.
ids_duplicados = (
    ab_df['user_id']
    .value_counts()
    .sort_values()
    .loc[lambda contagem: contagem.values > 1]  # keep ids seen more than once
    .index
)

In [63]:
# Keep only the rows whose user_id is NOT in ids_duplicados, i.e. drop every
# row belonging to a user that appears more than once. The mask is negated
# with `~` rather than compared against False.
ab_df_uniques = ab_df[~ab_df["user_id"].isin(ids_duplicados)]

### Teste de distribuição dos buckets (SRM - Sample Ratio Mismatch)

In [65]:
# Number of rows in each experiment group of ab_df_uniques.
ab_df_uniques['group'].value_counts()

treatment    143397
control      143293
Name: group, dtype: int64

In [66]:
# Size of the control bucket: non-null user_id values among the control rows.
control_count = ab_df_uniques.loc[ab_df_uniques['group'] == 'control', 'user_id'].count()

In [67]:
# Size of the treatment bucket: non-null user_id values among the treatment rows.
treatment_count = ab_df_uniques.loc[ab_df_uniques['group'] == 'treatment', 'user_id'].count()

In [104]:
def multinomial_goodness_of_fit_test(control_count, treatment_count, treshold_p):
    """Check two bucket sizes for Sample Ratio Mismatch (SRM) and print a report.

    Runs a chi-square goodness-of-fit test of the observed bucket sizes
    against an expected 50/50 split of their total, then prints the
    difference, the ratio, the chi-square statistic, the p-value and a verdict.

    Parameters
    ----------
    control_count : int
        Number of units observed in the control bucket.
    treatment_count : int
        Number of units observed in the treatment bucket.
    treshold_p : float
        Significance threshold. A p-value strictly below it is reported as an
        imbalance between the buckets (the experiment is flagged as invalid).

    Returns
    -------
    None
        The results are only printed.
    """
    observed_counts = np.array([control_count, treatment_count])
    total_counts = observed_counts.sum()
    # Under the null hypothesis the traffic is split evenly between buckets.
    expected_counts = np.array([total_counts / 2, total_counts / 2])

    chi2, p_value = stats.chisquare(f_obs=observed_counts, f_exp=expected_counts)

    # The ratio is informational only; an empty treatment bucket must not
    # abort the report with a ZeroDivisionError.
    ratio = control_count / treatment_count if treatment_count else float("nan")

    print(f"Diff. Entre Buckets: {control_count - treatment_count}")
    print(f"Ratio: {ratio:.4f}")
    print(f"Chi-square: {chi2:.4f}")
    print(f"P-value: {p_value:.4f}")

    # A small p-value means the observed split is unlikely under 50/50.
    # Each message states the inequality that actually holds in its branch.
    if p_value < treshold_p:
        print(f"O teste é inválido por conta de um desbalanço entre as populações. Valor-p < {treshold_p}")
    else:
        print(f"O teste é válido: valor-p >= {treshold_p}.")


In [107]:
# Run the SRM check on the two bucket sizes with a 0.005 p-value threshold.
multinomial_goodness_of_fit_test(
    control_count=control_count,
    treatment_count=treatment_count,
    treshold_p=0.005,
)

Diff. Entre Buckets: -104
Ratio: 0.9993
Chi-square: 0.0377
P-value: 0.8460
O teste é válido: valor-p < 0.005.
