# Estatística inferencial

In [1]:
import pandas as pd
from palmerpenguins import load_penguins
from scipy import stats

penguins: pd.DataFrame = load_penguins().dropna().reset_index(drop=True).infer_objects()

penguins.loc[:, penguins.dtypes == object] = penguins.select_dtypes(object).astype("category")

## Amostragem

In [2]:
# Amostragem aleatória simples

penguins.sample(5)

penguins.sample(frac=0.015)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
37,Adelie,Dream,36.0,18.5,186.0,3100.0,female,2007
112,Adelie,Torgersen,35.7,17.0,189.0,3350.0,female,2009
220,Gentoo,Biscoe,47.5,14.2,209.0,4600.0,female,2008
11,Adelie,Torgersen,38.7,19.0,195.0,3450.0,female,2007
208,Gentoo,Biscoe,54.3,15.7,231.0,5650.0,male,2008


In [3]:
# Amostragem aleatória estratificada

penguins.groupby("species").apply(lambda df: df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adelie,113,Adelie,Torgersen,41.1,18.6,189.0,3325.0,male,2009
Adelie,42,Adelie,Dream,36.0,17.9,190.0,3450.0,female,2007
Chinstrap,283,Chinstrap,Dream,46.4,18.6,190.0,3450.0,female,2007
Chinstrap,275,Chinstrap,Dream,46.6,17.8,193.0,3800.0,female,2007
Gentoo,230,Gentoo,Biscoe,43.4,14.4,218.0,4600.0,female,2009
Gentoo,146,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,female,2007


## Testes de hipóteses


In [4]:
# Teste t de Student com uma amostra

amostra = stats.norm.rvs(loc=16.808, scale=2.733, size=25)

stats.ttest_1samp(amostra, popmean=18, alternative="less")

Ttest_1sampResult(statistic=-3.9561637841552546, pvalue=0.000294293556272125)

In [5]:
# Teste t de Student com duas amostras independentes

tempos1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

tempos2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(tempos1, tempos2, alternative="two-sided", equal_var=True)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.185777525386829e-14)

In [6]:
# Teste t de Welch

tempos1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

tempos2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(tempos1, tempos2, alternative="two-sided", equal_var=False)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.728921191025935e-14)