# Estatística inferencial

In [1]:
import pandas as pd
from scipy import stats
from statsmodels.datasets import get_rdataset

penguins: pd.DataFrame = get_rdataset("penguins", "palmerpenguins").data

penguins = penguins.infer_objects()

penguins.loc[:, penguins.dtypes == object] = penguins.select_dtypes(object).astype("category")

## Amostragem

In [2]:
# Amostragem aleatória simples

penguins.sample(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
259,Gentoo,Biscoe,53.4,15.8,219.0,5500.0,male,2009
240,Gentoo,Biscoe,47.5,14.0,212.0,4875.0,female,2009
222,Gentoo,Biscoe,47.7,15.0,216.0,4750.0,female,2008
80,Adelie,Torgersen,34.6,17.2,189.0,3200.0,female,2008
78,Adelie,Torgersen,36.2,16.1,187.0,3550.0,female,2008


In [3]:
# Amostragem aleatória estratificada

penguins.groupby("species").apply(lambda df: df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adelie,17,Adelie,Torgersen,42.5,20.7,197.0,4500.0,male,2007
Adelie,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
Chinstrap,313,Chinstrap,Dream,52.0,20.7,210.0,4800.0,male,2008
Chinstrap,299,Chinstrap,Dream,50.6,19.4,193.0,3800.0,male,2007
Gentoo,218,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,,2008
Gentoo,201,Gentoo,Biscoe,45.2,15.8,215.0,5300.0,male,2008


## Testes de hipóteses


In [4]:
# Teste t de Student com uma amostra

amostra = stats.norm.rvs(loc=16.808, scale=2.733, size=25)

stats.ttest_1samp(amostra, popmean=18, alternative="less")

Ttest_1sampResult(statistic=-2.968426036667662, pvalue=0.003344182489069449)

In [5]:
# Teste t de Student com duas amostras independentes

tempos1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

tempos2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(tempos1, tempos2, alternative="two-sided", equal_var=True)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.185777525386829e-14)

In [6]:
# Teste t de Welch

tempos1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

tempos2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(tempos1, tempos2, alternative="two-sided", equal_var=False)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.728921191025935e-14)

In [7]:
# Teste de independência qui-quadrado

tabela_contingencia = pd.DataFrame(
    {
        "avaliação baixa": [40, 16, 12],
        "avaliação média": [32, 24, 16],
        "avaliação alta": [24, 32, 4],
    },
    index=["Plano 1", "Plano 2", "Plano 3"],
)

stats.chi2_contingency(tabela_contingencia, correction=False)

(15.860566448801741,
 0.0032120846981537215,
 4,
 array([[32.64, 34.56, 28.8 ],
        [24.48, 25.92, 21.6 ],
        [10.88, 11.52,  9.6 ]]))