# Estatística inferencial

In [1]:
import pandas as pd
from scipy import stats
from statsmodels.datasets import get_rdataset

penguins: pd.DataFrame = get_rdataset("penguins", "palmerpenguins").data

penguins = penguins.infer_objects()

penguins.loc[:, penguins.dtypes == object] = penguins.select_dtypes(object).astype("category")

## Amostragem

In [2]:
# Amostragem aleatória simples

penguins.sample(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
134,Adelie,Dream,38.1,17.6,187.0,3425.0,female,2009
297,Chinstrap,Dream,48.5,17.5,191.0,3400.0,male,2007
334,Chinstrap,Dream,50.2,18.8,202.0,3800.0,male,2009
131,Adelie,Torgersen,43.1,19.2,197.0,3500.0,male,2009
217,Gentoo,Biscoe,49.8,16.8,230.0,5700.0,male,2008


In [3]:
# Amostragem aleatória estratificada

penguins.groupby("species").apply(lambda df: df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adelie,75,Adelie,Torgersen,42.8,18.5,195.0,4250.0,male,2008
Adelie,104,Adelie,Biscoe,37.9,18.6,193.0,2925.0,female,2009
Chinstrap,309,Chinstrap,Dream,51.0,18.8,203.0,4100.0,male,2008
Chinstrap,299,Chinstrap,Dream,50.6,19.4,193.0,3800.0,male,2007
Gentoo,163,Gentoo,Biscoe,49.0,16.1,216.0,5550.0,male,2007
Gentoo,246,Gentoo,Biscoe,44.5,14.7,214.0,4850.0,female,2009


## Testes de hipóteses


In [4]:
# Teste t de Student com uma amostra

sample = stats.norm.rvs(loc=16.808, scale=2.733, size=25)

stats.ttest_1samp(sample, popmean=18, alternative="less")

Ttest_1sampResult(statistic=-0.9891607375342513, pvalue=0.16622644779120382)

In [5]:
# Teste t de Student com duas amostras independentes

sample1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

sample2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(sample1, sample2, alternative="two-sided", equal_var=True)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.185777525386829e-14)

In [6]:
# Teste t de Welch

sample1 = [22.8, 23.4, 26.2, 24.3, 22.0, 24.8, 26.7, 25.1, 23.1, 22.8, 25.6, 25.1, 24.3, 24.2, 22.8, 23.2, 24.7, 26.5, 24.5, 23.6, 23.9, 22.8, 25.4, 26.7, 22.9, 23.5, 23.8, 24.6, 26.3, 22.7]

sample2 = [26.8, 29.3, 28.4, 25.6, 29.4, 27.2, 27.6, 26.8, 25.4, 28.6, 29.7, 27.2, 27.9, 28.4, 26.0, 26.8, 27.5, 28.5, 27.3, 29.1, 29.2, 25.7, 28.4, 28.6, 27.9, 27.4, 26.7, 26.8, 25.6, 26.1]

stats.ttest_ind(sample1, sample2, alternative="two-sided", equal_var=False)

Ttest_indResult(statistic=-9.70841463275532, pvalue=9.728921191025935e-14)

In [7]:
# Teste de independência qui-quadrado

contingency_table = pd.DataFrame(
    {
        "avaliação baixa": [40, 16, 12],
        "avaliação média": [32, 24, 16],
        "avaliação alta": [24, 32, 4],
    },
    index=["Plano 1", "Plano 2", "Plano 3"],
)

test_statistic_value, p_value, degrees_of_freedom, expected_frequencies = stats.chi2_contingency(contingency_table, correction=False)

print(f"valor da estatística de teste: {test_statistic_value}")

print(f"p-value: {p_value}")

print(f"graus de liberdade da distribuição qui-quadrado: {degrees_of_freedom}")

print(f"frequências esperadas:\n{expected_frequencies}")

valor da estatística de teste: 15.860566448801741
p-value: 0.0032120846981537215
graus de liberdade da distribuição qui-quadrado: 4
frequências esperadas:
[[32.64 34.56 28.8 ]
 [24.48 25.92 21.6 ]
 [10.88 11.52  9.6 ]]
