In [8]:
import pandas as pd
from scipy import stats
# %pip install statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd
url = "https://raw.githubusercontent.com/dataprofessor/data/refs/heads/master/penguins_cleaned.csv"


In [9]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


In [10]:
males = df.loc[df['sex']=='male', 'body_mass_g']
females = df.loc[df['sex'] == 'female', 'body_mass_g']

# t_res = stats.ttest_ind(males,females)
# t_res
# df = t_res.df
# t, p = t_res
# print(t,p,df)

t,p = stats.ttest_ind(males,females)
deg_f = len(males) + len(females) -2

print(t,p,deg_f)


8.541720337994516 4.897246751596224e-16 331


In [11]:
print(f"t={t:.4f}, p={p:.4f}, deg_f={deg_f}")

t=8.5417, p=0.0000, deg_f=331


In [12]:
alpha = 0.05
crit_t = stats.t.ppf(1-alpha/2, deg_f)
print(f"t={t:.4f}, crit_t={crit_t:.4f}")

t=8.5417, crit_t=1.9672


In [13]:
mean_diff = males.mean() - females.mean()
mean_diff 

np.float64(683.4117965367964)

In [14]:
df['species'].unique()

array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)

In [15]:
adelie = df.loc[df['species']=="Adelie", "body_mass_g"]
gentoo = df.loc[df['species']=="Gentoo", "body_mass_g"]
chinstrap = df.loc[df['species']=="Chinstrap", "body_mass_g"]

f, p = stats.f_oneway(adelie, gentoo, chinstrap)
f,p

(np.float64(341.8948949481461), np.float64(3.74450512630046e-81))

In [24]:
groups = [df.loc[df['species']==species, "body_mass_g"] for species in df['species'].unique()]
f, p = stats.f_oneway(*groups)

print(f"f={f:.4f} p={p:.4f}")

f=341.8949 p=0.0000


In [25]:
dfb = len(groups) - 1 # categories - 1
dfw = len(df) - len(groups) # total observations - categories
f_crit =  stats.f.ppf(1-alpha, dfb, dfw)
print(f"f={f:.4f}, critical={f_crit:.4f}")

f=341.8949, critical=3.0231


In [26]:
tukey = pairwise_tukeyhsd(
    endog=df['body_mass_g'],
    groups=df['species'],
    alpha=alpha
)
tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
Adelie,Chinstrap,26.9239,0.9164,-132.3528,186.2005,False
Adelie,Gentoo,1386.2726,0.0,1252.2897,1520.2554,True
Chinstrap,Gentoo,1359.3487,0.0,1194.4304,1524.2671,True


In [33]:
# with scipy.stats
bad_tukey = stats.tukey_hsd(*groups)
print(bad_tukey)

Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)  -1386.273     0.000 -1520.255 -1252.290
 (0 - 2)    -26.924     0.916  -186.201   132.353
 (1 - 0)   1386.273     0.000  1252.290  1520.255
 (1 - 2)   1359.349     0.000  1194.430  1524.267
 (2 - 0)     26.924     0.916  -132.353   186.201
 (2 - 1)  -1359.349     0.000 -1524.267 -1194.430

