In [17]:
from utils import Dataset, init_notebook
import pandas as pd
# Project-specific notebook setup imported from `utils`; what it configures
# is not visible in this notebook — see utils.init_notebook.
init_notebook()
# Load the working dataset. Later cells read measurement columns from `ds.df`.
# NOTE(review): presumably loaded from a spreadsheet source — confirm in utils.
ds = Dataset.from_sheets()

In [18]:
def welch_summary(mean1, std1, n1, mean2, std2, n2):
    """Print Welch's unequal-variance t-test computed from summary statistics.

    Parameters
    ----------
    mean1, std1, n1
        Mean, standard deviation and number of observations of group 1.
    mean2, std2, n2
        Mean, standard deviation and number of observations of group 2.

    Prints the t-statistic, the Welch-Satterthwaite degrees of freedom and
    the two-tailed p-value. Returns None.

    Raises
    ------
    ValueError
        If either group has fewer than two observations (the degrees of
        freedom are undefined), or if the combined standard error is not
        positive (both standard deviations are zero).
    """
    import math
    from scipy import stats

    # Each group contributes an (n - 1) divisor below, so n must be >= 2.
    if n1 < 2 or n2 < 2:
        raise ValueError(
            f"each group needs at least 2 observations, got n1={n1}, n2={n2}"
        )

    # Squared standard error of each group mean (variance of the mean).
    var_mean1 = std1**2 / n1
    var_mean2 = std2**2 / n2
    se_squared = var_mean1 + var_mean2

    # `not > 0` also rejects NaN, which would otherwise propagate silently.
    if not se_squared > 0:
        raise ValueError(
            "combined standard error must be positive; "
            "both standard deviations are zero or invalid"
        )

    # Calculate t-statistic
    t_stat = (mean1 - mean2) / math.sqrt(se_squared)

    # Calculate degrees of freedom using Welch-Satterthwaite equation
    df_num = se_squared**2
    df_den = (var_mean1**2) / (n1 - 1) + (var_mean2**2) / (n2 - 1)
    df = df_num / df_den

    # Calculate two-tailed p-value (sf = 1 - cdf, accurate in the far tail)
    p_value = 2 * stats.t.sf(abs(t_stat), df)

    # Output results
    print(f"t-statistic: {t_stat:.4f}")
    print(f"Degrees of freedom: {df:.2f}")
    # General format keeps 4 significant digits, so very small p-values are
    # shown in scientific notation instead of collapsing to "0.0000".
    print(f"Two-tailed p-value: {p_value:.4g}")

def mean_and_std_from_subsamples(n1, x1, s1, n2, x2, s2):
    """Merge two subsamples' summary statistics into one mean and std.

    Each subsample is given by its size (`n1`, `n2`), mean (`x1`, `x2`) and
    standard deviation (`s1`, `s2`). Returns a `(mean, std)` tuple for the
    union of both subsamples, where the standard deviation uses Bessel's
    correction (divisor N - 1).
    """
    total = n1 + n2

    # Size-weighted mean of the two subsample means.
    combined_mean = (n1 * x1 + n2 * x2) / total

    # Sum of squares: the spread inside each subsample plus the spread
    # caused by the two subsample means being different.
    ss_first = (n1 - 1) * s1**2
    ss_second = (n2 - 1) * s2**2
    ss_between = (n1 * n2 / total) * (x1 - x2)**2
    sum_of_squares = ss_first + ss_second + ss_between

    combined_var = sum_of_squares / (total - 1)
    return combined_mean, combined_var**0.5

def combined_sample_stats(groups):
    """Combine per-group summary statistics into an overall mean and std.

    Parameters
    ----------
    groups
        Iterable of `(n, mean, std)` tuples, one per group. Any iterable is
        accepted, including one-shot generators.

    Returns
    -------
    tuple
        `(mean, std)` of the pooled sample; the standard deviation uses
        Bessel's correction (divisor total_n - 1).

    Raises
    ------
    ValueError
        If `groups` is empty or the total number of observations is below 2.
    """
    # The data is traversed several times below; materialise it so that a
    # generator argument is not exhausted after the first pass.
    groups = list(groups)
    if not groups:
        raise ValueError("groups must contain at least one (n, mean, std) tuple")

    total_n = sum(n for n, _, _ in groups)
    if total_n < 2:
        raise ValueError(
            f"need at least 2 observations in total to estimate a std, got {total_n}"
        )

    # Size-weighted mean of the group means.
    mean = sum(n * x for n, x, _ in groups) / total_n

    # Sum of squares inside the groups plus sum of squares of the group
    # means around the overall mean.
    within = sum((n - 1) * s**2 for n, _, s in groups)
    between = sum(n * (x - mean)**2 for n, x, _ in groups)

    var = (within + between) / (total_n - 1)
    return mean, var**0.5


In [11]:
# Body-length column of the dataset; the column name is Polish for
# "Body length (cm)".
l = ds.df["Długość ciała (cm)"]

# Welch's t-test of our sample (mean, std, n taken from the column) against
# summary statistics passed as (mean2, std2, n2).
# NOTE(review): the literal values are presumably those reported in the
# cited paper, in the same units as our column — verify against the source.
print("# Casanova 2015, Body len")
welch_summary(l.mean(), l.std(), len(l), 33.03, 1.855, 42)

# Same comparison against a second literature reference.
print("# Bhowmik 2014, Body len")
welch_summary(l.mean(), l.std(), len(l), 33.28,1.8, 30)


# Casanova 2015, Body len
t-statistic: 10.1724
Degrees of freedom: 48.55
Two-tailed p-value: 0.0000
# Bhowmik 2014, Body len
t-statistic: 8.9522
Degrees of freedom: 46.82
Two-tailed p-value: 0.0000


In [11]:
# Merge the two subsamples (n = 24 each) given for every measurement into a
# single overall (mean, std) pair.
# NOTE(review): values are presumably taken from Hetmanski 2011 — verify
# against the paper, including units.
print("Hetmanski 2011, Weight", mean_and_std_from_subsamples(n1=24, n2=24, x1=389.9, x2=372.9, s1=29.3, s2=27.6))
print("Hetmanski 2011, Head len", mean_and_std_from_subsamples(n1=24, n2=24, x1=55.5, x2=54.2, s1=1.0, s2=1.1))
print("Hetmanski 2011, Wing len", mean_and_std_from_subsamples(n1=24, n2=24, x1=23.3, x2=23.0, s1=0.7, s2=0.7))

Hetmanski 2011, Weigth (381.3999999999999, 29.439353456128988)
Hetmanski 2011, Head len (54.85, 1.2300320009675576)
Hetmanski 2011, Wing len (23.150000000000002, 0.7089098613289011)


In [39]:
# Per-group summary statistics to be pooled into one overall (mean, std).
# NOTE(review): values are presumably taken from Hetmanski 2008 — verify
# against the paper.
groups = [
    (20, 372, 22), # num samples, average, std dev
    (19, 385, 28),
    (19, 372, 43),
    (22, 403, 34),
    (15, 407, 32),
    (16, 396, 32)
]
print("Hetmanski 2008, Weight", combined_sample_stats(groups))

Hetmanski 2008, Weigth (388.55855855855856, 34.65037968693579)
