In [1]:
import pandas as pd
from scipy import stats

def select_scores(frame, countries, country_col='Country name', score_col='Life Ladder'):
    """Return the non-null scores in *frame* for the given countries.

    Args:
        frame: DataFrame holding at least ``country_col`` and ``score_col``.
        countries: Country names to keep. Names that do not occur in
            ``frame`` match nothing, so the result can be shorter than
            this list.
        country_col: Column holding the country names.
        score_col: Column holding the values to return.

    Returns:
        A Series of the matching ``score_col`` values with NaN entries dropped.
    """
    return frame.loc[frame[country_col].isin(countries), score_col].dropna()


def conclusion_lines(p_value, alpha=0.05):
    """Return the report lines that interpret *p_value* at level *alpha*.

    Args:
        p_value: Two-sided p-value of the test; may be NaN.
        alpha: Significance threshold.

    Returns:
        A list of strings, one per line to print.
    """
    # A NaN p-value means the test could not be computed (e.g. a group ended
    # up with too few observations). `nan < alpha` is False, so without this
    # branch the report would claim "fail to reject" for a test that never ran.
    if pd.isna(p_value):
        return [
            "Conclusion: The t-test is undefined (p-value is NaN); "
            "each group needs at least two observations."
        ]
    if p_value < alpha:
        return [
            "Conclusion: We reject the null hypothesis.",
            "There is a statistically significant difference in happiness scores.",
        ]
    return ["Conclusion: We fail to reject the null hypothesis."]


# Guarded so the helpers above can be imported without touching the data file.
# When run as a notebook cell or a script, __name__ is "__main__", so the
# analysis still executes and its variables remain module-level.
if __name__ == "__main__":
    # Load data with the CORRECT file name and encoding
    df = pd.read_csv('../data/raw/World-happiness-report-updated_2024.csv', encoding='cp1252')

    # 1. Create your two groups
    # Restrict to a single year (2023, the most recent one per the original
    # notes) so each country contributes at most one observation.
    df_2023 = df[df['year'] == 2023]

    # NOTE(review): the "G7" label in the report is loose. group1 holds only
    # four of the seven G7 members, and Japan (a G7 member) is in group2. The
    # printed labels are kept as-is so the report text does not change.
    group1 = select_scores(df_2023, ['United States', 'United Kingdom', 'France', 'Germany'])
    group2 = select_scores(df_2023, ['China', 'India', 'Japan'])

    # 2. Perform the t-test
    # equal_var=False selects Welch's t-test, which does not assume the two
    # groups share a variance.
    t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)

    print("--- T-Test: G7 vs. Asian Powers (2023) ---")
    print(f"Group 1 Mean (G7): {group1.mean():.4f}")
    print(f"Group 2 Mean (Asia): {group2.mean():.4f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    # 3. Interpret the result
    alpha = 0.05
    print("\n".join(conclusion_lines(p_value, alpha)))

--- T-Test: G7 vs. Asian Powers (2023) ---
Group 1 Mean (G7): 6.6320
Group 2 Mean (Asia): 5.5770
T-statistic: 2.2954
P-value: 0.1443
Conclusion: We fail to reject the null hypothesis.
