In [1]:
import pandas as pd
from scipy import stats

def ladder_scores(frame, countries):
    """Return the non-missing 'Life Ladder' values for the listed countries.

    Args:
        frame: DataFrame with 'Country name' and 'Life Ladder' columns.
        countries: Iterable of country names to keep.

    Returns:
        A Series of 'Life Ladder' values for rows whose 'Country name' is in
        ``countries``, with missing values dropped. Countries that do not
        appear in ``frame`` simply contribute no rows.
    """
    in_group = frame['Country name'].isin(countries)
    return frame.loc[in_group, 'Life Ladder'].dropna()


def empty_groups(samples):
    """Return the labels of the groups in ``samples`` that have no observations.

    Args:
        samples: Mapping of group label -> sized collection of scores.
    """
    return [label for label, values in samples.items() if len(values) == 0]


def interpret_anova(p_value, alpha=0.05):
    """Return the conclusion lines to print for an ANOVA p-value.

    Args:
        p_value: P-value returned by the ANOVA test.
        alpha: Significance level used as the rejection threshold.

    Returns:
        A list of strings, one per output line.
    """
    # A NaN p-value means the test could not be computed (degenerate input).
    # NaN < alpha is False, so without this check it would be reported as
    # "fail to reject", which is not what happened.
    if pd.isna(p_value):
        return ["Conclusion: The test is inconclusive (the p-value could not be computed)."]
    if p_value < alpha:
        return [
            "Conclusion: We reject the null hypothesis.",
            "There is a significant difference in mean happiness scores between at least two regions.",
        ]
    return ["Conclusion: We fail to reject the null hypothesis."]


# In a notebook cell __name__ is "__main__", so the analysis below always runs
# there; the guard only keeps it from running (and needing the CSV) when the
# helpers above are imported from elsewhere. Names assigned inside the `if`
# are still module-level, so later cells can keep using them.
if __name__ == "__main__":
    # Load data with the CORRECT file name and encoding
    df = pd.read_csv('../data/raw/World-happiness-report-updated_2024.csv', encoding='cp1252')

    # 1. Filter for 2023
    df_2023 = df[df['year'] == 2023]

    # 2. Create sample groups (since there is no region column)
    # NOTE: each group is a short hand-picked list of countries, not the full
    # region, so the result only speaks for these samples.
    europe = ladder_scores(df_2023, ['United Kingdom', 'France', 'Germany', 'Italy', 'Spain', 'Poland'])
    north_america = ladder_scores(df_2023, ['United States', 'Canada', 'Mexico'])
    asia = ladder_scores(df_2023, ['China', 'India', 'Japan', 'South Korea'])

    # Fail loudly if a group matched no rows (e.g. a misspelled country name
    # or no 2023 data) instead of feeding an empty sample to the test.
    missing = empty_groups({'Europe': europe, 'North America': north_america, 'Asia': asia})
    if missing:
        raise ValueError(
            "No 2023 'Life Ladder' observations for group(s): " + ", ".join(missing)
        )

    # 3. Perform the ANOVA test
    f_stat, p_value = stats.f_oneway(europe, north_america, asia)

    print("--- ANOVA: Are Europe, N. America, and Asia equally happy (2023)? ---")
    print(f"F-statistic: {f_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    # 4. Interpret the result
    alpha = 0.05
    for line in interpret_anova(p_value, alpha):
        print(line)

--- ANOVA: Are Europe, N. America, and Asia equally happy (2023)? ---
F-statistic: 7.0375
P-value: 0.0124
Conclusion: We reject the null hypothesis.
There is a significant difference in mean happiness scores between at least two regions.
