In [2]:
import pandas as pd
from scipy import stats

# Chi-square test of independence between a country's happiness level and
# its economy level for a single year. Both variables are binarised with a
# median split of the rows that remain after filtering.

# Load the World Happiness Report data. The encoding is passed explicitly
# (cp1252 is the value this notebook settled on for this file).
df = pd.read_csv('../data/raw/World-happiness-report-updated_2024.csv', encoding='cp1252')

# 1. Keep only the rows for the analysis year that have both measurements.
#    .copy() makes df_2023 an independent frame, so adding columns below does
#    not write into a slice of df (avoids SettingWithCopyWarning).
analysis_year = 2023
df_2023 = (
    df[df['year'] == analysis_year]
    .dropna(subset=['Life Ladder', 'Log GDP per capita'])
    .copy()
)
if df_2023.empty:
    # Fail with a clear message instead of an opaque error from the test.
    raise ValueError(f"No complete rows found for year {analysis_year}.")

# 2. Create categorical variables via a median split. The comparison is
#    strict, so a country sitting exactly on the median is labelled
#    'Unhappy' / 'Poor'.
median_score = df_2023['Life Ladder'].median()
median_gdp = df_2023['Log GDP per capita'].median()

df_2023['Happiness Level'] = (
    df_2023['Life Ladder'].gt(median_score).map({True: 'Happy', False: 'Unhappy'})
)
df_2023['Economy Level'] = (
    df_2023['Log GDP per capita'].gt(median_gdp).map({True: 'Rich', False: 'Poor'})
)

# 3. Create a contingency table (crosstab): rows are happiness levels,
#    columns are economy levels, cells are country counts.
contingency_table = pd.crosstab(df_2023['Happiness Level'], df_2023['Economy Level'])

print(f"--- Chi-Square Test: Is Happiness related to GDP ({analysis_year})? ---")
print("\nContingency Table:")
print(contingency_table)

# 4. Perform the Chi-Square test of independence.
#    NOTE: for a 2x2 table (1 degree of freedom) SciPy applies Yates'
#    continuity correction by default, so the statistic below is the
#    corrected one.
chi2_stat, p_value, dof, expected_freq = stats.chi2_contingency(contingency_table)

print(f"\nChi2 Statistic: {chi2_stat:.4f}")
# Significant-digit formatting: a fixed '.4f' would show a very small
# p-value as 0.0000, which wrongly suggests it is exactly zero.
print(f"P-value: {p_value:.4g}")

# 5. Interpret the result at the 5% significance level.
alpha = 0.05
if p_value < alpha:
    print("Conclusion: We reject the null hypothesis.")
    print("There is a significant association between a country's Economy Level and its Happiness Level.")
else:
    print("Conclusion: We fail to reject the null hypothesis.")
    print("There is not enough evidence of an association between a country's Economy Level and its Happiness Level.")

--- Chi-Square Test: Is Happiness related to GDP (2023)? ---

Contingency Table:
Economy Level    Poor  Rich
Happiness Level            
Happy              11    53
Unhappy            54    11

Chi2 Statistic: 53.3995
P-value: 0.0000
Conclusion: We reject the null hypothesis.
There is a significant association between a country's Economy Level and its Happiness Level.
