In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.mixture import GaussianMixture

In [None]:
# Read the exported CSV into `df`, the frame every later cell works from.
df = pd.read_csv(filepath_or_buffer="../Data/Clean/all_data_SQL_Export.csv")

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) to check
# what was loaded before any renaming or plotting.
df.info()

In [None]:
# Give the columns the shorter, consistently-cased names the rest of the
# notebook uses. Reassigning (rather than `inplace=True`) keeps the step
# chainable; `rename` ignores keys that are not present, so re-running this
# cell after the columns are already renamed is harmless.
df = df.rename(columns={
    "Log_GDP_PEr_Capita": "Log_GDP_Per_Capita",
    "Happiness_Score": "Happiness",
    "%_Engaged": "Job_Engagement",
    "Average_hours_per_week": "Work_hours_per_week",
})

In [None]:
# Define numeric columns (float64 only, so integer-typed columns are skipped)
numerical_columns = df.select_dtypes(include=['float64']).columns

# Distribution graphs for each numeric column, laid out on a 3-column grid.
# Ceiling division yields exactly as many rows as are needed; the previous
# `len // 3 + 1` reserved an extra, empty row whenever the number of columns
# was a multiple of 3.
n_cols = 3
n_rows = (len(numerical_columns) + n_cols - 1) // n_cols

# Height scales with the row count (5 inches per row) so each panel keeps the
# same size regardless of how many features are plotted.
plt.figure(figsize=(16, 5 * max(n_rows, 1)))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df[column], kde=True, color='skyblue')
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
plt.tight_layout()
# Added after tight_layout and nudged above the axes (y > 1) so it does not
# collide with the top row of panel titles.
plt.suptitle("Distribution of Numerical Features", y=1.02, fontsize=16)
plt.show()

In [None]:
# Boxplot of every numeric column on a 3-column grid. The grid is sized from
# the number of columns instead of a fixed 3x3, so adding a tenth numeric
# column no longer raises IndexError on `axes[i]`.
n_cols = 3
n_rows = max(1, (len(numerical_columns) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, numerical_columns):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

# Hide any grid cells left over when the column count is not a multiple of 3.
for ax in axes[len(numerical_columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

*Analysis:*

**Happiness Score**
- Fairly normal distribution, with most values concentrated in the mid to high range.
- Mean: **5.52**, Standard Deviation: **1.12**
- Scores range from 1.28 (low happiness) to high 7s.
- The distribution seems roughly normal, but there are some low-score outliers like Afghanistan, likely due to the war and political situation.

**Log GDP Per Capita**
- Mean: **9.43**, with a wide standard deviation of **1.13.**
- Right-skewed distribution, indicating that most countries have lower GDP per capita, with fewer having very high GDP per capita.
- Venezuela is a major outlier with very low GDP, possibly due to economic crises.

**Social Support**
- Mean: **0.81**, with a fairly high lower bound (0.22).
- **No extreme outliers**, indicating that most countries have reasonable social support.

**Life Expectancy**
- Mean: **63.7 years**, but ranges from 40.4 to nearly 80.
- Normally distributed, with most values falling within a predictable range.
- There are likely outliers on the lower end, possibly due to developing countries with lower healthcare access.

**Freedom**
- Mean: **0.75**, but with some low-end outliers.
- The distribution is left skewed, possibly indicating that a subset of countries have significantly lower freedom scores.
- The minimum is **0.23**, suggesting some countries have much lower freedom scores.

**Generosity**
- Mean: **-0.001**, indicating some negative generosity values.
- A broad spread, with some outliers.

**Corruption**
- Mean: **0.75**, indicating that most countries report corruption.
- Left skewed; most values are concentrated toward the high end, suggesting that low corruption values are uncommon.
- The minimum is **0.035**, suggesting some countries have very low corruption.

**Job Engagement**
- Mean: **20.56%**, with a minimum of 4% engagement.
- Only included for 2023 and 2024

**Work Hours Per Week**
- Mean: **32.8 hours**
- Some outliers exist, as seen in the boxplot.
- Only included for OECD countries

In [None]:

# Pairwise correlations between the float-valued features
correlation_matrix = df[numerical_columns].corr()

# Annotated heatmap of the matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=ax,
)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()

*Analysis:*

**Strong Happiness Correlations**
- **Log GDP Per Capita (~0.77):** Wealthier countries tend to report higher happiness.
- **Life Expectancy (~0.73)**: Healthier countries are generally happier.
- **Social Support (~0.72):** Countries with strong support networks are generally happier.

**Moderate to Weak Happiness Correlations:**
- **Freedom (~0.54):** Freedom is related to higher happiness, but only moderately.
- **Corruption & Happiness (-0.48):** Higher corruption is associated with lower happiness, but not strongly.
- **Work Hours (-0.36):** Higher average weekly work hours is somewhat related to lower happiness, but not strong.

**Other Strong Correlations**
- **Life Expectancy & Social Support (~0.70):** Countries with better social support systems tend to have higher life expectancy.
- **Work Hours & GDP Per Capita (-0.75):** Countries with higher working hours tend to have less wealth.
- **Work Hours & Corruption (-0.55):** Countries with higher work hours tend to have more corruption.

*Other notes*
- **Generosity** has a weak relationship with most factors, except for freedom.
- **Job Engagement** has weak relationships with most factors, also except for freedom



In [None]:
# Regional Analysis: mean of every numeric feature per region
region_summary = df.groupby("Region")[numerical_columns].mean().reset_index()

# One fixed colour per region, so a region keeps its colour in every panel
# even though each panel sorts the regions differently
unique_regions = region_summary["Region"].unique()
tab10 = sns.color_palette("tab10", len(unique_regions))
region_colors = {region: shade for region, shade in zip(unique_regions, tab10)}

# Work Hours & Job Engagement are left out because their data was incomplete
metrics = ["Happiness", "Log_GDP_Per_Capita", "Life_Expectancy", "Freedom", "Generosity", "Corruption"]

# 3x2 grid: one panel per metric listed above
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12))
axes = axes.flatten()

for ax, metric in zip(axes, metrics):
    # Highest regional mean first
    sorted_data = region_summary.sort_values(by=metric, ascending=False)

    # Colours listed in the same order as the sorted regions
    colors = [region_colors[region] for region in sorted_data["Region"]]

    # NOTE(review): newer seaborn releases appear to deprecate `palette`
    # without `hue` — confirm against the installed version before upgrading.
    sns.barplot(
        y=sorted_data["Region"],
        x=sorted_data[metric],
        ax=ax,
        palette=colors
    )
    ax.set_title(f"{metric}")

plt.tight_layout()
plt.show()
region_summary


*Analysis*

**Happiness Score:**
- **North America + Australia and New Zealand** have the highest average happiness score.
- **Sub Saharan Africa** has the lowest happiness score among the regions analyzed.

**Log GDP Per Capita:**
- **Western Europe** has the highest GDP per capita (10.78), followed closely by North America and ANZ (10.78).
- **Sub Saharan Africa** has the lowest GDP per capita.

**Life Expectancy:**
- **Western Europe** has the highest average life expectancy (70.9)
- **Sub Saharan Africa** has much lower life expectancy compared to other regions (~54)

**Freedom:**
- **North America + Australia and New Zealand** has the highest freedom scores.
- **Middle East & North Africa** has the lowest freedom scores.

**Generosity:**
- Generosity scores are negative across most regions *except* North America, ANZ, Southeast Asia, Western Europe, South Asia
- **Commonwealth of Independent States** (former Soviet republics) have the lowest generosity scores.

**Corruption**
- **Central & Eastern Europe** has the highest corruption levels.
- **North America and ANZ** show the lowest corruption levels among the regions analyzed.

In [None]:
# Build a per-year ranking table: one rank column per metric for every country

# Metrics that get a rank
ranking_metrics = ["Happiness", "Log_GDP_Per_Capita", "Life_Expectancy", "Freedom", "Corruption", "Social_Support"]

# Rank each metric within its year. `ascending=False` gives rank 1 to the
# largest value and `method="min"` gives tied countries the same (lowest) rank.
# NOTE(review): this also ranks the highest Corruption value as 1 — confirm
# that is the intended direction for that metric.
rank_columns = {
    f"{metric}_Rank": df.groupby("Year")[metric].rank(ascending=False, method="min")
    for metric in ranking_metrics
}

# Keep only the identifiers plus the freshly computed rank columns
ranked_df = df.assign(**rank_columns)[["Country_Name", "Year", *rank_columns]]

# Show the ranking table
display(ranked_df)

In [None]:
# Calculating year-over-year rank changes for each country.
#
# `groupby(...).diff()` subtracts the previous *row* of the same country, so
# the rows must be in chronological order for the result to be a
# year-over-year change. Sort explicitly instead of relying on the CSV order.
# NOTE(review): if a country has no row for some year, the difference spans
# that gap — confirm whether such pairs should be excluded.
rank_change_df = ranked_df.sort_values(["Country_Name", "Year"])

# Compute the change in rank versus the country's previous row for each metric
for metric in ranking_metrics:
    rank_change_df[f"{metric}_Rank_Change"] = rank_change_df.groupby("Country_Name")[f"{metric}_Rank"].diff()

# Melt to long format (one row per country/year/metric) to find the biggest
# movers; dropna() removes each country's first row, which has no predecessor
rank_movers = rank_change_df.melt(
    id_vars=["Country_Name", "Year"],
    value_vars=[f"{metric}_Rank_Change" for metric in ranking_metrics],
    var_name="Metric",
    value_name="Rank_Change"
).dropna()

# Identify the biggest rank changes by absolute size, in either direction
biggest_rank_movers = rank_movers.sort_values(by="Rank_Change", key=abs, ascending=False).head(20)

display(biggest_rank_movers)


In [None]:
# Recalculate year-over-year rank changes excluding Corruption and Freedom.
#
# `groupby(...).diff()` subtracts the previous *row* of the same country, so
# sort by country and year first; otherwise the "change" depends on whatever
# order the rows happened to be loaded in.
filtered_rank_change_df = ranked_df.sort_values(["Country_Name", "Year"])

# Define the metrics to keep
filtered_metrics = ["Happiness", "Log_GDP_Per_Capita", "Life_Expectancy", "Social_Support"]

# Compute the change in rank versus the country's previous row for each metric
for metric in filtered_metrics:
    filtered_rank_change_df[f"{metric}_Rank_Change"] = filtered_rank_change_df.groupby("Country_Name")[f"{metric}_Rank"].diff()

# Melt to long format (one row per country/year/metric) to find the biggest
# movers; dropna() removes each country's first row, which has no predecessor
filtered_rank_movers = filtered_rank_change_df.melt(
    id_vars=["Country_Name", "Year"],
    value_vars=[f"{metric}_Rank_Change" for metric in filtered_metrics],
    var_name="Metric",
    value_name="Rank_Change"
).dropna()

# Identify the biggest rank changes by absolute size, in either direction
filtered_biggest_rank_movers = filtered_rank_movers.sort_values(by="Rank_Change", key=abs, ascending=False).head(20)

# Display the biggest movers in ranking (excluding Corruption and Freedom)
display(filtered_biggest_rank_movers)


In [None]:
# Keep only the countries that appear among the biggest movers identified earlier
top_moving_countries = filtered_biggest_rank_movers["Country_Name"].unique()
filtered_time_series = filtered_rank_change_df[filtered_rank_change_df["Country_Name"].isin(top_moving_countries)]

# One stacked panel per metric. The panel count follows `filtered_metrics`
# rather than a hard-coded 4, and `squeeze=False` keeps `axes` an array even
# when there is a single metric, so `.flatten()` always works.
n_panels = len(filtered_metrics)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(10, 5 * n_panels), squeeze=False)
axes = axes.flatten()

# Plot each country's rank over time for every selected metric
for ax, metric in zip(axes, filtered_metrics):
    sns.lineplot(
        data=filtered_time_series,
        x="Year",
        y=f"{metric}_Rank",
        hue="Country_Name",
        marker="o",
        ax=ax
    )
    ax.set_title(f"Time Series of {metric} Rank Changes for Biggest Movers")
    ax.invert_yaxis()  # Lower ranks (better positions) should be at the top

plt.tight_layout()
plt.show()
