In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


In [2]:
root = '../data/'
df = pd.read_csv(root + 'global_health.csv')
df.head()

Unnamed: 0,Country,Country_Code,Year,Fertility_Rate,Urban_Population_Percent,Total_Population,Water_Access_Percent,Unemployment_Rate,Sanitary_Expense_Per_GDP,Life_Expectancy,...,Sanitary_Expense_Per_Capita,CO2_Exposure_Percent,Air_Pollution,Labour_Force_Total,Tuberculosis_Per_100000,Suicide_Rate_Percent,Obesity_Rate_Percent,Underweight_Rate_Percent,Overweight_Rate_Percent,Safe_Water_Access_Percent
0,Afghanistan,AFG,2012,5.83,24.16,30466479.0,21.123996,7.909,7.897169,61.923,...,52.613541,70.922317,70.922317,7520865.0,189.0,3.68,10.7,10.15,31.55,46.68
1,Afghanistan,AFG,2013,5.696,24.373,31541209.0,22.03447,7.919,8.805964,62.417,...,56.305542,73.131816,73.131816,7881567.0,189.0,3.66,11.55,10.79,32.73,49.45
2,Afghanistan,AFG,2014,5.56,24.587,32716210.0,22.944301,7.915,9.528878,62.545,...,60.189579,77.143728,77.143728,8285362.0,189.0,3.6,10.44,10.17,33.95,52.25
3,Afghanistan,AFG,2015,5.405,24.803,33753499.0,23.85359,9.011,10.105348,62.659,...,60.05854,73.490818,73.490818,8630724.0,189.0,3.57,11.19,10.52,35.19,55.09
4,Afghanistan,AFG,2016,5.262,25.02,34636207.0,24.76222,10.1,11.81859,63.136,...,61.486458,72.76591,72.76591,8913938.0,189.0,3.61,11.99,7.88,36.45,57.97


In [3]:
df.describe()

Unnamed: 0,Year,Fertility_Rate,Urban_Population_Percent,Total_Population,Water_Access_Percent,Unemployment_Rate,Sanitary_Expense_Per_GDP,Life_Expectancy,Life_Expectancy_Female,Life_Expectancy_Male,...,Sanitary_Expense_Per_Capita,CO2_Exposure_Percent,Air_Pollution,Labour_Force_Total,Tuberculosis_Per_100000,Suicide_Rate_Percent,Obesity_Rate_Percent,Underweight_Rate_Percent,Overweight_Rate_Percent,Safe_Water_Access_Percent
count,1880.0,1844.0,1880.0,1880.0,1225.0,1740.0,1834.0,1840.0,1840.0,1840.0,...,1833.0,1683.0,1683.0,1740.0,1670.0,1272.0,1650.0,1650.0,1650.0,1648.0
mean,2016.5,2.762999,58.616514,39596360.0,68.156944,7.68353,6.646729,71.435155,74.096245,68.890663,...,1158.319956,26.298231,26.298231,19161170.0,118.492665,9.569717,17.724194,5.508636,42.325394,81.960376
std,2.873046,1.34558,23.170964,145228800.0,30.828206,5.73873,2.975888,7.836184,8.108933,7.716119,...,1905.095419,17.296974,17.296974,72077900.0,163.747526,11.977159,10.833918,5.46802,18.356223,21.324338
min,2012.0,0.808,11.194,10444.0,5.86313,0.1,1.514593,47.835,50.486,45.36,...,13.625722,4.895181,4.895181,36260.0,0.0,0.0,0.47,0.23,5.46,25.61
25%,2014.0,1.67,40.069,2061738.0,44.033993,3.7155,4.400171,65.367,68.057,62.957,...,86.749573,13.894428,13.894428,1439398.0,13.0,2.21,11.345,1.35,25.9125,65.555
50%,2016.5,2.311,58.884,8693360.0,76.250143,5.811,6.280311,72.554,76.093,69.3,...,353.0,21.181059,21.181059,4341022.0,48.0,10.42,16.49,3.06,45.345,91.07
75%,2019.0,3.6455,77.63375,29184560.0,97.10822,10.24525,8.428217,77.42275,80.4015,74.682,...,1158.393066,32.333277,32.333277,12177480.0,167.75,13.5725,23.32,9.5725,55.01,99.6825
max,2021.0,7.4,100.0,1412360000.0,100.0,35.707,24.283052,84.56,87.71,82.6,...,12012.241211,107.144665,107.144665,780709600.0,1180.0,147.8,69.08,31.09,88.81,100.0


In [4]:
# drop any rows that are missing data in key columns
key_columns = [
    'Year', 'Country', 'GDP_Per_Capita', 
    'Life_Expectancy', 'Total_Population'
]
df_clean = df.dropna(subset=key_columns)

# Ensure 'Year' is an integer (it's best for the animation slider)
df_clean['Year'] = df_clean['Year'].astype(int)

print(f"Original rows: {len(df)}, Rows after cleaning: {len(df_clean)}")

### Create Animated Bubble Chart ###

# Calculate min max before plotting
min_life_exp = df_clean['Life_Expectancy'].min() - 5
max_life_exp = df_clean['Life_Expectancy'].max() + 5

# Put min max on log scale for GDP
min_gdp = df_clean[df_clean['GDP_Per_Capita'] > 0]['GDP_Per_Capita'].min() * 0.9
max_gdp = df_clean['GDP_Per_Capita'].max() * 1.1

print("Creating animated bubble chart... This may take a moment.")

fig = px.scatter(
    df_clean,
    x="GDP_Per_Capita",         # X-axis: Wealth
    y="Life_Expectancy",        # Y-axis: Health
    size="Total_Population",    # Bubble Size: Population
    color="Country",            # Bubble Color: Country
    
    # Interactivity & Tooltip 
    hover_name="Country",       
    hover_data={                
        "Year": True,
        "GDP_Per_Capita": ":,.2f", 
        "Life_Expectancy": ":.1f",
        "Total_Population": ":,d"  
    },

    # Animation 
    animation_frame="Year",
    animation_group="Country", 
    
    # Styling & Configuration 
    log_x=True,                
    size_max=60,                
    range_y=[min_life_exp, max_life_exp], 
    range_x=[min_gdp, max_gdp], 
    title="Global Health vs. Wealth (2012-Present)",
    labels={
        "GDP_Per_Capita": "GDP per Capita (Log Scale)",
        "Life_Expectancy": "Life Expectancy (Years)",
        "Total_Population": "Population"
    }
)

# Improve Layout
# Hide the legend (it will be huge with all the countries)
fig.update_layout(showlegend=False)
# Add a prefix to the slider's display
fig.layout.sliders[0].currentvalue.prefix = 'Year: '

# Show the Chart
fig.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Year'] = df_clean['Year'].astype(int)


Original rows: 1880, Rows after cleaning: 1813
Creating animated bubble chart... This may take a moment.


In [5]:
df_processed = df.copy()

# Calculate a Composite Index: "Development Index"

# Log-transform GDP per Capita for better scaling
df_processed['log_GDP_Per_Capita'] = np.log(df_processed['GDP_Per_Capita'].replace(0, np.nan))

# Format: { 'column_name': 'direction' }
index_columns = {
    'Life_Expectancy': 'good',
    'log_GDP_Per_Capita': 'good',
    'Safe_Water_Access_Percent': 'good',
    'Unemployment_Rate': 'bad',
    'Immunization_Rate': 'good'
}

# Find global "goalposts" (min and max) for each metric
global_stats = {}
for col in index_columns.keys():
    global_stats[col] = {
        'min': df_processed[col].min(),
        'max': df_processed[col].max()
    }

# Normalize each column to a 0-1 score
score_cols = []
for col, direction in index_columns.items():
    score_col_name = f"{col}_Score"
    score_cols.append(score_col_name)
    
    global_min = global_stats[col]['min']
    global_max = global_stats[col]['max']
    
    if direction == 'good':
        # Formula: (Value - Min) / (Max - Min)
        df_processed[score_col_name] = (df_processed[col] - global_min) / (global_max - global_min)
    elif direction == 'bad':
        # Formula: (Max - Value) / (Max - Min)
        df_processed[score_col_name] = (global_max - df_processed[col]) / (global_max - global_min)

# Calculate the final Aggregate_Score by averaging the individual scores
df_processed['Aggregate_Score'] = df_processed[score_cols].mean(axis=1, skipna=True)


# Clean processed data for plotting
plot_df = df_processed.dropna(subset=[
    'Year', 'Country', 'GDP_Per_Capita', 
    'Life_Expectancy', 'Total_Population', 'Aggregate_Score'
])
plot_df['Year'] = plot_df['Year'].astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [6]:
df_processed.head()

Unnamed: 0,Country,Country_Code,Year,Fertility_Rate,Urban_Population_Percent,Total_Population,Water_Access_Percent,Unemployment_Rate,Sanitary_Expense_Per_GDP,Life_Expectancy,...,Underweight_Rate_Percent,Overweight_Rate_Percent,Safe_Water_Access_Percent,log_GDP_Per_Capita,Life_Expectancy_Score,log_GDP_Per_Capita_Score,Safe_Water_Access_Percent_Score,Unemployment_Rate_Score,Immunization_Rate_Score,Aggregate_Score
0,Afghanistan,AFG,2012,5.83,24.16,30466479.0,21.123996,7.909,7.897169,61.923,...,10.15,31.55,46.68,6.482216,0.383608,0.15784,0.283237,0.780689,0.6,0.441075
1,Afghanistan,AFG,2013,5.696,24.373,31541209.0,22.03447,7.919,8.805964,62.417,...,10.79,32.73,49.45,6.459487,0.397059,0.154588,0.320473,0.780408,0.5625,0.443006
2,Afghanistan,AFG,2014,5.56,24.587,32716210.0,22.944301,7.915,9.528878,62.545,...,10.17,33.95,52.25,6.440169,0.400545,0.151824,0.358113,0.780521,0.5375,0.4457
3,Afghanistan,AFG,2015,5.405,24.803,33753499.0,23.85359,9.011,10.105348,62.659,...,10.52,35.19,55.09,6.34015,0.403649,0.137513,0.39629,0.74974,0.5625,0.449938
4,Afghanistan,AFG,2016,5.262,25.02,34636207.0,24.76222,10.1,11.81859,63.136,...,7.88,36.45,57.97,6.259683,0.416637,0.125999,0.435005,0.719156,0.5875,0.456859


In [7]:
# Countries dataframe to be used to join
# https://www.kaggle.com/datasets/hserdaraltan/countries-by-continent
countries_df = pd.read_csv(root + 'countries_and_continents.csv')
countries_d = countries_df.set_index('Country')['Continent'].to_dict()
countries_df.head()

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [8]:
# Map countries to respective continents
df_processed = df_processed.join(countries_df.set_index('Country'), on ='Country')

In [11]:
try:
    output_filename = 'global_health_with_index.csv'
    df_processed.to_csv(output_filename, index=False)
    print(f"Successfully exported processed data to '{output_filename}'")
except Exception as e:
    print(f"Error exporting file: {e}")

Successfully exported processed data to 'global_health_with_index.csv'


In [10]:
# Create animated bubble chart

# Fix the axis ranges to prevent "jumping" during animation
min_life_exp = plot_df['Life_Expectancy'].min() - 5
max_life_exp = plot_df['Life_Expectancy'].max() + 5
min_gdp = plot_df[plot_df['GDP_Per_Capita'] > 0]['GDP_Per_Capita'].min() * 0.9
max_gdp = plot_df['GDP_Per_Capita'].max() * 1.1

print("Creating animated bubble chart...")

fig = px.scatter(
    plot_df,
    # Define Axes & Bubble Size
    x="GDP_Per_Capita",
    y="Life_Expectancy",
    size="Total_Population",
    
    # Color by our Aggregate Score
    color="Aggregate_Score",
    color_continuous_scale=px.colors.sequential.Viridis, 
    range_color=[0, 1],
    
    # Interactivity & Tooltip 
    hover_name="Country",
    hover_data={
        "Year": True,
        "GDP_Per_Capita": ":,.2f",
        "Life_Expectancy": ":.1f",
        "Total_Population": ":,d",
        "Aggregate_Score": ":.3f" 
    },

    # Animation 
    animation_frame="Year",
    animation_group="Country",
    
    # --- Styling & Configuration ---
    log_x=True,
    size_max=60,
    range_y=[min_life_exp, max_life_exp],
    range_x=[min_gdp, max_gdp],
    title="Global Health vs. Wealth (Colored by Development Index)",
    labels={
        "GDP_Per_Capita": "GDP per Capita (Log Scale)",
        "Life_Expectancy": "Life Expectancy (Years)",
        "Total_Population": "Population",
        "Aggregate_Score": "Development Score"
    }
)

# --- 5. Improve Layout ---
fig.update_layout(showlegend=False)
# Add a prefix to the slider
fig.layout.sliders[0].currentvalue.prefix = 'Year: '
# Title the color bar
fig.update_coloraxes(colorbar_title_text='Dev. Score')

print("Chart created. Showing in browser...")

# Show
fig.show()

Creating animated bubble chart...
Chart created. Showing in browser...
