In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, QuantileRegressor, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from linearmodels.panel import PanelOLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from numpy.linalg import matrix_rank

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read Data

path='./Data/merged_burden_risk.csv'

df = pd.read_csv(path)

In [3]:
#rename columns for ease 

df = df.dropna(subset=['dalys_(disability-adjusted_life_years)'])
df.rename(columns={'Chronic_Respiratory(RSD)': 'RSD'}, inplace=True)
df.rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'}, inplace=True)
df = df.sort_values(by=['country', 'year'])

In [4]:
#filling in missing data for HAQ_Index and RSD using Linear Interpolation and Regression Imputation

# Step 1: Linear interpolation (within each country)
for col in ['HAQ_Index', 'RSD']:
    df[col] = df.groupby('country')[col].transform(lambda x: x.interpolate(method='linear', limit_direction='both'))

# Step 2: Fill remaining HAQ using regression on year
if df['HAQ_Index'].isna().sum() > 0:
    known = df[df['HAQ_Index'].notna()]
    unknown = df[df['HAQ_Index'].isna()]
    model = LinearRegression().fit(known[['year']], known['HAQ_Index'])
    df.loc[unknown.index, 'HAQ_Index'] = model.predict(unknown[['year']])

# Step 3: Fill remaining RSD (if any) using regression
if df['RSD'].isna().sum() > 0:
    known_rsd = df[df['RSD'].notna()]
    unknown_rsd = df[df['RSD'].isna()]
    model_rsd = LinearRegression().fit(known_rsd[['year']], known_rsd['RSD'])
    df.loc[unknown_rsd.index, 'RSD'] = model_rsd.predict(unknown_rsd[['year']])

In [5]:
#Log Transformations: Normalize skewed variables
df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
df['log_population_density'] = np.log(df['Population Density'] + 1)
df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)


#Per Capita Pollution Measures: Scale pollution to population for fair comparisons
df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
df['no2_per_capita'] = df['Nitrogen oxide'] / df['Population']
df['black_carbon_per_capita'] = df['Black Carbon'] / df['Population']

#Pollution burden adjusted for healthcare quality
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'] / 100)

df['year_index'] = df['year'] - df['year'].min() #Time Index: Relative year index

df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(1) #Lagged DALYs: Previous year's burden for temporal modeling

df.head()


Unnamed: 0,country,Sub-Region,year,Population,GDP PER CAPITA (USD),Area (Km2),Population Density,Total CO2 Emission excluding LUCF (Mt),Nitrogen oxide,Sulphur dioxide,...,smoking_YLL,log_gdp_per_capita,log_population_density,log_total_co2,co2_per_capita,no2_per_capita,black_carbon_per_capita,pollution_x_low_haq,year_index,lagged_dalys
0,algeria,Northern Africa,2000,30774621,1780.38,2381741,12.921061,80.05,259742.44,99556.484,...,19088,7.485144,2.633403,4.395066,3e-06,0.00844,0.001715,1e-06,0,
1,algeria,Northern Africa,2001,31200985,1754.58,2381741,13.100075,78.65,259294.86,91087.23,...,19447,7.470555,2.64618,4.377642,3e-06,0.00831,0.001509,1e-06,1,76462.0
2,algeria,Northern Africa,2002,31624696,1794.81,2381741,13.277974,82.4,293826.3,118613.93,...,20062,7.493211,2.658718,4.423648,3e-06,0.009291,0.001538,1e-06,2,78867.0
3,algeria,Northern Africa,2003,32055883,2117.05,2381741,13.459013,88.19,289759.84,108086.62,...,20779,7.658251,2.671318,4.490769,3e-06,0.009039,0.001531,1e-06,3,82146.0
4,algeria,Northern Africa,2004,32510186,2624.8,2381741,13.649757,89.49,315098.84,121832.65,...,21397,7.873141,2.684424,4.505239,3e-06,0.009692,0.001523,1e-06,4,85619.0


In [6]:
#Rolling Averages: Capture long-term exposure effects
df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df['dalys_3yr_avg'] = df.groupby('country')['DALYs'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

#Temporal Change: Year-over-year change in pollution
df['delta_pm25'] = df.groupby('country')['pm25_DALY'].diff()
df['delta_black_carbon'] = df.groupby('country')['Black Carbon'].diff()

#Interaction Terms: Capture compound effects between variables
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index']
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_dalys_lag'] = df['HAQ_Index'] * df.groupby('country')['DALYs'].shift(1)

#Vulnerability Index: Composite of low GDP, high population density, and low HAQ
df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])


# Preview key new features
df[['log_gdp_per_capita', 'co2_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'delta_pm25', 'gdp_x_haq', 'vulnerability_index']].head()

Unnamed: 0,log_gdp_per_capita,co2_per_capita,pollution_x_low_haq,pm25_3yr_avg,delta_pm25,gdp_x_haq,vulnerability_index
0,7.485144,3e-06,1e-06,18436.0,,90087.228,1.043238
1,7.470555,3e-06,1e-06,18696.5,521.0,90431.0532,1.035506
2,7.493211,3e-06,1e-06,18994.0,632.0,94191.6288,1.033777
3,7.658251,3e-06,1e-06,19588.666667,631.0,113092.811,0.989745
4,7.873141,3e-06,1e-06,20190.666667,543.0,142684.128,0.967272


In [7]:
all_features = [
    'log_gdp_per_capita', 'log_population_density', 'log_total_co2',
    'co2_per_capita', 'pollution_x_low_haq', 'year_index', 'lagged_dalys',
    'pm25_3yr_avg', 'delta_pm25', 'gdp_x_haq', 'smoking_x_pm25',
    'haq_x_dalys_lag', 'vulnerability_index'
]

structural_features = [f for f in all_features if f not in ['lagged_dalys', 'haq_x_dalys_lag']]

In [10]:
#set Countru and Year as Multi-index

df = df.set_index(['country', 'year'])

In [11]:
#Panel Modeling — Random Effects Approximation (OLS + Clustered SE)
# STEP 6A: Random Effects-style Panel Regression (with Clustered SEs)


# Feature list
candidate_features = [
    'log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'gdp_x_haq', 'smoking_x_pm25', 'haq_x_dalys_lag', 'vulnerability_index'
]

# Prepare panel structure

panel_df = df[candidate_features + ['DALYs']].dropna().copy()
panel_df['const'] = 1

# Remove collinear feature
final_features = candidate_features.copy()
for feature in candidate_features:
    test_features = [f for f in final_features if f != feature]
    X = panel_df[test_features + ['const']]
    if matrix_rank(X.values) == len(test_features) + 1:
        final_features = test_features
        break

# Final regression design
X_final = panel_df[final_features + ['const']]

# VIF check
vif_data = pd.DataFrame()
vif_data["feature"] = final_features
vif_data["VIF"] = [variance_inflation_factor(X_final[final_features].values, i) for i in range(len(final_features))]

# Run PanelOLS
model = PanelOLS(panel_df['DALYs'], X_final, entity_effects=True)
results = model.fit(cov_type='clustered', cluster_entity=True)

# Output
print(results.summary)
print(vif_data)


                          PanelOLS Estimation Summary                           
Dep. Variable:                  DALYs   R-squared:                        0.6891
Estimator:                   PanelOLS   R-squared (Between):              0.6932
No. Observations:                1056   R-squared (Within):               0.6891
Date:                Fri, Apr 18 2025   R-squared (Overall):              0.6935
Time:                        10:31:52   Log-likelihood                -1.083e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      367.94
Entities:                          54   P-value                           0.0000
Avg Obs:                       19.556   Distribution:                   F(6,996)
Min Obs:                       8.0000                                           
Max Obs:                       20.000   F-statistic (robust):             69.879
                            

In [13]:
#Panel Modeling — Random Effects Approximation (OLS + Clustered SE)
# Random Effects-style Panel Regression (with Clustered SEs)

# Step 1: Set up your panel features
panel_features = [
    'log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'gdp_x_haq', 'smoking_x_pm25', 'haq_x_dalys_lag', 'vulnerability_index'
]

# Step 2: Set index but DON'T reference 'country', 'year' as columns
#df = df.set_index(['country', 'year'])

# Step 3: Extract panel data, then bring 'country' back from index for clustering
panel_df = df[panel_features + ['DALYs']].dropna().copy()
panel_df['cluster_group'] = panel_df.index.get_level_values('country')  # <--- this is key!

# Step 4: Define model inputs
X = sm.add_constant(panel_df[panel_features])
y = panel_df['DALYs']

# Step 5: Fit model with cluster-robust SEs
model = sm.OLS(y, X)
re_results = model.fit(cov_type='cluster', cov_kwds={'groups': panel_df['cluster_group']})


# === 6. Print results summary ===
print(re_results.summary())
# 📘 Notes:

# This approximates Random Effects by clustering standard errors by country
# It controls for within-country correlation while pooling across countries
# For true RE, you can use linearmodels.RandomEffects() if you're set up for that

                            OLS Regression Results                            
Dep. Variable:                  DALYs   R-squared:                       0.948
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     169.1
Date:                Fri, 18 Apr 2025   Prob (F-statistic):           2.69e-31
Time:                        10:45:45   Log-Likelihood:                -11922.
No. Observations:                1056   AIC:                         2.386e+04
Df Residuals:                    1049   BIC:                         2.389e+04
Df Model:                           6                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                4.723e+04   2



In [14]:
#First Difference Per Country
# Define the columns
features = [
    'DALYs', 'log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'gdp_x_haq', 'smoking_x_pm25', 'haq_x_dalys_lag', 'vulnerability_index'
]

# Step 1: Clean sub-dataframe

df_sub = df.reset_index()[['country', 'year'] + features].dropna()

# Step 2: First differences per country
diff_rows = []
group_labels = []

for country, group in df_sub.groupby('country'):
    group_sorted = group.sort_values('year')
    diffs = group_sorted[features].diff().dropna()
    diffs['country'] = country
    diff_rows.append(diffs)
    group_labels.extend([country] * len(diffs))

df_diff = pd.concat(diff_rows).reset_index(drop=True)

# Step 3: Set up X and y
X_diff = sm.add_constant(df_diff.drop(columns=['DALYs', 'country']))
y_diff = df_diff['DALYs']
group_var = df_diff['country']

# Step 4: Fit model with clustered standard errors
model = sm.OLS(y_diff, X_diff)
fd_results = model.fit(cov_type='cluster', cov_kwds={'groups': group_var})

# Step 5: Print the summary
print(fd_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  DALYs   R-squared:                       0.298
Model:                            OLS   Adj. R-squared:                  0.293
Method:                 Least Squares   F-statistic:                     10.31
Date:                Fri, 18 Apr 2025   Prob (F-statistic):           6.13e-07
Time:                        10:50:34   Log-Likelihood:                -8952.5
No. Observations:                1002   AIC:                         1.792e+04
Df Residuals:                     995   BIC:                         1.795e+04
Df Model:                           6                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                1045.1614    



##### Panel MOdel for Deaths

In [19]:
# === 1. Load and Prepare Dataset ===
df = pd.read_csv("./Data/merged_burden_risk.csv")
df = df.dropna(subset=['deaths'])  # use exact column name
df.rename(columns={'deaths': 'TARGET'}, inplace=True)
df = df.sort_values(by=['country', 'year'])

In [20]:
# === 2. Feature Engineering (Static) ===
df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
df['log_population_density'] = np.log(df['Population Density'] + 1)
df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)
df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'].fillna(0) / 100)
df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index'].fillna(0)
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_target_lag'] = df['HAQ_Index'].fillna(0) * df.groupby('country')['TARGET'].shift(1)
df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])

In [21]:

# === 3. Select Features and Drop Missing ===
features = [
    'TARGET', 'log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'gdp_x_haq', 'smoking_x_pm25', 'haq_x_target_lag', 'vulnerability_index'
]
df_sub = df[['country', 'year'] + features].dropna()

In [22]:

# === 4. RANDOM EFFECTS-STYLE MODEL (Clustered OLS) ===
X = sm.add_constant(df_sub[['log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
                            'gdp_x_haq', 'smoking_x_pm25', 'haq_x_target_lag', 'vulnerability_index']])
y = df_sub['TARGET']
re_model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': df_sub['country']})
print("🔹 RANDOM EFFECTS MODEL RESULTS FOR DEATHS")
print(re_model.summary())

🔹 RANDOM EFFECTS MODEL RESULTS FOR DEATHS
                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     186.0
Date:                Fri, 18 Apr 2025   Prob (F-statistic):           8.90e-34
Time:                        11:30:52   Log-Likelihood:                -1968.5
No. Observations:                 249   AIC:                             3951.
Df Residuals:                     242   BIC:                             3976.
Df Model:                           6                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------



In [23]:
# === 5. FIRST-DIFFERENCE MODEL ===
diff_rows = []
for country, group in df_sub.groupby('country'):
    group = group.sort_values('year')
    diffs = group[features].diff().dropna()
    diffs['country'] = country
    diff_rows.append(diffs)

df_diff = pd.concat(diff_rows).reset_index(drop=True)
X_diff = sm.add_constant(df_diff[['log_gdp_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
                                  'gdp_x_haq', 'smoking_x_pm25', 'haq_x_target_lag', 'vulnerability_index']])
y_diff = df_diff['TARGET']
fd_model = sm.OLS(y_diff, X_diff).fit(cov_type='cluster', cov_kwds={'groups': df_diff['country']})
print("\n🔹 FIRST-DIFFERENCE MODEL RESULTS FOR DEATHS")
print(fd_model.summary())



🔹 FIRST-DIFFERENCE MODEL RESULTS FOR DEATHS
                            OLS Regression Results                            
Dep. Variable:                 TARGET   R-squared:                       0.469
Model:                            OLS   Adj. R-squared:                  0.452
Method:                 Least Squares   F-statistic:                     5.732
Date:                Fri, 18 Apr 2025   Prob (F-statistic):           0.000306
Time:                        11:30:52   Log-Likelihood:                -1255.2
No. Observations:                 195   AIC:                             2524.
Df Residuals:                     188   BIC:                             2547.
Df Model:                           6                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------

