In [None]:
# 📊 COPD Advanced Feature Engineering Pipeline

import pandas as pd
import numpy as np

# Read the source CSV into the working DataFrame used by the cells below
df = pd.read_csv(filepath_or_buffer="/mnt/data/merged_burden_risk.csv")



In [None]:
# Rename the long DALYs target column to 'DALYs', drop rows whose target is
# missing, and sort by country and year for the time-aware transformations
# in later cells.
#
# The rename runs first on purpose: `rename` ignores a missing source label by
# default, so re-running this cell on the already-renamed frame is harmless.
# Calling `dropna(subset=[<old name>])` first would raise a KeyError on re-run.
df = (
    df
    .rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'})
    .dropna(subset=['DALYs'])
    .sort_values(by=['country', 'year'])
)



In [None]:
# --- ✨ BASIC FEATURE TRANSFORMATIONS ---

# 🔁 Log transforms: log(x + 1) of the skewed economic / emission columns
for target_col, source_col in [
    ('log_gdp_per_capita', 'GDP PER CAPITA (USD)'),
    ('log_population_density', 'Population Density'),
    ('log_total_co2', 'Total CO2 Emission excluding LUCF (Mt)'),
]:
    df[target_col] = np.log(df[source_col] + 1)

# 👤 Per-capita pollution: each pollutant column divided by Population
for target_col, source_col in [
    ('co2_per_capita', 'Total CO2 Emission excluding LUCF (Mt)'),
    ('no2_per_capita', 'Nitrogen Oxide'),
    ('black_carbon_per_capita', 'Black Carbon'),
]:
    df[target_col] = df[source_col] / df['Population']

# 🏥 CO2 per capita weighted by (1 - HAQ/100); a missing HAQ_Index is treated as 0
haq_fraction = df['HAQ_Index'].fillna(0) / 100
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - haq_fraction)

# ⏳ Year offset from the earliest year present in the data
first_year = df['year'].min()
df['year_index'] = df['year'] - first_year

# 📉 DALYs of the preceding row within each country
# NOTE(review): this is "previous year" only if each country has exactly one row
# per consecutive year and df is sorted by country/year — confirm against the data.
df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(periods=1)




In [None]:
# --- 🔬 ADVANCED FEATURE ENGINEERING ---

def _rolling_mean_3(series):
    """Mean over a trailing window of up to three rows (one row is enough)."""
    return series.rolling(window=3, min_periods=1).mean()


def _min_max(series):
    """Rescale a series as (value - min) / (max - min)."""
    return (series - series.min()) / (series.max() - series.min())


# 📊 Rolling averages within each country
for target_col, source_col in [('pm25_3yr_avg', 'pm25_DALY'), ('dalys_3yr_avg', 'DALYs')]:
    df[target_col] = df.groupby('country')[source_col].transform(_rolling_mean_3)

# 📈 Row-over-row change within each country
for target_col, source_col in [('delta_pm25', 'pm25_DALY'), ('delta_black_carbon', 'Black Carbon')]:
    df[target_col] = df.groupby('country')[source_col].diff()

# 🧮 Interaction terms; a missing HAQ_Index is treated as 0 in the products
haq_filled = df['HAQ_Index'].fillna(0)
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * haq_filled
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_dalys_lag'] = haq_filled * df['lagged_dalys']

# ⚠️ Vulnerability index: per-year min-max scaled components, combined so that
# low GDP, high density and low HAQ all push the index up
for target_col, source_col in [
    ('norm_gdp', 'GDP PER CAPITA (USD)'),
    ('norm_density', 'Population Density'),
    ('norm_haq', 'HAQ_Index'),
]:
    df[target_col] = df.groupby('year')[source_col].transform(_min_max)

low_gdp = 1 - df['norm_gdp']
low_haq = 1 - df['norm_haq']
df['vulnerability_index'] = low_gdp + df['norm_density'] + low_haq


# Preview a selection of the newly created columns
preview_cols = [
    'log_gdp_per_capita', 'co2_per_capita', 'pollution_x_low_haq', 'pm25_3yr_avg',
    'delta_pm25', 'gdp_x_haq', 'vulnerability_index',
]
df[preview_cols].head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Reload the raw CSV and rebuild every feature inside this cell, so the model
# below does not depend on state left behind by earlier cells.
df = (
    pd.read_csv("/mnt/data/merged_burden_risk.csv")
    .dropna(subset=['dalys_(disability-adjusted_life_years)'])
    .rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'})
    .sort_values(by=['country', 'year'])
)

# Feature engineering (same formulas as the feature-engineering cells above)
df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
df['log_population_density'] = np.log(df['Population Density'] + 1)
df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)
df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'].fillna(0) / 100)
df['year_index'] = df['year'] - df['year'].min()
df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(1)
df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df['delta_pm25'] = df.groupby('country')['pm25_DALY'].diff()
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index'].fillna(0)
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_dalys_lag'] = df['HAQ_Index'].fillna(0) * df['lagged_dalys']
df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])

# Predictor columns fed to the model
feature_cols = [
    'log_gdp_per_capita', 'log_population_density', 'log_total_co2',
    'co2_per_capita', 'pollution_x_low_haq', 'year_index', 'lagged_dalys',
    'pm25_3yr_avg', 'delta_pm25', 'gdp_x_haq', 'smoking_x_pm25',
    'haq_x_dalys_lag', 'vulnerability_index'
]

# Keep only rows where every predictor and the target are present
df_model = df.dropna(subset=feature_cols + ['DALYs']).copy()

# Define input and output
X = df_model[feature_cols]
y = df_model['DALYs']

# Split BEFORE scaling so the scaler's mean/std come from training rows only.
# Fitting the scaler on all rows first would leak test-set statistics into
# preprocessing. The same random_state keeps the row assignment of the split
# unchanged.
# NOTE(review): the split is random across rows, so later years of a country can
# end up in train while earlier years are in test; with lagged_dalys as a feature
# a time-based split may be more appropriate — confirm intent.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that refers to the full scaled matrix
X_scaled = scaler.transform(X)

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Evaluate on the held-out rows
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Package results
results = pd.DataFrame({
    'Model': ['Linear Regression'],
    'R²': [r2],
    'MAE': [mae],
    'RMSE': [rmse]
})

# `ace_tools` is not a standard package and may be absent from a local kernel;
# when it is missing, fall through to the plain rich display of `results` below.
try:
    import ace_tools as tools
except ImportError:
    tools = None
if tools is not None:
    tools.display_dataframe_to_user(name="Linear Regression Performance", dataframe=results)

results


In [None]:
# Import necessary libraries for extended model training
from sklearn.linear_model import Ridge, Lasso, QuantileRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold

# Prepare inputs from the previous cell (df_model, feature_cols and scaler must exist)
df = df_model.copy()
X = df[feature_cols]
y = df['DALYs']

# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not influence the scaling statistics. The same random_state keeps the
# row assignment of the split unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that refers to the full scaled matrix
X_scaled = scaler.transform(X)

# Candidate models, keyed by the label shown in the results table
models = {
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Quantile Regression (median)': QuantileRegressor(quantile=0.5, alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Fit each model on the training rows and score it on the held-out rows
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results.append({
        'Model': name,
        'R²': r2,
        'MAE': mae,
        'RMSE': rmse
    })

# One row per model, best R² first
results_df = pd.DataFrame(results).sort_values(by='R²', ascending=False)

# `ace_tools` is not a standard package and may be absent from a local kernel;
# when it is missing, fall through to the plain rich display of `results_df` below.
try:
    import ace_tools as tools
except ImportError:
    tools = None
if tools is not None:
    tools.display_dataframe_to_user(name="Expanded Model Comparison", dataframe=results_df)

results_df


In [None]:
import shap
import matplotlib.pyplot as plt

# Fit a fresh XGBoost regressor on the current train split
# (same hyperparameters as the 'XGBoost' entry of the model comparison)
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Build a SHAP explainer, passing X_train as its second argument, and explain the test rows
explainer = shap.Explainer(xgb_model, X_train)
shap_values = explainer(X_test)

# Global SHAP summary plot; show=False defers rendering so the layout can be tightened first
plt.figure(figsize=(12, 6))
shap.summary_plot(
    shap_values,
    features=X_test,
    feature_names=feature_cols,
    show=False,
)
plt.tight_layout()
plt.show()


In [None]:
# Re-import required libraries after kernel reset
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the raw CSV, drop rows without a DALYs target, shorten the target column
# name, and sort by country/year so the per-country lag/rolling/diff features
# below see rows in time order.
df = (
    pd.read_csv("/mnt/data/merged_burden_risk.csv")
    .dropna(subset=['dalys_(disability-adjusted_life_years)'])
    .rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'})
    .sort_values(by=['country', 'year'])
)

# Feature engineering (reapply transformations)
df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
df['log_population_density'] = np.log(df['Population Density'] + 1)
df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)
df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'].fillna(0) / 100)
df['year_index'] = df['year'] - df['year'].min()
df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(1)
df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df['delta_pm25'] = df.groupby('country')['pm25_DALY'].diff()
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index'].fillna(0)
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_dalys_lag'] = df['HAQ_Index'].fillna(0) * df['lagged_dalys']
df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])

# Define model features and target
feature_cols = [
    'log_gdp_per_capita', 'log_population_density', 'log_total_co2',
    'co2_per_capita', 'pollution_x_low_haq', 'year_index', 'lagged_dalys',
    'pm25_3yr_avg', 'delta_pm25', 'gdp_x_haq', 'smoking_x_pm25',
    'haq_x_dalys_lag', 'vulnerability_index'
]

# Keep only rows where every predictor and the target are present
df_model = df.dropna(subset=feature_cols + ['DALYs']).copy()
X = df_model[feature_cols]
y = df_model['DALYs']

# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not influence the scaling statistics. The same random_state keeps the
# row assignment of the split unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that refers to the full scaled matrix
X_scaled = scaler.transform(X)

# Train XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Apply SHAP: explainer built with X_train as its second argument, evaluated on the test rows
explainer = shap.Explainer(xgb_model, X_train)
shap_values = explainer(X_test)

# SHAP summary plot; show=False defers rendering so the layout can be tightened first.
# Feature values passed to the plot are the scaled ones, not original units.
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values, features=X_test, feature_names=feature_cols, show=False)
plt.tight_layout()
plt.show()


In [None]:
# Re-import essentials after kernel error
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the raw CSV, drop rows without a DALYs target, shorten the target column
# name, and sort by country/year so the per-country lag/rolling/diff features
# below see rows in time order.
df = (
    pd.read_csv("/mnt/data/merged_burden_risk.csv")
    .dropna(subset=['dalys_(disability-adjusted_life_years)'])
    .rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'})
    .sort_values(by=['country', 'year'])
)

# Feature engineering
df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
df['log_population_density'] = np.log(df['Population Density'] + 1)
df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)
df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'].fillna(0) / 100)
df['year_index'] = df['year'] - df['year'].min()
df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(1)
df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
df['delta_pm25'] = df.groupby('country')['pm25_DALY'].diff()
df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index'].fillna(0)
df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
df['haq_x_dalys_lag'] = df['HAQ_Index'].fillna(0) * df['lagged_dalys']
df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])

# Define model features and target
feature_cols = [
    'log_gdp_per_capita', 'log_population_density', 'log_total_co2',
    'co2_per_capita', 'pollution_x_low_haq', 'year_index', 'lagged_dalys',
    'pm25_3yr_avg', 'delta_pm25', 'gdp_x_haq', 'smoking_x_pm25',
    'haq_x_dalys_lag', 'vulnerability_index'
]

# Keep only rows where every predictor and the target are present
df_model = df.dropna(subset=feature_cols + ['DALYs']).copy()
X = df_model[feature_cols]
y = df_model['DALYs']

# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not influence the scaling statistics. The same random_state keeps the
# row assignment of the split unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that refers to the full scaled matrix
X_scaled = scaler.transform(X)

# Train XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Use TreeExplainer directly instead of SHAP's general Explainer entry point
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)

# SHAP summary plot; show=False defers rendering so the layout can be tightened first.
# Feature values passed to the plot are the scaled ones, not original units.
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values, features=X_test, feature_names=feature_cols, show=False)
plt.tight_layout()
plt.show()
