# Modular COPD Modeling Pipeline
This notebook is a refactored version of the full pipeline. Each step is wrapped in modular functions, cleaned of redundancy, and ready for reuse or publication.

In [1]:
def load_and_clean_data(path='merged_burden_risk.csv'):
    import pandas as pd
    df = pd.read_csv(path)
    df = df.dropna(subset=['dalys_(disability-adjusted_life_years)'])
    df.rename(columns={'dalys_(disability-adjusted_life_years)': 'DALYs'}, inplace=True)
    df = df.sort_values(by=['country', 'year'])
    return df

In [2]:
def apply_basic_features(df):
    import numpy as np
    df['log_gdp_per_capita'] = np.log(df['GDP PER CAPITA (USD)'] + 1)
    df['log_population_density'] = np.log(df['Population Density'] + 1)
    df['log_total_co2'] = np.log(df['Total CO2 Emission excluding LUCF (Mt)'] + 1)
    df['co2_per_capita'] = df['Total CO2 Emission excluding LUCF (Mt)'] / df['Population']
    df['no2_per_capita'] = df['Nitrogen Oxide'] / df['Population']
    df['black_carbon_per_capita'] = df['Black Carbon'] / df['Population']
    df['pollution_x_low_haq'] = df['co2_per_capita'] * (1 - df['HAQ_Index'].fillna(0) / 100)
    df['year_index'] = df['year'] - df['year'].min()
    df['lagged_dalys'] = df.groupby('country')['DALYs'].shift(1)
    return df

In [3]:
def apply_advanced_features(df):
    df['pm25_3yr_avg'] = df.groupby('country')['pm25_DALY'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
    df['dalys_3yr_avg'] = df.groupby('country')['DALYs'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
    df['delta_pm25'] = df.groupby('country')['pm25_DALY'].diff()
    df['delta_black_carbon'] = df.groupby('country')['Black Carbon'].diff()
    df['gdp_x_haq'] = df['GDP PER CAPITA (USD)'] * df['HAQ_Index'].fillna(0)
    df['smoking_x_pm25'] = df['smoking_DALY'] * df['pm25_DALY']
    df['haq_x_dalys_lag'] = df['HAQ_Index'].fillna(0) * df['lagged_dalys']
    df['norm_gdp'] = df.groupby('year')['GDP PER CAPITA (USD)'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
    df['norm_density'] = df.groupby('year')['Population Density'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
    df['norm_haq'] = df.groupby('year')['HAQ_Index'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
    df['vulnerability_index'] = (1 - df['norm_gdp']) + df['norm_density'] + (1 - df['norm_haq'])
    return df

In [4]:
def prepare_model_data(df):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    feature_cols = ['log_gdp_per_capita', 'log_population_density', 'log_total_co2',
                    'co2_per_capita', 'pollution_x_low_haq', 'year_index', 'lagged_dalys',
                    'pm25_3yr_avg', 'delta_pm25', 'gdp_x_haq', 'smoking_x_pm25',
                    'haq_x_dalys_lag', 'vulnerability_index']
    df_model = df.dropna(subset=feature_cols + ['DALYs']).copy()
    X = df_model[feature_cols]
    y = df_model['DALYs']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return train_test_split(X_scaled, y, test_size=0.2, random_state=42), feature_cols

In [5]:
def evaluate_models(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import Ridge, Lasso, QuantileRegressor
    from sklearn.ensemble import RandomForestRegressor
    import xgboost as xgb
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    import pandas as pd
    import numpy as np

    models = {
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.1),
        'Quantile Regression (median)': QuantileRegressor(quantile=0.5, alpha=0.1),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    }

    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        results.append({
            'Model': name,
            'R²': r2_score(y_test, preds),
            'MAE': mean_absolute_error(y_test, preds),
            'RMSE': np.sqrt(mean_squared_error(y_test, preds))
        })

    return pd.DataFrame(results).sort_values(by='R²', ascending=False)

In [6]:
def shap_explain(xgb_model, X_train, X_test, feature_cols):
    import shap
    import matplotlib.pyplot as plt
    explainer = shap.Explainer(xgb_model, X_train)
    shap_values = explainer(X_test)
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values, features=X_test, feature_names=feature_cols, show=False)
    plt.tight_layout()
    plt.show()

In [7]:
def ridge_lasso_coefficients(X_train, y_train, feature_cols):
    from sklearn.linear_model import Ridge, Lasso
    import pandas as pd
    ridge = Ridge(alpha=1.0).fit(X_train, y_train)
    lasso = Lasso(alpha=0.1).fit(X_train, y_train)
    ridge_coef = pd.Series(ridge.coef_, index=feature_cols)
    lasso_coef = pd.Series(lasso.coef_, index=feature_cols)
    return pd.DataFrame({
        'Ridge Coefficient': ridge_coef,
        'Lasso Coefficient': lasso_coef
    }).sort_values(by='Ridge Coefficient', key=abs, ascending=False)

In [8]:
# Run full pipeline
df = load_and_clean_data()
df = apply_basic_features(df)
df = apply_advanced_features(df)
(X_train, X_test, y_train, y_test), feature_cols = prepare_model_data(df)
model_results = evaluate_models(X_train, X_test, y_train, y_test)
print(model_results)

# SHAP Explainability for XGBoost
import xgboost as xgb
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
shap_explain(xgb_model, X_train, X_test, feature_cols)

# Ridge vs Lasso
coeffs = ridge_lasso_coefficients(X_train, y_train, feature_cols)
coeffs

ModuleNotFoundError: No module named 'xgboost'