# XGBoost Regression Model

In [1]:
import kagglehub
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# Regression Classifier.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Statsmodels.
import statsmodels.api as sm
import pandas as pd
import pickle

# XGBoost.
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset.
# Download latest version
path = kagglehub.dataset_download("lashagoch/life-expectancy-who-updated")
print("Path to dataset files:", path)

file = "Life-Expectancy-Data-Updated.csv"

data = pd.read_csv(path + "/" + file)

# FEATURE ENGINEERING
# take log of GDP per capita
data["GDP_per_capita"] = np.log10(data["GDP_per_capita"])

# take average vaccination percentage
data["Vaccination_score"] = (data["Hepatitis_B"] + data["Polio"] + data["Diphtheria"])/3

# Compute Lifestyle Index
BMI_score = 1*(data["BMI"] <= 30)*(data["BMI"] > 25) + 2*(data["BMI"] > 30)
Alcohol_score = 1*(data["Alcohol_consumption"] <= 9.722)*(data["Alcohol_consumption"] > 3.241) + 2*(data["Alcohol_consumption"] > 9.722)
data["Lifestyle_index"] = BMI_score + Alcohol_score

X = data[['Alcohol_consumption', 'Hepatitis_B', 'Measles',
       'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed', 'Vaccination_score', 'Lifestyle_index']]

y = data['Life_expectancy']

# Split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Path to dataset files: /Users/nathanlonghurst/.cache/kagglehub/datasets/lashagoch/life-expectancy-who-updated/versions/1


In [3]:
corrs_clean = data[X.columns.tolist() + ['Life_expectancy']].corr(numeric_only=True)['Life_expectancy'].sort_values(ascending=False)
print(f'corrs:  {corrs_clean}')

corrs:  Life_expectancy                1.000000
GDP_per_capita                 0.795556
Schooling                      0.732484
Polio                          0.641217
Diphtheria                     0.627541
Vaccination_score              0.598929
BMI                            0.598423
Lifestyle_index                0.580995
Economy_status_Developed       0.523791
Measles                        0.490019
Hepatitis_B                    0.417804
Alcohol_consumption            0.399159
Population_mln                 0.026298
Thinness_five_nine_years      -0.458166
Thinness_ten_nineteen_years   -0.467824
Incidents_HIV                 -0.553027
Name: Life_expectancy, dtype: float64


In [4]:
# GridSearch
param_grid = {
    'max_depth': [3, 4, 5, 6, 8], # max depth for a particular branch of tree
    'learning_rate': [0.01, 0.05, 0.1], # learning rate for fitting
    'subsample': [0.7, 0.85, 1.0], # percentage of data to subsample
    'colsample_bytree': [0.6, 0.8, 1.0], # percentage of features to sample for tree
    'n_estimators': [200, 400, 600] # number of boosting rounds
}

grid = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid.fit(X_train, y_train)
print(grid.best_params_)
neg_mean_squared_error = grid.best_score_
print(grid.best_score_)

{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 600, 'subsample': 0.7}
-0.9891425567520109


In [5]:
best_model = grid.best_estimator_
importances = best_model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("order of feature importances")
print(feature_importances)

order of feature importances
Incidents_HIV                  0.548182
GDP_per_capita                 0.178363
Schooling                      0.086768
BMI                            0.037160
Economy_status_Developed       0.031217
Thinness_five_nine_years       0.019495
Polio                          0.018662
Diphtheria                     0.017428
Lifestyle_index                0.016721
Thinness_ten_nineteen_years    0.014648
Population_mln                 0.008754
Vaccination_score              0.008537
Measles                        0.007969
Alcohol_consumption            0.003093
Hepatitis_B                    0.003005
dtype: float32


In [6]:
# Save the model.
with open('XGBoost.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [7]:
with open('XGBoost.pkl', 'rb') as f:
    model = pickle.load(f)

In [9]:
# Predictions.
y_test_pred = model.predict(X_test)

# Metrics.
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Train MSE: {-neg_mean_squared_error}")
print(f'Test MSE:  {test_mse}')

Train MSE: 0.9891425567520109
Test MSE:  0.679438608968662
