# XGBoost Regression Model

In [34]:
import kagglehub
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# Random forest regressor, train/test splitting, and metrics.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Statsmodels.
import statsmodels.api as sm
import pandas as pd

# XGBoost.
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load dataset.
# Download the latest version of the WHO life-expectancy dataset; the call
# returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("lashagoch/life-expectancy-who-updated")
print("Path to dataset files:", path)

file = "Life-Expectancy-Data-Updated.csv"
data = pd.read_csv(f"{path}/{file}")

# TODO:  Correct features and assign target.
# Every column that is neither the target nor listed in features_to_drop is
# used as a model feature.
# NOTE(review): the saved outputs further down still list 'Year' among the
# features, so they appear to predate adding 'Year' to this list -- re-run
# the notebook from a fresh kernel to refresh them.
target = 'Life_expectancy'
features_to_drop = ['Infant_deaths', 'Under_five_deaths', 'Adult_mortality', 'Country', 'Region', 'Year']

y = data[target]
X = data.drop(columns=[*features_to_drop, target])

# Split data into training and testing sets (80/20, fixed seed).
# NOTE(review): 'Country' is dropped and rows are shuffled at random; if the
# data holds several rows per country (e.g. one per year), the same country
# can land in both splits and inflate the test score -- TODO confirm and
# consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Path to dataset files: C:\Users\valeh\.cache\kagglehub\datasets\lashagoch\life-expectancy-who-updated\versions\1


In [32]:
# Baseline XGBoost regressor with hand-picked (untuned) hyperparameters.
baseline_params = {
    'n_estimators': 100,      # Number of boosted trees.
    'learning_rate': 0.1,     # Shrinkage (eta).
    'max_depth': 6,           # Tree depth.
    'subsample': 1.0,         # Row sampling.
    'colsample_bytree': 1.0,  # Feature sampling.
    'random_state': 0,
}
model = XGBRegressor(**baseline_params)

# Train on the training split, then score predictions on the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Correlation of each model feature with the target, strongest positive first.
# The target itself is included, so its own row is always 1.0.
corr_columns = X.columns.tolist() + ['Life_expectancy']
corr_matrix = data[corr_columns].corr(numeric_only=True)
corrs_clean = corr_matrix['Life_expectancy'].sort_values(ascending=False)
print(f'corrs:  {corrs_clean}')

print(f'mse:  {mse}')
print(f'rmse:  {rmse}')
print(f'r2:  {r2}')

corrs:  Life_expectancy                1.000000
Schooling                      0.732484
Polio                          0.641217
Diphtheria                     0.627541
BMI                            0.598423
GDP_per_capita                 0.583090
Economy_status_Developed       0.523791
Measles                        0.490019
Hepatitis_B                    0.417804
Alcohol_consumption            0.399159
Year                           0.174359
Population_mln                 0.026298
Thinness_five_nine_years      -0.458166
Thinness_ten_nineteen_years   -0.467824
Economy_status_Developing     -0.523791
Incidents_HIV                 -0.553027
Name: Life_expectancy, dtype: float64
mse:  1.2271328881338635
rmse:  1.1077603026529987
r2:  0.9852136993849238


In [39]:
# Exhaustive hyperparameter search with 5-fold cross-validation on the
# training split. The grid has 5 * 3 * 3 * 3 * 3 = 405 combinations, i.e.
# 2025 model fits before the final refit, so this cell is slow.
param_grid = dict(
    max_depth=[3, 4, 5, 6, 8],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.85, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    n_estimators=[200, 400, 600],
)

grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Higher (closer to 0) is better.
    cv=5,
    n_jobs=-1,                         # Use all available cores.
)
grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated *negative* MSE of the winning
# combination, so a value of -0.96 means an MSE of 0.96.
print(grid.best_params_)
print(grid.best_score_)

{'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 600, 'subsample': 0.7}
-0.958761368976085
Incidents_HIV                  0.500079
GDP_per_capita                 0.208680
Schooling                      0.092864
BMI                            0.053984
Economy_status_Developed       0.025031
Thinness_five_nine_years       0.023524
Diphtheria                     0.023151
Polio                          0.020368
Thinness_ten_nineteen_years    0.014091
Population_mln                 0.010734
Measles                        0.010068
Year                           0.006425
Hepatitis_B                    0.005701
Alcohol_consumption            0.005300
Economy_status_Developing      0.000000
dtype: float32


In [43]:
# Feature importances of the tuned model.
# best_estimator_ is the estimator GridSearchCV refit on all of X_train with
# the winning parameter combination (it was built with random_state=42).
best_model = grid.best_estimator_
importances = best_model.feature_importances_
feature_importances = (
    pd.Series(importances, index=X_train.columns)
    .sort_values(ascending=False)
)
print(feature_importances)

# Construct the final XGBoost regression model from the tuned hyperparameters.
# The values are taken from grid.best_params_ instead of being copied by hand
# from the printed search result, so this cell cannot drift out of sync when
# the grid, the data, or the feature list changes.
# NOTE: this model uses random_state=0 while the grid-search estimator used
# random_state=42, so the importances printed above belong to a differently
# seeded fit than the model evaluated below.
model = XGBRegressor(**grid.best_params_, random_state=0)

# Fit model and predict results.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics for the tuned model; they are stored but not printed here.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

Incidents_HIV                  0.500079
GDP_per_capita                 0.208680
Schooling                      0.092864
BMI                            0.053984
Economy_status_Developed       0.025031
Thinness_five_nine_years       0.023524
Diphtheria                     0.023151
Polio                          0.020368
Thinness_ten_nineteen_years    0.014091
Population_mln                 0.010734
Measles                        0.010068
Year                           0.006425
Hepatitis_B                    0.005701
Alcohol_consumption            0.005300
Economy_status_Developing      0.000000
dtype: float32


In [46]:
# Compare fit quality on the training split against the held-out split; a
# large gap between the two indicates overfitting.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# RMSE is the square root of the mean squared error.
train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# The MSE rows are recovered by squaring the RMSE values.
report = {
    'Train RMSE': train_rmse,
    'Test RMSE': test_rmse,
    'Train MSE': train_rmse**2,
    'Test MSE': test_rmse**2,
    'Train R2': train_r2,
    'Test R2': test_r2,
}
for label, value in report.items():
    print(f'{label}:  {value}')

Train RMSE:  0.11358688578523468
Test RMSE:  0.7458051116402588
Train MSE:  0.012901980622387949
Test MSE:  0.5562252645487388
Train R2:  0.9998562912330347
Test R2:  0.9932977805005087
