In [1]:
import kagglehub
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# Random forest regressor, train/test splitting, metrics, and grid search.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pandas as pd
import pickle

# Statsmodels
import statsmodels.api as sm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the dataset through kagglehub (latest version). The call returns the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download("lashagoch/life-expectancy-who-updated")
print("Path to dataset files:", path)

file = "Life-Expectancy-Data-Updated.csv"
data = pd.read_csv(f"{path}/{file}")

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------

# GDP per capita is overwritten, under the same column name, by its base-10 log.
data["GDP_per_capita"] = np.log10(data["GDP_per_capita"])

# Vaccination score: plain average of the three vaccination columns.
vaccination_total = data["Hepatitis_B"] + data["Polio"] + data["Diphtheria"]
data["Vaccination_score"] = vaccination_total / 3

# Lifestyle index: each component scores 0, 1 or 2 depending on how many of
# its two thresholds the value strictly exceeds, and the two scores are summed.
#   BMI:     0 if <= 25,    1 if in (25, 30],       2 if > 30
#   Alcohol: 0 if <= 3.241, 1 if in (3.241, 9.722], 2 if > 9.722
# NOTE(review): the origin of the alcohol cut-offs is not shown here — confirm.
bmi = data["BMI"]
alcohol = data["Alcohol_consumption"]
BMI_score = (bmi > 25).astype(int) + (bmi > 30).astype(int)
Alcohol_score = (alcohol > 3.241).astype(int) + (alcohol > 9.722).astype(int)
data["Lifestyle_index"] = BMI_score + Alcohol_score

# Predictors (raw columns plus the two engineered ones) and the target.
feature_columns = [
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    'Economy_status_Developed',
    'Vaccination_score',
    'Lifestyle_index',
]
X = data[feature_columns]
y = data['Life_expectancy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Path to dataset files: /Users/nathanlonghurst/.cache/kagglehub/datasets/lashagoch/life-expectancy-who-updated/versions/1


In [3]:
# Tune a random-forest regressor with an exhaustive grid search.
#
# This cell uses X_train / X_test / y_train / y_test from the data-loading
# cell. The split is not repeated here: calling train_test_split again with
# the same inputs and seed would only reproduce the same partition.

# Base estimator. oob_score=True makes the fitted forest expose `oob_score_`,
# which scikit-learn computes with R^2 by default (not an MSE).
model = RandomForestRegressor(oob_score=True, random_state=42)

# Number of features, i.e. columns of the training matrix.
d = X_train.shape[1]

# Hyperparameter values to search over.
# NOTE(review): for small feature counts 'log2' and 'sqrt' may resolve to the
# same number of features as the explicit 3, which would make those grid
# points redundant — confirm before trimming the grid.
param_grid = {
    "max_leaf_nodes": [30, 40, 50],         # max number of leaf nodes per tree
    "n_estimators": [50, 100, 150, 200],    # number of trees
    "max_depth": [5, 10, 15],               # maximum depth of any branch
    "min_samples_leaf": [20, 50, 100],      # minimum number of samples in a leaf
    # number of features considered when looking for the best split
    "max_features": [1, 3, int(np.floor(np.log2(d) + 1)), 'log2', 'sqrt'],
}

# Grid search with the default cross-validation scheme. The scoring is negated
# MSE, so values are <= 0 and closer to zero is better.
model_gs = GridSearchCV(model, param_grid, scoring="neg_mean_squared_error", n_jobs=-1)

model_gs.fit(X_train, y_train)

# Importances of the refitted best forest, and the indices that sort them in
# ascending order (reversed below to print the largest first).
best_forest = model_gs.best_estimator_
features = best_forest.feature_importances_
sorted_features = np.argsort(features)

print(f"best_parameters: {model_gs.best_params_}")
# best_score_ is the mean cross-validated score (negated MSE) of the best
# parameter combination, not a score on the full training set.
print(f"best train score: {model_gs.best_score_}")
# Out-of-bag R^2 of the refitted forest; not comparable to the MSE-based scores.
print(f"best oob score: {best_forest.oob_score_}")
# Importance values only, largest first. The matching column names are
# X_train.columns[sorted_features[::-1]].
print(f"most important features: {features[sorted_features[::-1]]}")

# Negated test MSE, for comparison with the CV score above. Arguments follow
# the documented (y_true, y_pred) order; MSE is symmetric, so the value is
# unchanged.
print(f"test score: {-mean_squared_error(y_test, best_forest.predict(X_test))}")

best_parameters: {'max_depth': 15, 'max_features': 4, 'max_leaf_nodes': 50, 'min_samples_leaf': 20, 'n_estimators': 150}
best train score: -7.884372478888314
best oob score: 0.9230111002817553
most important features: [0.25424756 0.17390566 0.13589102 0.12493669 0.07335954 0.04329355
 0.03747066 0.03722645 0.03357138 0.02685727 0.01768385 0.01268915
 0.01175622 0.00857345 0.00853756]
test score: -6.853149954739142


In [4]:
# Persist the best estimator found by the grid search to a pickle file in the
# current working directory.
best_forest = model_gs.best_estimator_
with open('RandomForestRegressor.pkl', 'wb') as model_file:
    pickle.dump(best_forest, model_file)

In [5]:
# Read the pickled forest back from disk.
# Caution: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source (such as the one this notebook writes).
with open('RandomForestRegressor.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [6]:
# Score the reloaded model on the held-out test set and report its MSE.
y_test_hat = loaded_model.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_test_hat)
print(f"mean squared error: {error}")

mean squared error: 6.853149954739142
