In [1]:
from __future__ import print_function
from packaging.version import parse as Version
from platform import python_version

import sys  # required by the failure message in the except branch below

# ANSI-coloured status tags (green / red background) prefixed to every check line.
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = "\x1b[41m[FAIL]\x1b[0m"

# importlib is used by import_version() to load packages by name. If it cannot
# be imported, report the interpreter version instead of failing silently.
try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.11 is required,"
                " but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and print whether its version matches ``min_ver``.

    One status line is printed per call: OK when the installed version equals
    ``min_ver``, FAIL when it differs or when the package cannot be imported.

    Parameters
    ----------
    pkg : str
        Importable module name (e.g. ``'numpy'``).
    min_ver : str
        Required version. The comparison is an exact equality check, so this
        behaves as a pin rather than a lower bound.
    fail_msg : str, optional
        Extra hint appended to the "not installed" message.

    Returns
    -------
    module or None
        The imported module, or ``None`` if the import failed.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
        if pkg in {'PIL'}:
            # Prefer the legacy VERSION attribute when present; newer Pillow
            # releases only expose __version__, so fall back to it.
            ver = getattr(mod, 'VERSION', None) or mod.__version__
        else:
            ver = mod.__version__
        # Report using the function's own argument, not a caller-side global.
        if Version(ver) == Version(min_ver):
            print(OK, "%s version %s is installed."
                  % (pkg, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (pkg, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# --- Interpreter check -------------------------------------------------------
pyversion = Version(python_version())

if pyversion < Version("3.11"):
    # Anything older than the 3.11 series is rejected outright.
    print(FAIL, "Python version 3.11 is required, but %s is installed." % pyversion)
elif pyversion >= Version("3.11.4"):
    print(OK, "Python version is %s" % pyversion)
else:
    # Remaining case: at least 3.11 but below 3.11.4.
    print(FAIL, "Unknown Python version: %s" % pyversion)

print()

# --- Package checks ----------------------------------------------------------
# Exact versions expected for each dependency (checked in this order).
requirements = {
    'numpy': "1.24.4",
    'matplotlib': "3.7.2",
    'sklearn': "1.3.0",
    'pandas': "2.0.3",
    'xgboost': "1.7.6",
    'shap': "0.42.1",
    'seaborn': "0.12.2",
}

for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.11.4

[42m[ OK ][0m numpy version 1.24.4 is installed.
[42m[ OK ][0m matplotlib version 3.7.2 is installed.
[42m[ OK ][0m sklearn version 1.3.0 is installed.
[42m[ OK ][0m pandas version 2.0.3 is installed.
[42m[ OK ][0m xgboost version 1.7.6 is installed.


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


[42m[ OK ][0m shap version 0.42.1 is installed.
[42m[ OK ][0m seaborn version 0.12.2 is installed.


In [2]:
# Data Handling and Array Manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# Preprocessing and Pipeline Tools
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Machine Learning Models
import xgboost as xgb
from xgboost import XGBClassifier

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, fbeta_score, auc,
    precision_recall_curve, average_precision_score
)

# Miscellaneous
import warnings
warnings.filterwarnings('ignore')
import pickle

In [18]:
# Load the cleaned CVD survey data.
df = pd.read_csv('data/CVD_cleaned.csv')

# Seed used for the subset split below.
random_state = 56

# Features are every column except the target.
X = df.drop(labels=['Heart_Disease'], axis=1)

# Encode the target as 0/1. Series.map turns any label missing from the
# mapping into NaN, so verify that nothing slipped through.
label_mapping = {'No': 0, 'Yes': 1}
y = df['Heart_Disease'].map(label_mapping)
assert y.notna().all(), "unexpected Heart_Disease label not covered by label_mapping"

# Stratified 1% sample of the data (the remaining 99% is discarded here).
X_subset, _, y_subset, _ = train_test_split(
    X, y, test_size=0.99, stratify=y, random_state=random_state)

print("Full Set")
print('X:',X.shape)
print('y:',y.shape)
print('X_subset:',X_subset.shape)
print('y_subset:',y_subset.shape)

# ---- Feature groups, one per encoding strategy ------------------------------
# Continuous columns -> standardised
num_ftrs = [
    'Height_(cm)', 'Weight_(kg)', 'BMI',
    'Alcohol_Consumption', 'Fruit_Consumption',
    'Green_Vegetables_Consumption', 'FriedPotato_Consumption',
]

# Nominal columns -> one-hot encoded
cat_ftrs = [
    'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression',
    'Diabetes', 'Arthritis', 'Sex', 'Smoking_History',
]

# Ordered columns -> integer codes following the category order given below
ordinal_ftrs = ['General_Health', 'Age_Category']
ordinal_cats = [
    # General_Health, worst to best
    ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
    # Age_Category, youngest to oldest
    ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
     '55-59', '60-64', '65-69', '70-74', '75-79', '80+'],
]

# ---- Per-group transformers --------------------------------------------------
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=ordinal_cats)),
])

# ---- Combine: each transformer is applied to its own column group ------------
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_ftrs),
    ('cat', categorical_transformer, cat_ftrs),
    ('ord', ordinal_transformer, ordinal_ftrs),
])

final_scaler = StandardScaler()
prep = Pipeline(steps=[('preprocessor', preprocessor)])

Full Set
X: (308854, 18)
y: (308854,)
X_subset: (3088, 18)
y_subset: (3088,)


In [4]:
# Count each class by label instead of relying on the frequency ordering of
# value_counts() to decide which count is which.
class_counts = df['Heart_Disease'].value_counts()
N, Y = int(class_counts['No']), int(class_counts['Yes'])

# Share of 'Yes' rows in the full dataset.
# NOTE(review): this is the positive-class prevalence, not the accuracy of a
# majority-class predictor (which would be N/(N+Y)). Name and message are kept
# as-is because later cells read `baseline_accuracy` - confirm the intended baseline.
baseline_accuracy = Y/(N+Y)
print('baseline accuracy score for the entire dataset is', baseline_accuracy)

baseline accuracy score for the entire dataset is 0.08085049894124732


### XGBoost Classifier

In [5]:
nr_states = 5

# Accumulators filled by the loop below:
#   xgb_scores    - one metrics dict per (random state, split)
#   xgb_models    - best estimator found for each random state
#   xgb_test_sets - preprocessed test features + labels for each random state
xgb_scores = []
xgb_models = []
xgb_test_sets = []

# Hyper-parameter grid, identical for every random state (3 * 3 * 3 * 4 = 108 candidates)
param_grid = {
    'n_estimators': [100, 125, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 10, 20, 50]
}

for i in range(nr_states):
    print('Random state', i + 1)

    # Stratified 60/20/20 train/validation/test split, seeded per iteration
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, stratify=y, train_size=0.6, random_state=56*i)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, test_size=0.5, stratify=y_other, random_state=56*i)
    print(X_test.shape)

    # Fit the preprocessor on the training split only, then reuse it for the others.
    # NOTE: the resulting frames get a fresh 0..n-1 index, while y_train/y_val/y_test
    # keep the row labels of the original data - pair them by position, not by label.
    train_values = preprocessor.fit_transform(X_train)
    feature_names = preprocessor.get_feature_names_out()
    df_train = pd.DataFrame(train_values, columns=feature_names)
    df_val = pd.DataFrame(preprocessor.transform(X_val), columns=feature_names)
    df_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

    # Grid search over the XGBoost classifier, selecting on recall; the
    # validation split is passed as eval_set for early stopping.
    XGB = xgb.XGBClassifier(early_stopping_rounds=50)
    grid = GridSearchCV(XGB, param_grid, scoring='recall', cv=3, verbose=1, n_jobs=-1)
    grid.fit(df_train, y_train, eval_set=[(df_val, y_val)], verbose=False)

    best_model = grid.best_estimator_
    xgb_models.append(best_model)

    # Score the winning model on each split
    splits = [(df_train, y_train, 'train'), (df_val, y_val, 'validation'), (df_test, y_test, 'test')]
    for X_save, y_save, dataset_name_save in splits:
        y_pred = best_model.predict(X_save)
        # Area under the precision-recall curve from the positive-class probabilities
        precision, recall, _ = precision_recall_curve(y_save, best_model.predict_proba(X_save)[:, 1])
        auc_pr = auc(recall, precision)
        score = {
            'dataset': dataset_name_save,
            'state': i + 1,
            'precision': precision_score(y_save, y_pred),
            'recall': recall_score(y_save, y_pred),
            'f2': fbeta_score(y_save, y_pred, beta=2),
            'auc-pr': auc_pr
        }
        xgb_scores.append(score)

    # Keep the held-out data so later analysis can reuse exactly this split
    xgb_test_sets.append({'X_test': df_test, 'y_test': y_test, 'state': i + 1})

Random state 1
(61771, 18)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random state 2
(61771, 18)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random state 3
(61771, 18)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random state 4
(61771, 18)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random state 5
(61771, 18)
Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [6]:
# One row per (random state, split) with the metrics collected during training
xgb_score = pd.DataFrame(xgb_scores)

# Recall values per split, one entry per random state
xgb_train_recall = xgb_score.loc[xgb_score['dataset'] == 'train', 'recall']
xgb_val_recall = xgb_score.loc[xgb_score['dataset'] == 'validation', 'recall']
xgb_test_recall = xgb_score.loc[xgb_score['dataset'] == 'test', 'recall']

# Test-split summary statistics, reused in the final comparison line
test_recall_mean = np.mean(xgb_test_recall)
test_recall_std = np.std(xgb_test_recall)

print('train recall mean:',np.mean(xgb_train_recall))
print('validation recall mean:',np.mean(xgb_val_recall))
print('test recall mean:',test_recall_mean)
print('test recall standard deviation:',test_recall_std)
# Distance between mean test recall and the baseline value, in units of the test-recall spread
print(round((test_recall_mean-baseline_accuracy)/test_recall_std,4),'standard deviations above the baseline')

train recall mean: 0.9686978575719148
validation recall mean: 0.9685622747296756
test recall mean: 0.9701641970364439
test recall standard deviation: 0.0025954107722898994
342.6485 standard deviations above the baseline


In [7]:
# Show the preprocessed test features stored for the second random state
# (list index 1 corresponds to 'state' == 2).
xgb_test_sets[1]['X_test']

Unnamed: 0,num__Height_(cm),num__Weight_(kg),num__BMI,num__Alcohol_Consumption,num__Fruit_Consumption,num__Green_Vegetables_Consumption,num__FriedPotato_Consumption,cat__Checkup_5 or more years ago,cat__Checkup_Never,cat__Checkup_Within the past 2 years,...,cat__Diabetes_Yes,"cat__Diabetes_Yes, but female told only during pregnancy",cat__Arthritis_No,cat__Arthritis_Yes,cat__Sex_Female,cat__Sex_Male,cat__Smoking_History_No,cat__Smoking_History_Yes,ord__General_Health,ord__Age_Category
0,-0.340360,-0.355745,-0.210691,-0.377421,-0.996118,-0.208849,-0.617243,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,3.0,7.0
1,1.349604,0.419798,-0.262958,-0.621136,-1.036355,-0.741981,0.197958,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,3.0,5.0
2,0.692396,-1.048713,-1.422064,0.353722,-0.714461,-0.475415,-0.267871,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,4.0,1.0
3,0.880170,0.334878,-0.112305,-0.133707,-0.875408,0.057717,0.663787,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,4.0,7.0
4,-0.246473,-1.367751,-1.422064,0.597437,-0.714461,-0.741981,-0.617243,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61766,-0.997568,-1.474253,-1.268337,-0.377421,-1.036355,-0.408773,-0.384328,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,9.0
61767,-1.279229,-1.154746,-0.744126,-0.621136,0.009801,-0.475415,-0.267871,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,12.0
61768,-0.246473,-0.601592,-0.528907,-0.621136,0.009801,-0.741981,-0.617243,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,4.0,7.0
61769,-1.467003,-0.303667,0.537962,-0.621136,0.009801,-0.475415,-0.267871,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,9.0


### Save for later use

In [8]:
# Build the payload before opening the file. Mode 'wb' truncates on open, so if
# any of these names were undefined (e.g. a fresh kernel) the previously saved
# results would be wiped before the NameError surfaced.
xgb_results = {'scores': xgb_scores, 'models': xgb_models, 'test_sets': xgb_test_sets}

with open('xgb_results.pkl', 'wb') as file:
    pickle.dump(xgb_results, file)

In [3]:
# Restore the results written by the save cell.
# NOTE: unpickling can execute arbitrary code - only load a file this notebook produced.
with open('xgb_results.pkl', 'rb') as file:
    data = pickle.load(file)

xgb_scores, xgb_models, xgb_test_sets = (
    data[key] for key in ('scores', 'models', 'test_sets')
)