## 1. Import libraries <a class="anchor" id="1"></a>

In [30]:
!pip show xgboost

Name: xgboost
Version: 0.90
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: None
Author-email: None
License: Apache-2.0
Location: /opt/conda/lib/python3.6/site-packages
Requires: scipy, numpy
Required-by: speedml, dask-xgboost


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import pandas_profiling as pp

# models

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# NN models
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


## 2. Download datasets <a class="anchor" id="2"></a>

In [17]:
data = pd.read_csv("/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv", sep=";")

In [18]:
# FE - thanks to: https://www.kaggle.com/benanakca/comparison-of-classification-disease-prediction
# Drop the row identifier, remove exact duplicate rows, then derive BMI from
# weight and height (height is divided by 100 before being squared).
data = data.drop(columns="id").drop_duplicates()
data = data.assign(bmi=data["weight"] / (data["height"] / 100) ** 2)

# Discard rows whose blood-pressure readings exceed the cut-offs below.
out_filter = (data["ap_hi"] > 250) | (data["ap_lo"] > 200)
data = data[~out_filter]
len(data)

68983

In [19]:
# Keep only the reduced feature set: remove blood-pressure, cholesterol and glucose columns.
data = data.drop(columns=["ap_hi", "ap_lo", "cholesterol", "gluc"])

In [20]:
data

Unnamed: 0,age,gender,height,weight,smoke,alco,active,cardio,bmi
0,18393,2,168,62.0,0,0,1,0,21.967120
1,20228,1,156,85.0,0,0,1,1,34.927679
2,18857,1,165,64.0,0,0,0,1,23.507805
3,17623,2,169,82.0,0,0,1,1,28.710479
4,17474,1,156,56.0,0,0,0,0,23.011177
...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,1,0,1,0,26.927438
69996,22601,1,158,126.0,0,0,1,1,50.472681
69997,19066,2,183,105.0,0,1,0,1,31.353579
69998,22431,1,163,72.0,0,0,0,1,27.099251


In [None]:
# Remove rows with negative blood-pressure readings.
# The ap_hi/ap_lo columns are dropped from `data` by an earlier cell, so
# indexing them unconditionally raises KeyError on a top-to-bottom run.
# Only apply the filter while both columns are still present.
if {"ap_hi", "ap_lo"}.issubset(data.columns):
    out_filter2 = (data["ap_hi"] < 0) | (data["ap_lo"] < 0)
    data = data[~out_filter2]

In [None]:
data.head(3)

In [None]:
data.info()

## 4. Preparing for modeling <a class="anchor" id="4"></a>

In [22]:
target_name = 'cardio'
data_target = data[target_name]
data = data.drop([target_name], axis=1)

In [23]:
train, test, target, target_test = train_test_split(data, data_target, test_size=0.2, random_state=0)

In [None]:
train.head(3)

In [None]:
test.head(3)

In [None]:
train.info()

In [None]:
test.info()

In [24]:
#%% split training set to validation set
Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.2, random_state=0)

## 5. Tuning models and test for all features <a class="anchor" id="5"></a>

### 5.10 XGB Classifier <a class="anchor" id="5.10"></a>

XGBoost is an ensemble tree method that applies the principle of boosting weak learners (generally CARTs) using a gradient descent architecture. XGBoost improves upon the base Gradient Boosting Machines (GBM) framework through systems optimization and algorithmic enhancements. Reference: [Towards Data Science](https://towardsdatascience.com/https-medium-com-vishalmorde-xgboost-algorithm-long-she-may-rein-edd9f99be63d).

We will tune the hyperparameters of the XGBClassifier model using HyperOpt and 10-fold cross-validation.

In [25]:
import numpy as np
from hyperopt import hp, fmin, tpe
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

# Objective for hyperopt: score one hyperparameter set by out-of-fold ROC AUC
# computed from predicted probabilities (not hard 0/1 labels).
def hyperopt_xgb_score(params):
    """Return the negated 10-fold out-of-fold ROC AUC for one hyperparameter set.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBClassifier``, as sampled from the search
        space. The dict passed in is not modified.

    Returns
    -------
    float
        ``-AUC``. The value is negated because the optimizer minimizes the
        objective, so minimizing ``-AUC`` maximizes AUC.

    Notes
    -----
    Reads the notebook-level ``train`` and ``target`` and prints the score
    together with the parameters used.
    """
    # Work on a copy so the caller's dict is left untouched.
    params = dict(params)
    params['tree_method'] = 'gpu_hist'  # Use GPU for XGBoost
    params['gpu_id'] = 0                # Specify GPU ID (if multiple GPUs are available)
    params['silent'] = 1                # Deprecated, but kept for backward compatibility
    clf = XGBClassifier(**params)

    # Out-of-fold probabilities of the positive class (column 1) for every row.
    predicted_probs = cross_val_predict(clf, train, target, cv=10, method='predict_proba')[:, 1]
    current_score = roc_auc_score(target, predicted_probs)

    print(f"Score (AUC): {current_score} with params: {params}")
    return -current_score

# Hyperparameter search space for XGBoost.
# NOTE(review): both 'learning_rate' and 'eta' are sampled. XGBoost documents
# 'learning_rate' as an alias of 'eta', so these two entries presumably compete
# for the same setting -- confirm which one actually takes effect.
space_xgb = {
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.05, 0.001),
    # hp.choice: the raw result returned by fmin holds the *index* of the
    # chosen option, not the option itself (applies to max_depth as well).
    'n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'eta': hp.quniform('eta', 0.025, 0.5, 0.005),
    'max_depth': hp.choice('max_depth', np.arange(2, 12, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 9, 0.025),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.005),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.005),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.005),
    # Fixed (non-searched) settings passed through unchanged to the classifier.
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',  # GPU histogram tree method
    'gpu_id': 0,                # Use GPU ID 0
    'missing': None
}

# Run the TPE search for 10 evaluations of hyperopt_xgb_score.
# `best` holds the raw sampled values; for hp.choice entries these are indices,
# so map them back with space_eval(space_xgb, best) before using them.
best = fmin(fn=hyperopt_xgb_score, space=space_xgb, algo=tpe.suggest, max_evals=10)
print('Best Hyperparameters:')
print(best)


Score (AUC): 0.6652988077767201 with params: {'booster': 'gbtree', 'colsample_bytree': 0.61, 'eta': 0.065, 'eval_metric': 'auc', 'gamma': 0.86, 'gpu_id': 0, 'learning_rate': 0.036000000000000004, 'max_depth': 7, 'min_child_weight': 2.4250000000000003, 'missing': None, 'n_estimators': 775, 'objective': 'binary:logistic', 'subsample': 0.86, 'tree_method': 'gpu_hist', 'silent': 1}
Score (AUC): 0.6697925510973968 with params: {'booster': 'gbtree', 'colsample_bytree': 0.65, 'eta': 0.14, 'eval_metric': 'auc', 'gamma': 0.87, 'gpu_id': 0, 'learning_rate': 0.027, 'max_depth': 9, 'min_child_weight': 2.025, 'missing': None, 'n_estimators': 185, 'objective': 'binary:logistic', 'subsample': 0.97, 'tree_method': 'gpu_hist', 'silent': 1}
Score (AUC): 0.6690529239960074 with params: {'booster': 'gbtree', 'colsample_bytree': 0.8250000000000001, 'eta': 0.48, 'eval_metric': 'auc', 'gamma': 0.68, 'gpu_id': 0, 'learning_rate': 0.021, 'max_depth': 7, 'min_child_weight': 6.65, 'missing': None, 'n_estimators'

In [None]:
# Train final model using best parameters.
# `best` as returned by fmin stores hp.choice parameters (n_estimators and
# max_depth) as indices into their option lists, so passing best['max_depth']
# straight through would set the wrong tree depth. space_eval maps the raw
# result back onto the search space, recovering the actual values together
# with the fixed settings (eval_metric, objective, booster, tree_method,
# gpu_id, missing).
best_model = XGBClassifier(**space_eval(space_xgb, best))


In [None]:
# Fit the best model on the training data
best_model.fit(train, target)

In [26]:
params = space_eval(space_xgb, best)
params

{'booster': 'gbtree',
 'colsample_bytree': 0.995,
 'eta': 0.23,
 'eval_metric': 'auc',
 'gamma': 0.65,
 'gpu_id': 0,
 'learning_rate': 0.02,
 'max_depth': 2,
 'min_child_weight': 7.6000000000000005,
 'missing': None,
 'n_estimators': 708,
 'objective': 'binary:logistic',
 'subsample': 0.8150000000000001,
 'tree_method': 'gpu_hist'}

In [27]:
# Fit the classifier with the tuned parameters, then report its accuracy on the
# same training data as a percentage rounded to two decimals.
XGB_Classifier = XGBClassifier(**params).fit(train, target)
acc_XGB_Classifier = round(100 * XGB_Classifier.score(train, target), 2)
acc_XGB_Classifier

63.01

In [28]:
# Accuracy on the held-out test split, as a percentage rounded to two decimals.
acc_test_XGB_Classifier = round(100 * XGB_Classifier.score(test, target_test), 2)
acc_test_XGB_Classifier

62.13

In [None]:
# Bar chart of the fitted model's feature importances on a single 15x15 axes.
fig, axes = plt.subplots(figsize=(15, 15))
xgb.plot_importance(XGB_Classifier, ax=axes, height=0.5)
plt.show()
plt.close()

In [None]:
# import pickle
# # Save the model using joblib
# joblib.dump(XGB_Classifier, 'xgb_half_model.pkl')

# print("Model saved as 'xgb_half_model.pkl'")

In [29]:
import pickle

# Serialize the fitted classifier to 'xgb_half_model.pkl' in the working directory.
with open('xgb_half_model.pkl', 'wb') as model_file:
    pickle.dump(XGB_Classifier, model_file)

In [16]:
import pandas as pd
import numpy as np
import joblib

# Load the saved model
# NOTE(review): the save cell in this notebook writes 'xgb_half_model.pkl'
# (relative path, via pickle), whereas this loads 'xgb_full_model.pkl'. No
# visible cell creates that file, so it presumably comes from another run --
# confirm this is the intended model, otherwise a fresh run fails here.
# joblib.load unpickles the file, so only load model files from a trusted source.
loaded_model = joblib.load('/kaggle/working/xgb_full_model.pkl')

# Function to preprocess incoming raw data and make predictions
def preprocess_and_predict(input_data):
    """Derive model features from raw records and return class probabilities.

    Parameters
    ----------
    input_data : list of dict, or anything ``pd.DataFrame`` accepts
        Raw records, one per row. Extra keys and arbitrary key order are
        tolerated when the model exposes its trained feature names.

    Returns
    -------
    The result of ``loaded_model.predict_proba`` on the prepared frame.

    Raises
    ------
    ValueError
        If the model exposes feature names and the input (after BMI
        derivation) lacks any of them.

    Notes
    -----
    Uses the notebook-level ``loaded_model`` and prints its hard class
    predictions as a side effect.
    """
    data = pd.DataFrame(input_data)

    # BMI = weight / (height / 100) ** 2, derived only when both inputs exist.
    if "weight" in data.columns and "height" in data.columns:
        data["bmi"] = data["weight"] / (data["height"] / 100) ** 2

    # Align columns with the features the model was trained on. Raw backend
    # payloads may carry extra fields or a different key order, which the
    # model would otherwise reject as a feature mismatch.
    get_booster = getattr(loaded_model, "get_booster", None)
    feature_names = get_booster().feature_names if get_booster is not None else None
    if feature_names:
        missing = [name for name in feature_names if name not in data.columns]
        if missing:
            raise ValueError(f"input is missing required feature(s): {missing}")
        data = data[list(feature_names)]

    predictions = loaded_model.predict_proba(data)
    print(loaded_model.predict(data))

    return predictions

# Example: Sample input data for testing (Replace this with your backend input)
# NOTE(review): the rows displayed earlier in this notebook have 'age' values
# of roughly 17000-23000 (presumably days -- confirm), so 2520 is far below
# them, and ap_hi=1000 is above the ap_hi > 250 cut-off used to filter the
# training data. Treat the resulting prediction as a smoke test only.
sample_input = {
    'age': 2520,
    'gender' : 1,
    'height': 140,
    'weight': 90,
    'ap_hi': 1000,
    'ap_lo': 50,
    'cholesterol': 1,
    'gluc': 1,
    'smoke': 1,
    'alco': 1,
    'active': 0
}

# Wrap the single record in a list so it becomes a one-row DataFrame;
# additional dicts can be appended to score several records at once.
input_data = [sample_input]

# Predict values for the sample input
predicted_values = preprocess_and_predict(input_data)
print("Predicted values:", predicted_values)


[0]
Predicted values: [[0.63001174 0.36998826]]
