In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.inspection import permutation_importance


In [3]:
df = pd.read_csv('Preprocessed_cardio_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,50.391781,2,168,62.0,110.0,80,1,1,0,0,1,0,21.96712
1,1,55.419178,1,156,85.0,140.0,90,3,1,0,0,1,1,34.927679
2,2,51.663014,1,165,64.0,130.0,70,3,1,0,0,0,1,23.507805
3,3,48.282192,2,169,82.0,150.0,100,1,1,0,0,1,1,28.710479
4,4,47.873973,1,156,56.0,100.0,60,1,1,0,0,0,0,23.011177


In [5]:
df.drop(columns=["Unnamed: 0"], inplace=True)

In [7]:
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50.391781,2,168,62.0,110.0,80,1,1,0,0,1,0,21.96712
1,55.419178,1,156,85.0,140.0,90,3,1,0,0,1,1,34.927679
2,51.663014,1,165,64.0,130.0,70,3,1,0,0,0,1,23.507805
3,48.282192,2,169,82.0,150.0,100,1,1,0,0,1,1,28.710479
4,47.873973,1,156,56.0,100.0,60,1,1,0,0,0,0,23.011177


In [11]:
X = df.drop("cardio", axis=1)   # target column name
y = df["cardio"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [15]:
rf = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)



grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [17]:
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7360674478422407
              precision    recall  f1-score   support

           0       0.71      0.79      0.75      7001
           1       0.76      0.69      0.72      6995

    accuracy                           0.74     13996
   macro avg       0.74      0.74      0.74     13996
weighted avg       0.74      0.74      0.74     13996



In [25]:
importances = best_model.feature_importances_

feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feature_importance)


        Feature  Importance
4         ap_hi    0.433390
5         ap_lo    0.193951
0           age    0.137732
6   cholesterol    0.086901
11          BMI    0.053029
3        weight    0.040915
2        height    0.023839
7          gluc    0.012138
10       active    0.006916
1        gender    0.004276
8         smoke    0.003847
9          alco    0.003066


In [27]:
# perm_importance = permutation_importance(
#     best_model,
#     X_test,
#     y_test,
#     n_repeats=10,
#     random_state=42,
#     scoring="accuracy"
# )

# perm_df = pd.DataFrame({
#     "Feature": X.columns,
#     "Importance": perm_importance.importances_mean
# }).sort_values(by="Importance", ascending=False)

# print(perm_df)


In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance

from xgboost import XGBClassifier


In [33]:
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)


In [35]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring="roc_auc",   # better for medical datasets
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_

print("Best Parameters:", grid.best_params_)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


In [37]:
y_pred = best_xgb.predict(X_test)
y_prob = best_xgb.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print(classification_report(y_test, y_pred))


Accuracy: 0.7366390397256359
ROC-AUC: 0.8052282534129966
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      7001
           1       0.76      0.69      0.72      6995

    accuracy                           0.74     13996
   macro avg       0.74      0.74      0.74     13996
weighted avg       0.74      0.74      0.74     13996



In [39]:
importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_xgb.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(importance_df)


        Feature  Importance
4         ap_hi    0.424335
6   cholesterol    0.171072
5         ap_lo    0.168367
0           age    0.074193
10       active    0.036471
8         smoke    0.023602
11          BMI    0.023298
7          gluc    0.020773
3        weight    0.019588
9          alco    0.014964
1        gender    0.014431
2        height    0.008907


In [41]:
perm = permutation_importance(
    best_xgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring="roc_auc"
)

perm_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": perm.importances_mean
}).sort_values(by="Importance", ascending=False)

print(perm_df)


        Feature  Importance
4         ap_hi    0.169322
0           age    0.037088
6   cholesterol    0.028247
5         ap_lo    0.006736
11          BMI    0.002540
10       active    0.002062
7          gluc    0.001765
3        weight    0.001737
8         smoke    0.000787
9          alco    0.000626
2        height    0.000337
1        gender    0.000111


In [43]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)



Confusion Matrix:
[[5460 1541]
 [2145 4850]]


In [45]:
cm_df = pd.DataFrame(
    cm,
    index=["Actual_No_Disease", "Actual_Disease"],
    columns=["Predicted_No_Disease", "Predicted_Disease"]
)

print(cm_df)


                   Predicted_No_Disease  Predicted_Disease
Actual_No_Disease                  5460               1541
Actual_Disease                     2145               4850


In [47]:
TN, FP, FN, TP = cm.ravel()

sensitivity = TP / (TP + FN)   # Recall
specificity = TN / (TN + FP)

print("Sensitivity (Recall):", sensitivity)
print("Specificity:", specificity)


Sensitivity (Recall): 0.6933523945675483
Specificity: 0.779888587344665
