#🩺 A Machine Learning Approach for Gallstone Disease Prediction
🧑‍💻 Dung Nguyen<br>
🐦 Elmhurst University<br>
📧 dung.nguyen@365.elmhurst.edu<br>


##📚 Libraries

In [417]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier

##📊 Data

In [359]:
df = pd.read_csv('https://raw.githubusercontent.com/nguyend77/gallstone/refs/heads/main/data.csv')

Identify features (X) and target (y)

In [360]:
X = df.drop(columns=['gallstone', 'lean_mass', 'total_body_fat_ratio', 'bone_mass', 'extracellular_water', 'total_fat_content', 'ecf_tbw', 'visceral_fat_area', 'total_body_water', 'muscle_mass', 'hepatic_fat_accumulation', 'visceral_muscle_area', 'intracellular_water', 'visceral_fat_rating', 'obesity', 'protein'])
y = df['gallstone']

ANOVA F-Scores

In [375]:
f_scores, p_values = f_classif(X, y)
feature_scores_df = pd.DataFrame({'Feature': X.columns, 'F_Score': f_scores, 'P_Value': p_values})
feature_scores_df = feature_scores_df.sort_values(by='F_Score', ascending=False)
print(feature_scores_df)

                       Feature    F_Score       P_Value
22                    vitaminD  45.673154  6.709802e-11
20          c_reactive_protein  27.385949  3.037234e-07
21                  hemoglobin  12.781941  4.045010e-04
5               hyperlipidemia   8.532832  3.738263e-03
13                         hdl   8.181862  4.511971e-03
1                       gender   7.647763  6.017412e-03
15                         ast   5.876601  1.590243e-02
18                  creatinine   5.645945  1.808963e-02
9                          bmi   4.750322  3.002851e-02
17                         alp   3.874263  4.990295e-02
7                       height   3.735465  5.415938e-02
6                     diabetes   3.512774  6.181698e-02
3      coronary_artery_disease   3.010879  8.367928e-02
12                         ldl   1.014058  3.147007e-01
4               hypothyroidism   0.968731  3.257475e-01
14                triglyceride   0.821302  3.654882e-01
8                       weight   0.752641  3.862

Train-Test Split

In [376]:
X_reduced = X.drop(columns=['total_cholesterol', 'age'])
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42, stratify=y)

##🔢 Models

###Logistic Regression

In [381]:
lr_model = LogisticRegression(solver = 'liblinear', max_iter = 1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78        48
           1       0.77      0.83      0.80        48

    accuracy                           0.79        96
   macro avg       0.79      0.79      0.79        96
weighted avg       0.79      0.79      0.79        96



In [382]:
print(confusion_matrix(y_test, y_pred_lr))

[[36 12]
 [ 8 40]]


In [383]:
print(roc_auc_score(y_test, y_pred_lr))

0.7916666666666666


###Random Forest

In [384]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78        48
           1       0.79      0.77      0.78        48

    accuracy                           0.78        96
   macro avg       0.78      0.78      0.78        96
weighted avg       0.78      0.78      0.78        96



In [385]:
print(confusion_matrix(y_test, y_pred_rf))

[[38 10]
 [11 37]]


In [386]:
print(roc_auc_score(y_test, y_pred_rf))

0.78125


###🥈 **XGBoost**

In [387]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83        48
           1       0.84      0.79      0.82        48

    accuracy                           0.82        96
   macro avg       0.82      0.82      0.82        96
weighted avg       0.82      0.82      0.82        96



In [388]:
print(confusion_matrix(y_test, y_pred_xgb))

[[41  7]
 [10 38]]


In [389]:
print(roc_auc_score(y_test, y_pred_xgb))

0.8229166666666665


###AdaBoost

In [390]:
adb_model = AdaBoostClassifier()
adb_model.fit(X_train, y_train)
y_pred_adb = adb_model.predict(X_test)
print(classification_report(y_test, y_pred_adb))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79        48
           1       0.79      0.79      0.79        48

    accuracy                           0.79        96
   macro avg       0.79      0.79      0.79        96
weighted avg       0.79      0.79      0.79        96



In [391]:
print(confusion_matrix(y_test, y_pred_adb))

[[38 10]
 [10 38]]


In [392]:
print(roc_auc_score(y_test, y_pred_adb))

0.7916666666666666


###Gradient Boosting

In [393]:
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77        48
           1       0.77      0.77      0.77        48

    accuracy                           0.77        96
   macro avg       0.77      0.77      0.77        96
weighted avg       0.77      0.77      0.77        96



In [394]:
print(confusion_matrix(y_test, y_pred_gb))

[[37 11]
 [11 37]]


In [395]:
print(roc_auc_score(y_test, y_pred_gb))

0.7708333333333335


###SVM

In [396]:
svm_model = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74        48
           1       0.74      0.73      0.74        48

    accuracy                           0.74        96
   macro avg       0.74      0.74      0.74        96
weighted avg       0.74      0.74      0.74        96



In [397]:
print(confusion_matrix(y_test, y_pred_svm))

[[36 12]
 [13 35]]


In [398]:
print(roc_auc_score(y_test, y_pred_svm))

0.7395833333333334


###kNN

In [399]:
knn_model = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=3))])
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.60      0.71      0.65        48
           1       0.64      0.52      0.57        48

    accuracy                           0.61        96
   macro avg       0.62      0.61      0.61        96
weighted avg       0.62      0.61      0.61        96



In [400]:
print(confusion_matrix(y_test, y_pred_knn))

[[34 14]
 [23 25]]


In [401]:
print(roc_auc_score(y_test, y_pred_knn))

0.6145833333333334


###🥇 **Ensemble**

In [409]:
ensemble_model = VotingClassifier(estimators=[('lr', lr_model), ('xgb', xgb_model)], voting='soft')
ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)
print(classification_report(y_test, y_pred_ensemble, digits=4))

              precision    recall  f1-score   support

           0     0.8542    0.8542    0.8542        48
           1     0.8542    0.8542    0.8542        48

    accuracy                         0.8542        96
   macro avg     0.8542    0.8542    0.8542        96
weighted avg     0.8542    0.8542    0.8542        96



In [410]:
print(confusion_matrix(y_test, y_pred_ensemble))

[[41  7]
 [ 7 41]]


In [404]:
print(roc_auc_score(y_test, y_pred_ensemble))

0.8541666666666666


Ignore: Brute force ensemble model search

In [406]:
#models = [('lr', lr_model), ('rf', rf_model), ('xgb', xgb_model), ('adb', adb_model), ('gb', gb_model), ('knn', knn_model)]
#for model1 in models:
    #for model2 in models:
        #if model1 != model2:
            #ensemble_model = VotingClassifier(estimators=[model1, model2], voting='soft')
            #ensemble_model.fit(X_train, y_train)
            #y_pred_ensemble = ensemble_model.predict(X_test)
            #print(f"({model1[0]}, {model2[0]})")
            #print(classification_report(y_test, y_pred_ensemble))