#A Machine Learning Approach for Gallstone Disease Prediction
Dung Nguyen<br>
Elmhurst University<br>
dung.nguyen@365.elmhurst.edu<br>


##Libraries

In [391]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier

##Data

In [366]:
df = pd.read_csv('https://raw.githubusercontent.com/nguyend77/gallstone/refs/heads/main/data.csv')

Identify features (X) and target (y)

In [367]:
X = df.drop(columns=['gallstone', 'lean_mass', 'total_body_fat_ratio', 'bone_mass', 'extracellular_water', 'total_fat_content', 'ecf_tbw', 'visceral_fat_area', 'total_body_water', 'muscle_mass', 'hepatic_fat_accumulation', 'visceral_muscle_area', 'intracellular_water', 'visceral_fat_rating', 'obesity', 'protein'])
y = df['gallstone']

ANOVA F-Scores

In [368]:
f_scores, p_values = f_classif(X, y)
feature_scores_df = pd.DataFrame({'Feature': X.columns, 'F_Score': f_scores, 'P_Value': p_values})
feature_scores_df = feature_scores_df.sort_values(by='F_Score', ascending=False)
print(feature_scores_df)

                       Feature    F_Score       P_Value
22                    vitaminD  45.673154  6.709802e-11
20          c_reactive_protein  27.385949  3.037234e-07
21                  hemoglobin  12.781941  4.045010e-04
5               hyperlipidemia   8.532832  3.738263e-03
13                         hdl   8.181862  4.511971e-03
1                       gender   7.647763  6.017412e-03
15                         ast   5.876601  1.590243e-02
18                  creatinine   5.645945  1.808963e-02
9                          bmi   4.750322  3.002851e-02
17                         alp   3.874263  4.990295e-02
7                       height   3.735465  5.415938e-02
6                     diabetes   3.512774  6.181698e-02
3      coronary_artery_disease   3.010879  8.367928e-02
12                         ldl   1.014058  3.147007e-01
4               hypothyroidism   0.968731  3.257475e-01
14                triglyceride   0.821302  3.654882e-01
8                       weight   0.752641  3.862

Train-Test Split

In [395]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

Logistic Regression

In [370]:
lr_model = LogisticRegression(solver = 'liblinear', max_iter = 1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78        48
           1       0.76      0.85      0.80        48

    accuracy                           0.79        96
   macro avg       0.80      0.79      0.79        96
weighted avg       0.80      0.79      0.79        96



In [371]:
print(confusion_matrix(y_test, y_pred_lr))

[[35 13]
 [ 7 41]]


In [409]:
print(roc_auc_score(y_test, y_pred_lr))

0.7916666666666667


Random Forest

In [372]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.80      0.77      0.79        48
           1       0.78      0.81      0.80        48

    accuracy                           0.79        96
   macro avg       0.79      0.79      0.79        96
weighted avg       0.79      0.79      0.79        96



In [373]:
print(confusion_matrix(y_test, y_pred_rf))

[[37 11]
 [ 9 39]]


In [410]:
print(roc_auc_score(y_test, y_pred_rf))

0.7916666666666667


🥇 **XGBoost**

In [374]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82        48
           1       0.83      0.81      0.82        48

    accuracy                           0.82        96
   macro avg       0.82      0.82      0.82        96
weighted avg       0.82      0.82      0.82        96



In [375]:
print(confusion_matrix(y_test, y_pred_xgb))

[[40  8]
 [ 9 39]]


In [411]:
print(roc_auc_score(y_test, y_pred_xgb))

0.8229166666666667


AdaBoost

In [376]:
adb_model = AdaBoostClassifier()
adb_model.fit(X_train, y_train)
y_pred_adb = adb_model.predict(X_test)
print(classification_report(y_test, y_pred_adb))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78        48
           1       0.78      0.79      0.78        48

    accuracy                           0.78        96
   macro avg       0.78      0.78      0.78        96
weighted avg       0.78      0.78      0.78        96



In [377]:
print(confusion_matrix(y_test, y_pred_adb))

[[37 11]
 [10 38]]


In [412]:
print(roc_auc_score(y_test, y_pred_adb))

0.78125


Gradient Boosting

In [378]:
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.75      0.79      0.77        48
           1       0.78      0.73      0.75        48

    accuracy                           0.76        96
   macro avg       0.76      0.76      0.76        96
weighted avg       0.76      0.76      0.76        96



In [379]:
print(confusion_matrix(y_test, y_pred_gb))

[[38 10]
 [13 35]]


In [413]:
print(roc_auc_score(y_test, y_pred_gb))

0.7604166666666665


SVM

In [398]:
svm_model = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.72      0.75      0.73        48
           1       0.74      0.71      0.72        48

    accuracy                           0.73        96
   macro avg       0.73      0.73      0.73        96
weighted avg       0.73      0.73      0.73        96



In [381]:
print(confusion_matrix(y_test, y_pred_svm))

[[36 12]
 [14 34]]


In [414]:
print(roc_auc_score(y_test, y_pred_svm))

0.7291666666666666


kNN

In [399]:
knn_model = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=3))])
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.55      0.67      0.60        48
           1       0.58      0.46      0.51        48

    accuracy                           0.56        96
   macro avg       0.57      0.56      0.56        96
weighted avg       0.57      0.56      0.56        96



In [383]:
print(confusion_matrix(y_test, y_pred_knn))

[[32 16]
 [26 22]]


In [415]:
print(roc_auc_score(y_test, y_pred_knn))

0.5625
