Baseline of standard classification models

In [1]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m777.9 kB/s[0m eta [36m0:00:00[0m eta [36m0:00:01[0m[36m0:00:06[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Core classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
# (Optional) for resampling later
from imblearn.over_sampling import SMOTE

# pandas for loading the CSV splits (TODO: the XGBoost scale_pos_weight example this comment announced is not implemented in this cell)
import pandas as pd

In [5]:
# Load the training split so its schema can be inspected.
df = pd.read_csv("train.csv")
# Last expression in the cell, so the column index is shown as the cell output.
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [6]:
# Preview the first rows of the training data (head() defaults to five rows).
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,1,0,never,24.23,6.6,130,0
1,Female,43.0,0,0,current,22.75,5.8,155,0
2,Female,38.0,0,0,never,28.94,3.5,160,0
3,Female,52.0,0,0,No Info,27.32,6.1,126,0
4,Female,49.0,1,0,never,25.81,4.8,159,0


In [9]:
# Read the training and validation splits from CSV files in the working directory.
train, val = (pd.read_csv(path) for path in ("train.csv", "val.csv"))

# The 'diabetes' column is the target; every remaining column is a feature.
X_train, y_train = train.drop(columns=['diabetes']), train['diabetes']
X_val, y_val = val.drop(columns=['diabetes']), val['diabetes']

In [16]:
# Baseline classifiers, keyed by the short label used in the printed report.
# LR and RF use class_weight="balanced"; KNN runs with its defaults and GB only
# gets a fixed seed.
# NOTE(review): LR and KNN are fit on the features exactly as loaded, with no
# scaling step in this cell — confirm that is intended.
models = dict(
    LR=LogisticRegression(class_weight="balanced", random_state=42),
    RF=RandomForestClassifier(class_weight="balanced", random_state=42),
    KNN=KNeighborsClassifier(),
    GB=GradientBoostingClassifier(random_state=42),
)

for name, model in models.items():
    # Train on the training split, then predict labels for the validation split.
    preds = model.fit(X_train, y_train).predict(X_val)

    # Validation metrics; the three scalar scores use each scorer's defaults.
    cm = confusion_matrix(y_val, preds)
    prec, rec, f1 = (
        scorer(y_val, preds)
        for scorer in (precision_score, recall_score, f1_score)
    )

    # Emit the whole per-model report in a single call, one item per line.
    print(
        f"\n=== {name} ===",
        "Confusion Matrix:",
        cm,
        f"Precision: {prec:.3f}",
        f"Recall:    {rec:.3f}",
        f"F1 Score:  {f1:.3f}",
        "\nClassification Report:",
        classification_report(y_val, preds, digits=3),
        sep="\n",
    )


=== LR ===
Confusion Matrix:
[[15411  2122]
 [  201  1495]]
Precision: 0.413
Recall:    0.881
F1 Score:  0.563

Classification Report:
              precision    recall  f1-score   support

           0      0.987     0.879     0.930     17533
           1      0.413     0.881     0.563      1696

    accuracy                          0.879     19229
   macro avg      0.700     0.880     0.746     19229
weighted avg      0.937     0.879     0.898     19229



Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x74c69459c220>
Traceback (most recent call last):
  File "/home/thejoker/newanaconda/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/thejoker/newanaconda/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thejoker/newanaconda/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/thejoker/newanaconda/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' ob


=== RF ===
Confusion Matrix:
[[17436    97]
 [  532  1164]]
Precision: 0.923
Recall:    0.686
F1 Score:  0.787

Classification Report:
              precision    recall  f1-score   support

           0      0.970     0.994     0.982     17533
           1      0.923     0.686     0.787      1696

    accuracy                          0.967     19229
   macro avg      0.947     0.840     0.885     19229
weighted avg      0.966     0.967     0.965     19229


=== KNN ===
Confusion Matrix:
[[17413   120]
 [  641  1055]]
Precision: 0.898
Recall:    0.622
F1 Score:  0.735

Classification Report:
              precision    recall  f1-score   support

           0      0.964     0.993     0.979     17533
           1      0.898     0.622     0.735      1696

    accuracy                          0.960     19229
   macro avg      0.931     0.808     0.857     19229
weighted avg      0.959     0.960     0.957     19229


=== GB ===
Confusion Matrix:
[[17522    11]
 [  552  1144]]
Precision: 0