In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score 
from sklearn.metrics import classification_report
from imblearn.over_sampling import SVMSMOTE
from pre_processing import load_and_clean_data
from sklearn.feature_selection import SelectKBest, f_classif

In [37]:
X_train, X_val, X_test, y_train, y_val, y_test = load_and_clean_data(
    "diabetes_prediction_dataset.csv", split=True, standardize=False
)

In [38]:
# Define the XGBoost classifier
xg_classifier = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
)

In [39]:
# Hyperparameter grid
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [45]:
# Grid search
grid = GridSearchCV(
    estimator=xg_classifier,
    param_grid=params,
    scoring='f1_macro',
    cv=skf,
    n_jobs=-1,
    verbose=1
)

# Fit grid search on training set only
grid.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [46]:
# Evaluate best model on validation set (optional)
print("Best Parameters:", grid.best_params_)
best_model = grid.best_estimator_

Best Parameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 150}


In [47]:
# Evaluate on the test set (untouched)
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report (Test Set):\n", classification_report(y_test, y_pred))



Classification Report (Test Set):
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     17534
           1       0.96      0.69      0.80      1696

    accuracy                           0.97     19230
   macro avg       0.97      0.84      0.89     19230
weighted avg       0.97      0.97      0.97     19230



In [48]:
# Predict probabilities on the test set
y_proba = best_model.predict_proba(X_test)[:, 1]

# Try multiple thresholds
thresholds = np.arange(0.1, 0.95, 0.05)
best_f1 = 0
best_thresh = 0.5

In [44]:
print("Threshold tuning results:")
for t in thresholds:
    y_pred_thresh = (y_proba > t).astype(int)
    f1 = f1_score(y_test, y_pred_thresh)
    print(f"Threshold: {t:.2f}, F1-score: {f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

Threshold tuning results:
Threshold: 0.10, F1-score: 0.6214
Threshold: 0.15, F1-score: 0.6775
Threshold: 0.20, F1-score: 0.7186
Threshold: 0.25, F1-score: 0.7477
Threshold: 0.30, F1-score: 0.7687
Threshold: 0.35, F1-score: 0.7894
Threshold: 0.40, F1-score: 0.7972
Threshold: 0.45, F1-score: 0.8008
Threshold: 0.50, F1-score: 0.8003
Threshold: 0.55, F1-score: 0.7987
Threshold: 0.60, F1-score: 0.7999
Threshold: 0.65, F1-score: 0.7958
Threshold: 0.70, F1-score: 0.7948
Threshold: 0.75, F1-score: 0.7943
Threshold: 0.80, F1-score: 0.7950
Threshold: 0.85, F1-score: 0.7949
Threshold: 0.90, F1-score: 0.7952
