In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# Load the dataset
data = pd.read_excel('Glycohemoglobin_t3.xlsx')

# Handle missing values by dropping rows with any missing values
data = data.dropna()

# Define the target variable (gh >= 6.5%)
data['diabetes'] = np.where(data['gh'] >= 6.5, 1, 0)

# Drop the original 'gh' column as it's no longer needed
X = data.drop(['gh', 'diabetes'], axis=1)
y = data['diabetes']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for future use
joblib.dump(scaler, 'models/scaler.pkl')

# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)
y_pred_logreg = logreg.predict(X_test_scaled)
logreg_auc = roc_auc_score(y_test, y_pred_logreg)
print('Logistic Regression AUC: ', logreg_auc)

# Decision Tree
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred_dtree = dtree.predict(X_test)
dtree_auc = roc_auc_score(y_test, y_pred_dtree)
print('Decision Tree AUC: ', dtree_auc)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_auc = roc_auc_score(y_test, y_pred_rf)
print('Random Forest AUC: ', rf_auc)

# Hyperparameter Tuning for Random Forest
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
best_rf_auc = roc_auc_score(y_test, y_pred_best_rf)
print('Tuned Random Forest AUC: ', best_rf_auc)

# Save the best Random Forest model
joblib.dump(best_rf, 'models/rf_model.pkl')

# Model Comparison
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_logreg))
print("Decision Tree Report:\n", classification_report(y_test, y_pred_dtree))
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))
print("Best Tuned Random Forest Report:\n", classification_report(y_test, y_pred_best_rf))


Logistic Regression AUC:  0.8181374843945067
Decision Tree AUC:  0.7490929307116105
Random Forest AUC:  0.8175132646691635
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Tuned Random Forest AUC:  0.8559515449438202
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       801
           1       0.72      0.66      0.69        64

    accuracy                           0.96       865
   macro avg       0.85      0.82      0.83       865
weighted avg       0.95      0.96      0.96       865

Decision Tree Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96       801
           1       0.47      0.55      0.51        64

    accuracy                           0.92       865
   macro avg       0.72      0.75      0.73       865
weighted avg       0.93      0.92      0.92       865

Random Forest Report:
               precision    recall  f1-sco