In [40]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from pre_processing import load_and_clean_data

In [41]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, make_scorer, f1_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, StratifiedKFold,cross_val_score

In [42]:
# Load the dataset through the project helper. With split=True its result is
# unpacked into six pieces: train / validation / test features and labels.
DATA_PATH = "../diabetes_prediction_dataset.csv"
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = load_and_clean_data(DATA_PATH, split=True)

In [43]:
# Feature scaling with RobustScaler (median / inter-quartile-range based by
# default). The scaler is fitted on the training split only, so validation
# and test data never influence the learned scaling statistics.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [44]:
# Re-use the already-fitted scaler (transform only, no re-fit) on the
# validation and test splits.
X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_val, X_test)
)

In [45]:
# Hyper-parameter search space for the logistic-regression grid search.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength (smaller = stronger)
    penalty=['l1', 'l2'],              # regularization type
    solver=['liblinear', 'saga'],      # solvers chosen because they support l1
    class_weight=['balanced'],         # single value: always re-weight for class imbalance
    max_iter=[500, 1000],
)

# Stratified 6-fold splitter; shuffling uses a fixed seed so folds are repeatable.
cv = StratifiedKFold(n_splits=6, random_state=42, shuffle=True)

In [46]:
# Model-selection metric: F1 computed for the positive class (label 1).
f1_minority = make_scorer(f1_score, pos_label=1)

# Both solvers in the grid ('saga', 'liblinear') shuffle the data internally,
# so the estimator gets a fixed random_state to make the search reproducible
# across kernel restarts.
# NOTE(review): X_train_scaled was scaled with statistics from the whole
# training split, so each fold's held-out part has already influenced the
# scaler. Wrapping scaler + model in a Pipeline would avoid that -- confirm
# whether it matters for this analysis.
grid = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    scoring=f1_minority,
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train_scaled, y_train)

print("Best params:", grid.best_params_)
print("Best F1 score:", grid.best_score_)

Best params: {'C': 0.001, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
Best F1 score: 0.5836662904947092


In [47]:
# GridSearchCV refits the winning configuration on the full training data
# (refit=True is its default), so best_estimator_ is already fitted and can be
# used directly; calling .fit() on it again would only repeat that work.
# The selected hyper-parameters are available in grid.best_params_.
best_model = grid.best_estimator_

In [21]:
#y_val_proba = best_model.predict_proba(X_val_scaled)[:, 1]  # Probabilities for class 1 (diabetes)

In [48]:
# Validation-set evaluation at the final decision threshold.
val_class_probs = best_model.predict_proba(X_val_scaled)
y_val_proba = val_class_probs[:, 1]  # second column: probability of class 1
custom_threshold = 0.6
# Predict class 1 whenever its probability reaches the threshold.
y_val_pred_custom = np.greater_equal(y_val_proba, custom_threshold).astype(int)

print(f"\nClassification Report (Threshold = {custom_threshold}):")
val_report = classification_report(y_val, y_val_pred_custom)
print(val_report)


Classification Report (Threshold = 0.6):
              precision    recall  f1-score   support

           0       0.98      0.93      0.95      8767
           1       0.52      0.79      0.63       848

    accuracy                           0.92      9615
   macro avg       0.75      0.86      0.79      9615
weighted avg       0.94      0.92      0.92      9615



In [49]:
# Score the selected model on the training data using the same fold splitter
# and the same class-1 F1 scorer that the grid search used.
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=cv, scoring=f1_minority)
mean_cv_f1 = cv_scores.mean()

print("Cross-validated F1 scores:", cv_scores)
print("Mean CV F1 score:", mean_cv_f1)

Cross-validated F1 scores: [0.56974691 0.59140099 0.58940591 0.58526563 0.58335801 0.5829912 ]
Mean CV F1 score: 0.5836947763347534


In [50]:
# Test-set evaluation, reusing the threshold that was applied to the
# validation predictions.
test_class_probs = best_model.predict_proba(X_test_scaled)
y_test_proba = test_class_probs[:, 1]  # second column: probability of class 1
y_test_pred_custom = np.greater_equal(y_test_proba, custom_threshold).astype(int)

print("\nFinal Test Set Report:")
test_report = classification_report(y_test, y_test_pred_custom)
print(test_report)


Final Test Set Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      8767
           1       0.55      0.82      0.66       848

    accuracy                           0.93      9615
   macro avg       0.77      0.88      0.81      9615
weighted avg       0.94      0.93      0.93      9615

