In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report


In [2]:
# Read the three pre-split partitions from CSV files in the working directory.
train, val, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "val.csv", "test.csv")
)

# For each partition, X_* holds every column except 'diabetes' and
# y_* holds the 'diabetes' column itself (the prediction target).
X_train, y_train = train.drop(columns=['diabetes']), train['diabetes']
X_val, y_val = val.drop(columns=['diabetes']), val['diabetes']
X_test, y_test = test.drop(columns=['diabetes']), test['diabetes']

In [3]:
# Hyperparameter search space handed to the grid search below.
# Candidate count: 3 * 3 * 3 * 3 * 3 * 2 = 486 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    subsample=[0.6, 0.8, 1.0],
    max_features=['sqrt', 'log2'],
)


In [None]:
# Base learner whose hyperparameters are tuned; the seed is pinned so that
# repeated runs of this cell are comparable.
gb_base = GradientBoostingClassifier(random_state=42)

# Exhaustive search over `param_grid` with 5-fold cross-validation, ranking
# candidates by F1. n_jobs=-1 asks for parallel execution; verbose=1 prints
# the fold/candidate summary line.
grid_search = GridSearchCV(
    gb_base,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training partition only.
grid_search.fit(X_train, y_train)

# Keep a handle on the winning estimator for the evaluation cells that follow.
best_gb = grid_search.best_estimator_

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


In [None]:
# Evaluate the tuned model on the test set.
# Fix: this cell previously called `bestModel.predict(...)`, but no `bestModel`
# is defined in the cells above, so it raised NameError on a fresh kernel.
# The tuned estimator produced by the grid-search cell is `best_gb`.
y_pred_test = best_gb.predict(X_test)

# Calculate metrics (binary precision/recall/F1 plus the raw confusion matrix).
cm = confusion_matrix(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)

# Print evaluation results
print("\nBest Hyperparameters:", grid_search.best_params_)
print("Confusion Matrix:\n", cm)
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_test, digits=3))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

# 1. Plot Confusion Matrix
# Rows of `cm` are the actual classes and columns the predicted classes,
# which is why y is labelled 'Actual' and x 'Predicted'.
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['No Diabetes', 'Diabetes'],
            yticklabels=['No Diabetes', 'Diabetes'],
            ax=ax_cm)
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix')
fig_cm.tight_layout()
plt.show()

# 2. Bar Chart of Precision, Recall, F1
metrics = [precision, recall, f1]
labels = ['Precision', 'Recall', 'F1 Score']

fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
sns.barplot(x=labels, y=metrics, palette='viridis', ax=ax_bar)
ax_bar.set_ylim(0, 1)
# Fix: the loop variable used to be named `val`, which rebound the
# notebook-level `val` DataFrame (the validation split) to a float after this
# cell ran. A distinct name keeps the validation frame intact.
for i, score in enumerate(metrics):
    # Annotate each bar with its value, placed slightly above the bar top.
    ax_bar.text(i, score + 0.02, f"{score:.2f}", ha='center', va='bottom')
ax_bar.set_title('Evaluation Metrics (Threshold = 0.5)')
ax_bar.set_ylabel('Score')
fig_bar.tight_layout()
plt.show()
