# Enhanced Diabetes Prediction Model

This notebook improves upon the original model with:
- Multiple algorithm comparison
- Cross-validation
- Hyperparameter tuning
- Feature importance analysis
- Comprehensive evaluation metrics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
import pickle
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load and Explore Data

In [None]:
# Read the CSV into a DataFrame and summarise the target column
data = pd.read_csv("diabetes.csv")
outcome = data["Outcome"]

print(f"Dataset Shape: {data.shape}")
print(f"\nClass Distribution:")
print(outcome.value_counts())
print(f"\nClass Balance: {data['Outcome'].value_counts(normalize=True).round(3)}")
# Last expression of the cell: the notebook renders the first rows as a table
data.head()

In [None]:
# Columns in which the notebook treats a literal 0 as a missing measurement;
# this list is reused by the preprocessing step.
cols_with_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Report, per column, how many rows hold 0 and what share of the dataset that is
print("Zero values in critical columns:")
for col in cols_with_zero:
    zero_count = data[col].eq(0).sum()
    print(f"  {col}: {zero_count} ({zero_count/len(data)*100:.1f}%)")

## 2. Data Preprocessing

In [None]:
# Features and Target
X = data.drop("Outcome", axis=1).copy()
y = data["Outcome"].copy()

# Replace zeros with NaN and fill with median (more robust than mean)
X[cols_with_zero] = X[cols_with_zero].replace(0, np.nan)
X.fillna(X.median(), inplace=True)

print("Data preprocessing complete!")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## 3. Model Comparison

In [None]:
# Shuffled, seeded stratified 5-fold splitter shared by every candidate below
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers, keyed by the label used in the printed report
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
}

print("Model Comparison (5-Fold Cross-Validation)")
print("=" * 50)

# Score each candidate on the training split only; the test split is not
# touched in this cell.
results = []
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
    results.append({
        'Model': name,
        'Mean Accuracy': scores.mean(),
        'Std': scores.std(),
    })

# Rank candidates from highest to lowest mean fold accuracy
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Mean Accuracy', ascending=False)
print("\nRanking:")
print(results_df.to_string(index=False))

## 4. Hyperparameter Tuning (Random Forest)

In [None]:
# Hyperparameter tuning for Random Forest
# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated on 5 folds.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use the same shuffled, seeded stratified 5-fold scheme as the model
# comparison step so that "Best CV Score" is directly comparable with the
# comparison table. A bare cv=5 would use unshuffled stratified folds instead.
tuning_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, cv=tuning_cv, scoring='accuracy', n_jobs=-1, verbose=1
)
# Fit on the training split only; the test split stays untouched until the
# final evaluation.
grid_search.fit(X_train, y_train)

print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

In [None]:
# Take the estimator selected by the grid search and score it on the
# held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Second column of predict_proba is used as the positive-class score for ROC-AUC
class_probabilities = best_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

separator = "=" * 50
print("\n" + separator)
print("FINAL MODEL EVALUATION (Test Set)")
print(separator)
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_test, y_pred_proba):.4f}")

In [None]:
# Per-class precision / recall / F1 on the test split, with readable class names
class_names = ['Not Diabetic', 'Diabetic']
report_text = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report_text)

## 5. Feature Importance

In [None]:
# Pair every input column with the tuned model's importance score,
# highest first
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart of the ranking; written to disk before being shown
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Feature Importance for Diabetes Prediction')
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=150)
plt.show()

## 6. Confusion Matrix & ROC Curve

In [None]:
# One figure with two panels: confusion matrix (left) and ROC curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
cm_ax, roc_ax = axes
class_labels = ['Not Diabetic', 'Diabetic']

# Left panel: counts of actual vs. predicted classes on the test split
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')

# Right panel: ROC curve from the predicted probabilities, with the diagonal
# drawn as a dashed reference line
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc:.3f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')

fig.tight_layout()
fig.savefig('model_evaluation.png', dpi=150)
plt.show()

## 7. Save Enhanced Model

In [None]:
# Persist the tuned model and the fitted scaler as separate pickle files.
# NOTE: only load these pickle files from a trusted location; unpickling
# executes arbitrary code.
artifacts = (
    ('diabetes_model.pkl', best_model),
    ('scaler.pkl', scaler),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Enhanced model and scaler saved successfully!")
print(f"\nModel: Random Forest with tuned hyperparameters")
print(f"Best Parameters: {grid_search.best_params_}")

## 8. Model Summary

In [None]:
# Closing recap: the list of improvements followed by the headline test metrics
rule = "=" * 60
improvements = [
    "Replaced mean imputation with median (more robust)",
    "Added stratified train-test split",
    "Compared 5 different algorithms",
    "Performed hyperparameter tuning with GridSearchCV",
    "Used 5-fold cross-validation",
    "Added comprehensive evaluation metrics",
    "Generated feature importance analysis",
]

print("\n" + rule)
print("MODEL ENHANCEMENT SUMMARY")
print(rule)
print("\nImprovements made:")
# Numbered from 1, two-space indent, one line per improvement
for position, improvement in enumerate(improvements, start=1):
    print(f"  {position}. {improvement}")
print("\nFinal Model: Random Forest Classifier")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")