# Diabetes Classification using Classical Machine Learning Algorithms

## Comparison on Support Vector Machine, Logistic Regression, Random Forest, Decision Tree and KNN

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data/diabetes.csv')

X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [3]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [4]:
models = {
    'SVM': Pipeline([
        ('scaler', StandardScaler()),   # Add scaler for leveraging dataset statistics
        ('svm', SVC(random_state=42))
    ]),
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(random_state=42))
    ]),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())
    ])
}

param_grids = {     # Set pre-defined hyperparameter ranges for finding optimal hyperparemter
    'SVM': {'svm__C': [0.1, 1, 10], 'svm__kernel': ['rbf', 'linear']},
    'Logistic Regression': {'lr__C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'Decision Tree': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    'KNN': {'knn__n_neighbors': [3, 5, 7, 9]}
}

In [5]:
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    y_val_pred = best_model.predict(X_val)
    
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    y_test_pred = best_model.predict(X_test)
    
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
    
    results[name] = {
        'Best Parameters': grid_search.best_params_,
        'Validation Accuracy': val_accuracy,
        'Test Accuracy': test_accuracy,
        'Cross-Validation Mean': cv_scores.mean(),
        'Cross-Validation Std': cv_scores.std()
    }
    
    print(f"{name} - Validation Accuracy: {val_accuracy:.4f}")
    print(f"{name} - Test Accuracy: {test_accuracy:.4f}")
    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"{name} - Best Parameters: {grid_search.best_params_}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, y_test_pred))
    print("\n" + "="*50 + "\n")

Training SVM...
SVM - Validation Accuracy: 0.7236
SVM - Test Accuracy: 0.7143
SVM - CV Accuracy: 0.7577 (+/- 0.0390)
SVM - Best Parameters: {'svm__C': 1, 'svm__kernel': 'rbf'}
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.76      0.82      0.79        99
           1       0.62      0.53      0.57        55

    accuracy                           0.71       154
   macro avg       0.69      0.67      0.68       154
weighted avg       0.71      0.71      0.71       154



Training Logistic Regression...
Logistic Regression - Validation Accuracy: 0.7886
Logistic Regression - Test Accuracy: 0.7273
Logistic Regression - CV Accuracy: 0.7576 (+/- 0.0416)
Logistic Regression - Best Parameters: {'lr__C': 1}
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.80      0.77      0.78        99
           1       0.61      0.65      0.63        55

    accuracy                  

In [6]:
print("Summary of Results:")
for name, result in results.items():
    print(f"{name}:")
    print(f"\tValidation Accuracy: {result['Validation Accuracy']:.4f}")
    print(f"\tTest Accuracy: {result['Test Accuracy']:.4f}")
    print(f"\tCV Accuracy: {result['Cross-Validation Mean']:.4f} (+/- {result['Cross-Validation Std'] * 2:.4f})")
    print(f"\tBest Parameters: {result['Best Parameters']}")

Summary of Results:
SVM:
	Validation Accuracy: 0.7236
	Test Accuracy: 0.7143
	CV Accuracy: 0.7577 (+/- 0.0390)
	Best Parameters: {'svm__C': 1, 'svm__kernel': 'rbf'}
Logistic Regression:
	Validation Accuracy: 0.7886
	Test Accuracy: 0.7273
	CV Accuracy: 0.7576 (+/- 0.0416)
	Best Parameters: {'lr__C': 1}
Random Forest:
	Validation Accuracy: 0.7724
	Test Accuracy: 0.7273
	CV Accuracy: 0.7597 (+/- 0.0316)
	Best Parameters: {'max_depth': None, 'n_estimators': 100}
Decision Tree:
	Validation Accuracy: 0.6341
	Test Accuracy: 0.7208
	CV Accuracy: 0.7353 (+/- 0.0565)
	Best Parameters: {'max_depth': None, 'min_samples_split': 10}
KNN:
	Validation Accuracy: 0.7073
	Test Accuracy: 0.7338
	CV Accuracy: 0.7434 (+/- 0.0691)
	Best Parameters: {'knn__n_neighbors': 5}


In [7]:
best_model = max(results, key=lambda x: results[x]['Validation Accuracy'])
print(f"Best performing model: {best_model}")
print(f"Validation Accuracy: {results[best_model]['Validation Accuracy']:.4f}")
print(f"Test Accuracy: {results[best_model]['Test Accuracy']:.4f}")

Best performing model: Logistic Regression
Validation Accuracy: 0.7886
Test Accuracy: 0.7273
