In [5]:
# 1. Use GridSearchCV & RandomizedSearchCV to optimize model hyperparameters
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time

# Load data and prepare for binary classification
df = pd.read_csv('../data/heart_disease_preprocessed.csv')
X = df.drop('num', axis=1)
y_binary = (df['num'] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Baseline models
baseline_models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42)
}

baseline_scores = {}
for name, model in baseline_models.items():
    model.fit(X_train, y_train)
    baseline_scores[name] = model.score(X_test, y_test)
    print(f"{name} baseline: {baseline_scores[name]:.3f}")

# GridSearchCV for Random Forest
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {'n_estimators': [100, 200], 'max_depth': [5, 10, None], 'min_samples_split': [2, 5]},
    cv=5, n_jobs=-1
)
rf_grid.fit(X_train, y_train)
print(f"Best RF params: {rf_grid.best_params_}")

# RandomizedSearchCV for SVM
svm_random = RandomizedSearchCV(
    SVC(random_state=42),
    {'C': np.logspace(-2, 2, 20), 'gamma': np.logspace(-3, 1, 20), 'kernel': ['rbf', 'linear']},
    n_iter=30, cv=5, random_state=42, n_jobs=-1
)
svm_random.fit(X_train, y_train)
print(f"Best SVM params: {svm_random.best_params_}")

# GridSearchCV for Logistic Regression
lr_grid = GridSearchCV(
    LogisticRegression(random_state=42),
    {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    cv=5, n_jobs=-1
)
lr_grid.fit(X_train, y_train)
print(f"Best LR params: {lr_grid.best_params_}")

RandomForest baseline: 0.867
SVM baseline: 0.867
LogisticRegression baseline: 0.867
Best RF params: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}
Best RF params: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}
Best SVM params: {'kernel': 'linear', 'gamma': np.float64(0.5455594781168515), 'C': np.float64(23.357214690901213)}
Best LR params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best SVM params: {'kernel': 'linear', 'gamma': np.float64(0.5455594781168515), 'C': np.float64(23.357214690901213)}
Best LR params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


In [6]:
# 2. Compare optimized models with baseline performance
# Test optimized models
optimized_scores = {
    'RandomForest': rf_grid.score(X_test, y_test),
    'SVM': svm_random.score(X_test, y_test),
    'LogisticRegression': lr_grid.score(X_test, y_test)
}

# Performance comparison
print("PERFORMANCE COMPARISON:")
print("=" * 40)
for model in baseline_scores.keys():
    baseline = baseline_scores[model]
    optimized = optimized_scores[model]
    improvement = optimized - baseline
    print(f"{model}:")
    print(f"  Baseline:  {baseline:.3f}")
    print(f"  Optimized: {optimized:.3f}")
    print(f"  Improvement: {improvement:+.3f}")

# Best performing model
best_model_name = max(optimized_scores, key=optimized_scores.get)
best_score = optimized_scores[best_model_name]

best_models = {
    'RandomForest': rf_grid.best_estimator_,
    'SVM': svm_random.best_estimator_,
    'LogisticRegression': lr_grid.best_estimator_
}

print(f"\nBEST PERFORMING MODEL: {best_model_name}")
print(f"Test Accuracy: {best_score:.3f}")

# Detailed results for best model
best_model = best_models[best_model_name]
y_pred = best_model.predict(X_test)
print(f"\nClassification Report for {best_model_name}:")
print(classification_report(y_test, y_pred))

PERFORMANCE COMPARISON:
RandomForest:
  Baseline:  0.867
  Optimized: 0.850
  Improvement: -0.017
SVM:
  Baseline:  0.867
  Optimized: 0.850
  Improvement: -0.017
LogisticRegression:
  Baseline:  0.867
  Optimized: 0.867
  Improvement: +0.000

BEST PERFORMING MODEL: LogisticRegression
Test Accuracy: 0.867

Classification Report for LogisticRegression:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        36
           1       0.83      0.83      0.83        24

    accuracy                           0.87        60
   macro avg       0.86      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60

