In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV, 
                                    train_test_split, cross_val_score, StratifiedKFold)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, classification_report, confusion_matrix)
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# ===============================
# 1. LOAD PREPROCESSED DATA
# ===============================

In [3]:
# Load selected features and target
X_selected = pd.read_csv('../Data/X_selected_features.csv')
y = pd.read_csv('../Data/y_target.csv').squeeze()
    
print(f" Loaded selected features: {X_selected.shape}")
print(f" Loaded target variable: {y.shape}")
print(f" Selected features: {list(X_selected.columns)}")
print(f" Target distribution: {y.value_counts().to_dict()}")

 Loaded selected features: (303, 10)
 Loaded target variable: (303,)
 Selected features: ['PC1', 'PC2', 'PC4', 'PC9', 'PC5', 'PC6', 'PC7', 'PC3', 'PC8', 'PC10']
 Target distribution: {0: 164, 1: 139}


# ===============================
# 2. TRAIN-TEST SPLIT
# ===============================

In [4]:
# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

print(f" Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X_selected)*100:.1f}%)")
print(f" Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X_selected)*100:.1f}%)")
print(f" Training target distribution: {y_train.value_counts().to_dict()}")
print(f" Test target distribution: {y_test.value_counts().to_dict()}")


 Training set: 242 samples (79.9%)
 Test set: 61 samples (20.1%)
 Training target distribution: {0: 131, 1: 111}
 Test target distribution: {0: 33, 1: 28}


# ===============================
# 3. BASELINE MODELS TRAINING
# ===============================

In [None]:
# Define baseline models
baseline_models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42, probability=True)
}

# Train and evaluate baseline models
baseline_results = {}
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(" Training baseline models...")
for name, model in baseline_models.items():
    print(f"   Training {name}...")
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv_strategy, scoring='accuracy')
    
    # Train on full training set
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_test)
    
    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    
    baseline_results[name] = {
        'model': model,
        'cv_accuracy_mean': cv_scores.mean(),
        'cv_accuracy_std': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'test_auc': test_auc,
        'training_time': training_time
    }
    
    print(f"       CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    print(f"       Test Accuracy: {test_accuracy:.4f}")
    print(f"       Training Time: {training_time:.3f}s")


🔹 Training baseline models...
   Training Logistic Regression...
       CV Accuracy: 0.8224 (±0.0158)
       Test Accuracy: 0.8852
       Training Time: 0.004s
   Training Decision Tree...
       CV Accuracy: 0.7144 (±0.0584)
       Test Accuracy: 0.7705
       Training Time: 0.007s
   Training Random Forest...
       CV Accuracy: 0.8182 (±0.0202)
       Test Accuracy: 0.8852
       Training Time: 0.153s
   Training SVM...
       CV Accuracy: 0.8181 (±0.0355)
       Test Accuracy: 0.8525
       Training Time: 0.012s


# ===============================
# 4. DEFINE HYPERPARAMETER GRIDS
# ===============================

In [7]:
# Define parameter grids
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto', 0.01, 0.1, 1]
    }
}

# ===============================
# 5. HYPERPARAMETER TUNING
# ===============================

In [9]:
# Find best baseline model based on test accuracy
best_model_name = max(baseline_results, key=lambda k: baseline_results[k]['test_accuracy'])
best_result = baseline_results[best_model_name]

print(f" Best baseline model: {best_model_name} (Test Accuracy = {best_result['test_accuracy']:.4f})")

# Use selected feature set
X_train_selected, X_test_selected = X_train, X_test
y_train_selected, y_test_selected = y_train, y_test
final_feature_names = X_train_selected.columns.tolist()

if best_model_name in param_grids:
    base_model = baseline_models[best_model_name]
    param_grid = param_grids[best_model_name]

    print(f"\n Performing GridSearchCV for {best_model_name}")
    print(f"   Parameter grid: {list(param_grid.keys())}")

    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=cv_strategy,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    print("    Running GridSearchCV (this may take a few minutes)...")
    grid_search.fit(X_train_selected, y_train_selected)

    best_tuned_model = grid_search.best_estimator_
    y_pred_tuned = best_tuned_model.predict(X_test_selected)
    accuracy_tuned = accuracy_score(y_test_selected, y_pred_tuned)

    baseline_accuracy = best_result['test_accuracy']
    improvement = accuracy_tuned - baseline_accuracy

    print(f"\n Hyperparameter Tuning Results:")
    print(f"   Best Parameters: {grid_search.best_params_}")
    print(f"   Best CV Score: {grid_search.best_score_:.4f}")
    print(f"   Baseline Accuracy: {baseline_accuracy:.4f}")
    print(f"   Tuned Accuracy: {accuracy_tuned:.4f}")
    print(f"   Improvement: {improvement:+.4f}")

    if improvement > 0.01:
        print(" Significant improvement achieved!")
        final_model = best_tuned_model
        final_accuracy = accuracy_tuned
    else:
        print("ℹ No significant improvement from tuning.")
        final_model = best_result['model']
        final_accuracy = baseline_accuracy

    print(f"\n Final Model Classification Report:")
    print(classification_report(y_test_selected, final_model.predict(X_test_selected)))

    if hasattr(final_model, 'feature_importances_'):
        final_importance = pd.DataFrame({
            'feature': final_feature_names,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=True)

        plt.figure(figsize=(10, 6))
        final_importance.plot(kind='barh', x='feature', y='importance',
                              color='green', alpha=0.7, legend=False)
        plt.title('Final Model Feature Importance', fontweight='bold')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.show()
else:
    print(f" No parameter grid defined for {best_model_name}")
    final_model = best_result['model']
    final_accuracy = best_result['test_accuracy']

 Best baseline model: Logistic Regression (Test Accuracy = 0.8852)

 Performing GridSearchCV for Logistic Regression
   Parameter grid: ['C', 'penalty', 'solver']
    Running GridSearchCV (this may take a few minutes)...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

 Hyperparameter Tuning Results:
   Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
   Best CV Score: 0.8305
   Baseline Accuracy: 0.8852
   Tuned Accuracy: 0.8852
   Improvement: +0.0000
ℹ No significant improvement from tuning.

 Final Model Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.89        33
           1       0.82      0.96      0.89        28

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.90      0.89      0.89        61



# ===============================
# 6. SAVE FINAL MODEL
# ===============================

In [11]:
joblib.dump(final_model, '../Models/final_heart_disease_model.pkl')
print(" Final model saved to '../Models/final_heart_disease_model.pkl'")

 Final model saved to '../Models/final_heart_disease_model.pkl'
