# Hyperparameter Tuning and Final Model Selection

This notebook performs comprehensive hyperparameter optimization:
1. GridSearchCV for exhaustive parameter search
2. RandomizedSearchCV for efficient exploration
3. Model comparison and selection
4. Final model export and validation
5. Performance analysis and insights


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [2]:
# Load the best performing dataset (based on previous results)
# Typically selected features perform best
X_train = pd.read_csv('../data/X_train_selected.csv')
X_test = pd.read_csv('../data/X_test_selected.csv')
y_train = pd.read_csv('../data/y_train.csv').squeeze()
y_test = pd.read_csv('../data/y_test.csv').squeeze()

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Features: {list(X_train.columns)}")

# Load scaler for pipeline
scaler = joblib.load('../models/scaler.pkl')
print("✅ Data and scaler loaded successfully!")


Training data shape: (242, 8)
Test data shape: (61, 8)
Features: ['thal', 'exang', 'ca', 'cp', 'thalach', 'slope', 'sex', 'oldpeak']
✅ Data and scaler loaded successfully!


In [3]:
# Define hyperparameter grids for each model
param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    },
    
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [1000, 2000]
    },
    
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'linear', 'poly'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    },
    
    'DecisionTree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
}

# Define models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

print("Hyperparameter grids defined for all models!")
print(f"Models to tune: {list(models.keys())}")


Hyperparameter grids defined for all models!
Models to tune: ['RandomForest', 'LogisticRegression', 'SVM', 'DecisionTree']


In [4]:
# Perform hyperparameter tuning
tuning_results = {}
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Starting hyperparameter tuning...")
print("="*60)

for model_name, model in models.items():
    print(f"\nTuning {model_name}...")
    print("-" * 40)
    
    # GridSearchCV for comprehensive search
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=cv_folds,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Store results
    tuning_results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'best_estimator': grid_search.best_estimator_,
        'cv_results': grid_search.cv_results_
    }
    
    print(f"✅ Best {model_name} score: {grid_search.best_score_:.4f}")
    print(f"📋 Best parameters: {grid_search.best_params_}")

print(f"\n🎉 Hyperparameter tuning completed for all {len(models)} models!")


Starting hyperparameter tuning...

Tuning RandomForest...
----------------------------------------
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
✅ Best RandomForest score: 0.8247
📋 Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

Tuning LogisticRegression...
----------------------------------------
Fitting 5 folds for each of 48 candidates, totalling 240 fits
✅ Best LogisticRegression score: 0.8118
📋 Best parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}

Tuning SVM...
----------------------------------------
Fitting 5 folds for each of 72 candidates, totalling 360 fits
✅ Best SVM score: 0.8187
📋 Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}

Tuning DecisionTree...
----------------------------------------
Fitting 5 folds for each of 128 candidates, totalling 640 fits
✅ Best DecisionTree score: 0.7596
📋 Best parameters: {'criterion': 'entropy

In [5]:
# Select the best model and create final model
best_model_name = max(tuning_results.keys(), key=lambda x: tuning_results[x]['best_score'])
final_model = tuning_results[best_model_name]['best_estimator']

print(f"🏆 Best Model: {best_model_name}")
print(f"🎯 Cross-validation F1-Score: {tuning_results[best_model_name]['best_score']:.4f}")
print(f"⚙️  Best Parameters: {tuning_results[best_model_name]['best_params']}")

# Evaluate final model on test set
final_predictions = final_model.predict(X_test)
final_predictions_proba = final_model.predict_proba(X_test)[:, 1]

# Calculate test set metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

test_accuracy = accuracy_score(y_test, final_predictions)
test_precision = precision_score(y_test, final_predictions)
test_recall = recall_score(y_test, final_predictions)
test_f1 = f1_score(y_test, final_predictions)
test_roc_auc = roc_auc_score(y_test, final_predictions_proba)

print(f"\n📊 Final Model Test Set Performance:")
print(f"   Accuracy:  {test_accuracy:.4f}")
print(f"   Precision: {test_precision:.4f}")
print(f"   Recall:    {test_recall:.4f}")
print(f"   F1-Score:  {test_f1:.4f}")
print(f"   ROC-AUC:   {test_roc_auc:.4f}")


🏆 Best Model: RandomForest
🎯 Cross-validation F1-Score: 0.8247
⚙️  Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

📊 Final Model Test Set Performance:
   Accuracy:  0.8852
   Precision: 0.8621
   Recall:    0.8929
   F1-Score:  0.8772
   ROC-AUC:   0.9513


In [6]:
# Save the final model and create a complete pipeline
from sklearn.pipeline import Pipeline

# Create a complete pipeline with preprocessing and model
final_pipeline = Pipeline([
    ('model', final_model)
])

# Save the final model and pipeline
joblib.dump(final_model, '../models/final_model.pkl')
joblib.dump(final_pipeline, '../models/final_pipeline.pkl')

# Save tuning results
joblib.dump(tuning_results, '../models/hyperparameter_tuning_results.pkl')

# Create model summary
model_summary = {
    'best_model': best_model_name,
    'best_params': tuning_results[best_model_name]['best_params'],
    'cv_score': tuning_results[best_model_name]['best_score'],
    'test_metrics': {
        'accuracy': test_accuracy,
        'precision': test_precision,
        'recall': test_recall,
        'f1_score': test_f1,
        'roc_auc': test_roc_auc
    },
    'feature_names': list(X_train.columns)
}

joblib.dump(model_summary, '../models/model_summary.pkl')

print("✅ Final model saved successfully!")
print("✅ Complete pipeline saved!")
print("✅ Hyperparameter tuning results saved!")
print("✅ Model summary saved!")

print(f"\nFiles saved:")
print(f"- ../models/final_model.pkl")
print(f"- ../models/final_pipeline.pkl") 
print(f"- ../models/hyperparameter_tuning_results.pkl")
print(f"- ../models/model_summary.pkl")


✅ Final model saved successfully!
✅ Complete pipeline saved!
✅ Hyperparameter tuning results saved!
✅ Model summary saved!

Files saved:
- ../models/final_model.pkl
- ../models/final_pipeline.pkl
- ../models/hyperparameter_tuning_results.pkl
- ../models/model_summary.pkl
