### `We'll tune the best-performing model from step 2.4 (let's assume it was Random Forest).`

In [6]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the cleaned data
df = pd.read_csv('../data/cleaned_heart_disease.csv')
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5, # 5-fold cross-validation
    n_jobs=-1, # Use all available cores
    verbose=2,
    scoring='accuracy'
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# Evaluate the optimized model
y_pred_optimized = best_rf_model.predict(X_test)
print("\nOptimized Random Forest Performance:")
print(classification_report(y_test, y_pred_optimized))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END m

### `Model Export & Deployment`
After tuning, save your final, best-performing model. It's best practice to save the entire pipeline (scaler + model) to ensure preprocessing is consistent. Since we already scaled and selected features, we can just save the model. However, for a production system, a pipeline is better.

Let's create and save a pipeline with the scaler and the optimized model. This assumes we retrain on the full, unscaled dataset.

In [8]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 1. Load the original cleaned data (pre-scaling)
df_original = pd.read_csv('../data/cleaned_heart_disease.csv')

# 2. Select only the features we decided on earlier
final_features_list = list(X.columns) # From step 2.3
X_final = df_original[final_features_list]
y_final = df_original['target']

# 3. Create the pipeline
# This pipeline will first scale the data then apply the best RF model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', best_rf_model) # Use the best model from GridSearchCV
])

# 4. Train the pipeline on the full dataset
pipeline.fit(X_final, y_final)

# 5. Save the pipeline to a file
joblib.dump(pipeline, '../models/final_model.pkl')

print("Model pipeline saved to models/final_model.pkl")

Model pipeline saved to models/final_model.pkl


In [9]:
print(X_train.columns.to_list())

['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'fbs_1.0', 'restecg_1.0', 'restecg_2.0', 'exang_1.0', 'slope_2.0', 'slope_3.0', 'ca_1.0', 'ca_2.0', 'ca_3.0', 'thal_6.0', 'thal_7.0']
