## **2.6 Hyperparameter Tuning**

**Import Required Libraries**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

**Define Hyperparameter Grid for GridSearchCV**

In [None]:
# Candidate values for the exhaustive search: 3 * 4 * 3 * 2 = 72 combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

**Run GridSearchCV on Random Forest**

In [None]:
# Base estimator shared by the searches; the fixed seed makes each fit repeatable.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over `param_grid`, ranked by weighted F1 under 5-fold CV.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,  # run candidate fits on all available cores
    verbose=1,  # print the "Fitting N folds ..." progress line
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


**Show Best Parameters and Score from Grid Search**

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
for label, value in (
    ("Best Parameters from GridSearchCV:", grid_search.best_params_),
    ("Best F1 Score (CV average):", grid_search.best_score_),
):
    print(label, value)

Best Parameters from GridSearchCV: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best F1 Score (CV average): 0.5098143856007116


**Grid Search results**

In [None]:
# One row per candidate: timings, parameters, per-fold scores, mean/std and rank.
pd.DataFrame.from_dict(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.288605,0.062411,0.031932,0.010855,gini,,2,50,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.451868,0.505019,0.461738,0.482941,0.567400,0.493793,0.041102,19
1,0.972533,0.564286,0.064447,0.046782,gini,,2,100,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.431034,0.501374,0.501605,0.483862,0.535386,0.490652,0.034166,23
2,0.450808,0.052297,0.026283,0.000767,gini,,2,150,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.499708,0.495144,0.499671,0.482412,0.494192,0.494225,0.006326,17
3,0.137667,0.001472,0.013364,0.001161,gini,,5,50,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.521289,0.536125,0.473888,0.497139,0.491742,0.504036,0.022068,7
4,0.276612,0.016215,0.019573,0.000682,gini,,5,100,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.533929,0.520833,0.501058,0.487054,0.479685,0.504512,0.020322,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.404968,0.059103,0.034123,0.011511,entropy,20,5,100,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.491663,0.525555,0.462876,0.520954,0.472399,0.494689,0.025142,15
68,0.681002,0.126561,0.039726,0.012048,entropy,20,5,150,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.448994,0.479911,0.471220,0.520954,0.437911,0.471798,0.028813,46
69,0.211448,0.047166,0.020276,0.007541,entropy,20,10,50,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.477098,0.468750,0.439594,0.426648,0.463443,0.455107,0.018920,64
70,0.420326,0.072688,0.023525,0.008749,entropy,20,10,100,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.477098,0.479911,0.501605,0.479737,0.414700,0.470610,0.029321,48


**RandomizedSearchCV for Faster Tuning**

In [None]:
# Sampling space: integer distributions for the numeric knobs, plain lists for the rest.
param_dist = dict(
    n_estimators=randint(50, 200),     # scipy's randint excludes the upper bound -> 50..199
    max_depth=[None, 5, 10, 20],
    min_samples_split=randint(2, 11),  # -> 2..10
    criterion=['gini', 'entropy'],
)

# Draw 20 random candidates (seeded) and score each with the same 5-fold
# weighted-F1 protocol used for the grid search.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=20,
    scoring='f1_weighted',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


**Best Parameters from Randomized Search**

In [None]:
# Report the best sampled combination and its mean cross-validated score.
for label, value in (
    ("Best Parameters from RandomizedSearchCV:", random_search.best_params_),
    ("Best F1 Score (CV average):", random_search.best_score_),
):
    print(label, value)

Best Parameters from RandomizedSearchCV: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 3, 'n_estimators': 181}
Best F1 Score (CV average): 0.4973803640523965


**Randomized Search results**

In [None]:
# One row per sampled candidate: timings, parameters, per-fold scores, mean/std and rank.
pd.DataFrame.from_dict(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.191936,0.004196,0.014868,0.000272,gini,20.0,9,70,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",0.485502,0.511161,0.473243,0.48227,0.491078,0.488651,0.012654,5
1,0.341699,0.010436,0.022763,0.001786,gini,5.0,4,124,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.490205,0.496909,0.471036,0.459811,0.438972,0.471387,0.02093,15
2,1.655652,0.846439,0.120779,0.058242,gini,20.0,6,149,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",0.482143,0.509558,0.45461,0.453167,0.440232,0.467942,0.02489,16
3,1.777767,0.155,0.09728,0.021485,entropy,20.0,4,199,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.479762,0.447058,0.508473,0.520954,0.4147,0.47419,0.039162,12
4,0.635671,0.150242,0.04556,0.017774,gini,5.0,9,87,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.453259,0.523661,0.411805,0.48227,0.438972,0.461993,0.038312,18
5,0.441633,0.031745,0.028966,0.009627,entropy,20.0,6,107,"{'criterion': 'entropy', 'max_depth': 20, 'min...",0.47193,0.445685,0.489562,0.517847,0.44416,0.473837,0.027789,14
6,0.348484,0.055135,0.023661,0.010935,entropy,,10,98,"{'criterion': 'entropy', 'max_depth': None, 'm...",0.477098,0.450555,0.501605,0.479737,0.4147,0.464739,0.029806,17
7,0.900516,0.234499,0.058452,0.022668,gini,10.0,4,157,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.534649,0.447058,0.488339,0.503224,0.440678,0.48279,0.035179,9
8,1.194548,0.111492,0.066625,0.020463,gini,20.0,10,180,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",0.482328,0.461324,0.459588,0.453167,0.435598,0.458401,0.015032,20
9,0.347821,0.067783,0.032049,0.009009,gini,10.0,8,70,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.52753,0.491453,0.45677,0.463728,0.470887,0.482074,0.025519,10


**Select and evaluate the best models after tuning**

In [None]:
from sklearn.metrics import classification_report

# Estimator refit by GridSearchCV with the best parameter combination.
# NOTE: the following cell rebinds `best_rf_model` and `y_pred_best` to the
# RandomizedSearchCV estimator, so these values only hold until that cell runs.
best_rf_model = grid_search.best_estimator_

# Evaluate on test set
y_pred_best = best_rf_model.predict(X_test)
print("Classification Report for Best RF Model:")
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.00 (the same number the default shows)
# without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_best, zero_division=0))

Classification Report for Best RF Model:
              precision    recall  f1-score   support

           0       0.76      0.94      0.84        36
           1       0.00      0.00      0.00         9
           2       0.33      0.20      0.25         5
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.58        60
   macro avg       0.22      0.23      0.22        60
weighted avg       0.48      0.58      0.52        60



In [None]:
from sklearn.metrics import classification_report

# Estimator refit by RandomizedSearchCV with its best sampled parameters.
# NOTE: this rebinds `best_rf_model` / `y_pred_best`, replacing the
# GridSearchCV estimator assigned in the previous cell.
best_rf_model = random_search.best_estimator_

# Evaluate on test set
y_pred_best = best_rf_model.predict(X_test)
print("Classification Report for Best RF Model:")
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.00 (the same number the default shows)
# without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_best, zero_division=0))

Classification Report for Best RF Model:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        36
           1       0.00      0.00      0.00         9
           2       0.33      0.20      0.25         5
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.60        60
   macro avg       0.22      0.23      0.22        60
weighted avg       0.47      0.60      0.53        60



### ✅ Deliverables:

- Best hyperparameters from `GridSearchCV` and/or `RandomizedSearchCV`
- Best cross-validated F1-score for Random Forest
- Grid/Randomized Search results for reporting
- Optimized Random Forest model ready for final evaluation or deployment

**Next Step:** 2.7 Model Export & Deployment


## **2.7 Model Export & Deployment**

**Import joblib for exporting the model**

In [None]:
import joblib

**Define column types**

In [None]:
# Raw input feature names, in dataset order (the target is not included here).
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]

# Features that get one-hot encoded; every other raw column is treated as numeric.
categorical_cols = ['cp', 'thal', 'slope']
numerical_cols = list(filter(lambda name: name not in categorical_cols, columns))

**Build preprocessor + pipeline**

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes categories unseen during
# fitting encode as all zeros at prediction time instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Full pipeline: preprocessing followed by the Random Forest.
# The classifier reuses the hyperparameters selected by GridSearchCV in 2.6,
# so the exported pipeline carries the tuned configuration rather than the
# library defaults.
# NOTE(review): those parameters were tuned on X_train, whose feature
# representation is defined outside this section -- confirm they are the
# ones intended for the raw-feature pipeline.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, **grid_search.best_params_))
])

**Fit the pipeline on the original (raw) dataset**

In [None]:
# Reload the original dataset (before one-hot encoding).
# NOTE: this rebinds `columns` to the 14-name list that includes "target".
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# The file is read without a header row; "?" placeholders become missing
# values, anything else non-numeric is coerced to NaN, and incomplete rows
# are dropped.
df_raw = (
    pd.read_csv("heart_disease.csv", header=None, names=columns)
    .replace("?", pd.NA)
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
    .reset_index(drop=True)
)

# Split into the 13 raw features and the label column.
X_raw = df_raw.drop(columns="target")
y_raw = df_raw["target"]

In [None]:
# Fit the preprocessing steps and the classifier together on the raw features.
pipeline.fit(X=X_raw, y=y_raw)

In [None]:
# Serialize the fitted preprocessing + model pipeline into the working directory.
joblib.dump(value=pipeline, filename='heart_disease_pipeline.pkl')
print("✅ Pipeline saved as 'heart_disease_pipeline.pkl'")

✅ Pipeline saved as 'heart_disease_pipeline.pkl'


### ✅ Deliverables:

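- `best_random_forest_model.pkl`: optimized model *(listed as a deliverable, but no cell in this section writes this file)*
- `scaler.pkl`: saved StandardScaler *(likewise not written in this section; the fitted scaler is stored inside the pipeline's `preprocessing` step)*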
- `heart_disease_pipeline.pkl`: full preprocessing + model pipeline

**Next Step:** 2.8 Streamlit Web UI Development [Bonus]

**File Download Cell**

In [None]:
from google.colab import files
import joblib

# Save the pipeline
joblib.dump(pipeline, 'heart_disease_pipeline.pkl')

# Save the dataset
df_raw.to_csv('heart_disease.csv', index=False)

# Create a requirements file
!pip freeze > requirements.txt

# Download files
files.download('heart_disease_pipeline.pkl')
files.download('heart_disease.csv')
files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>