In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project folder referenced by later cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Root of the project on the mounted Drive; later cells build paths from it.
project_dir = "/content/drive/MyDrive/Heart_Disease_Project"

# Make the notebooks folder the working directory, then display its contents
# as the cell output.
notebooks_dir = os.path.join(project_dir, "notebooks")
os.chdir(notebooks_dir)
os.listdir()

['02_pca_analysis.ipynb',
 '03_feature_selection.ipynb',
 '01_data_preprocessing.ipynb',
 '04_supervised_learning.ipynb',
 '05_unsupervised_learning.ipynb']

In [9]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [10]:

# Load the feature-selected dataset from the project's data folder.
data_path = os.path.join(project_dir, "data", "heart_disease_selected.csv")
raw_df = pd.read_csv(data_path)

# NOTE(review): the rendered preview of this frame lists an "Unnamed: 0"
# header. If that is a real column (a row index written out by an earlier
# to_csv call), it must not reach the model as a feature. Filtering by name is
# a no-op when no such column exists — confirm against the CSV.
kept_columns = [col for col in raw_df.columns if not str(col).startswith("Unnamed")]
df = raw_df[kept_columns]

print("✅ Cleaned data loaded successfully!")
df.head()

✅ Cleaned data loaded successfully!


Unnamed: 0,cp_4.0,age,exang,cp_3.0,thalach,thal_7.0,chol,ca,trestbps,oldpeak,target
0,0,0.936181,0.0,0,0.017494,0,-0.276443,0.0,0.75038,1.068965,0
1,1,1.378929,1.0,0,-1.816334,0,0.744555,3.0,1.596266,0.381773,1
2,1,1.378929,1.0,0,-0.89942,1,-0.3535,2.0,-0.659431,1.326662,1
3,0,-1.94168,0.0,1,1.63301,0,0.051047,0.0,-0.095506,2.099753,0
4,0,-1.498933,0.0,0,0.978071,0,-0.835103,0.0,-0.095506,0.295874,0


In [11]:

# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the split is seeded and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:

# Untuned reference forest that the tuned models are compared against later.
baseline_params = dict(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
)
baseline_rf = RandomForestClassifier(**baseline_params)
baseline_rf.fit(X_train, y_train)

# Define Hyperparameter Grids

Hyperparameter grid for GridSearchCV


In [13]:
# Search space handed to GridSearchCV: 3 * 4 * 3 * 3 * 2 = 216 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)


Hyperparameter distribution for RandomizedSearchCV


In [14]:
# Candidate values handed to RandomizedSearchCV; slightly wider than the
# GridSearchCV grid (one more n_estimators value and one more max_depth value).
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[None, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

Run GridSearchCV

In [15]:
# Forest configuration shared by every grid candidate (same seed and class
# weighting as the baseline model).
grid_base_rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Exhaustive search over param_grid, scored by F1 with 5-fold cross-validation
# on the training split only.
grid_search = GridSearchCV(
    estimator=grid_base_rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator the search exposes for its best-scoring combination.
best_rf_grid = grid_search.best_estimator_
print("Best hyperparameters (GridSearchCV):", grid_search.best_params_)

Best hyperparameters (GridSearchCV): {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


Run RandomizedSearchCV

In [16]:
# Forest configuration shared by every sampled candidate (same seed and class
# weighting as the baseline model).
random_base_rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Evaluate 20 sampled combinations from param_dist, scored by F1 with 5-fold
# cross-validation on the training split; the sampling itself is seeded.
random_search = RandomizedSearchCV(
    estimator=random_base_rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Keep the estimator the search exposes for its best-scoring combination.
best_rf_random = random_search.best_estimator_
print("Best hyperparameters (RandomizedSearchCV):", random_search.best_params_)


Best hyperparameters (RandomizedSearchCV): {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 8}


In [19]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Score the GridSearchCV winner on the held-out test split.
y_pred_grid = best_rf_grid.predict(X_test)

grid_accuracy = accuracy_score(y_test, y_pred_grid)
grid_precision = precision_score(y_test, y_pred_grid)
grid_recall = recall_score(y_test, y_pred_grid)
grid_f1 = f1_score(y_test, y_pred_grid)

print("GridSearchCV Random Forest Performance:")
print(f"Accuracy : {grid_accuracy:.4f}")
print(f"Precision: {grid_precision:.4f}")
print(f"Recall   : {grid_recall:.4f}")
print(f"F1-score : {grid_f1:.4f}")

GridSearchCV Random Forest Performance:
Accuracy : 0.8333
Precision: 0.8750
Recall   : 0.7500
F1-score : 0.8077


In [20]:
# Score the RandomizedSearchCV winner on the held-out test split.
y_pred_random = best_rf_random.predict(X_test)

random_accuracy = accuracy_score(y_test, y_pred_random)
random_precision = precision_score(y_test, y_pred_random)
random_recall = recall_score(y_test, y_pred_random)
random_f1 = f1_score(y_test, y_pred_random)

print("RandomizedSearchCV Random Forest Performance:")
print(f"Accuracy : {random_accuracy:.4f}")
print(f"Precision: {random_precision:.4f}")
print(f"Recall   : {random_recall:.4f}")
print(f"F1-score : {random_f1:.4f}")

RandomizedSearchCV Random Forest Performance:
Accuracy : 0.8333
Precision: 0.8750
Recall   : 0.7500
F1-score : 0.8077


In [23]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Display name -> fitted model, in the order the reports are printed.
models_to_compare = dict([
    ('Baseline RF', baseline_rf),
    ('GridSearch RF', best_rf_grid),
    ('RandomizedSearch RF', best_rf_random),
])

# Print the same four test-set metrics for every model so they can be
# compared side by side.
for name in models_to_compare:
    model = models_to_compare[name]
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name} Performance:")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1-score : {f1:.4f}")


Baseline RF Performance:
Accuracy : 0.8333
Precision: 0.8750
Recall   : 0.7500
F1-score : 0.8077

GridSearch RF Performance:
Accuracy : 0.8333
Precision: 0.8750
Recall   : 0.7500
F1-score : 0.8077

RandomizedSearch RF Performance:
Accuracy : 0.8333
Precision: 0.8750
Recall   : 0.7500
F1-score : 0.8077


In [24]:
# One-off housekeeping: move this notebook from "Colab Notebooks" into the
# project's notebooks folder on Drive.
# NOTE(review): once the move has happened the source path no longer exists,
# so re-running this cell makes `mv` fail — confirm whether it should stay in
# the notebook.
!mv "/content/drive/MyDrive/Colab Notebooks/06_hyperparameter_tuning.ipynb" "/content/drive/MyDrive/Heart_Disease_Project/notebooks/06_hyperparameter_tuning.ipynb"


In [26]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Deployment pipeline built around the best GridSearch configuration.
#
# The model step is a clone (an unfitted copy with the same hyperparameters)
# rather than `best_rf_grid` itself: Pipeline.fit fits the very object it is
# given, so passing `best_rf_grid` directly would silently refit it on the
# full dataset — test rows included — and change what `best_rf_grid` and
# `models_to_compare` return if any evaluation cell is run again.
#
# NOTE(review): the preview of `df` shows values that already look
# standardized, so this scaler is fitted on scaled data — confirm what kind of
# inputs the deployed pipeline is expected to receive.
pipeline = Pipeline([
    ('scaler', StandardScaler()),        # optional for RF, required for some models like SVM
    ('model', clone(best_rf_grid))       # or clone(best_rf_random)
])

# Fit pipeline on entire dataset for deployment
X = df.drop('target', axis=1)
y = df['target']
pipeline.fit(X, y)


In [28]:
import os

# Folder for serialized models. Derived from project_dir (as the data path is)
# so the project root is defined in exactly one place; with the current
# project_dir this resolves to the same location as before.
save_path = os.path.join(project_dir, 'models')
os.makedirs(save_path, exist_ok=True)


In [29]:
import joblib

# Destination of the serialized pipeline inside the models folder
model_filename = 'heart_disease_pipeline.pkl'
file_path = os.path.join(save_path, model_filename)

# Write the fitted pipeline to disk, then report where it was stored
joblib.dump(value=pipeline, filename=file_path)
print(f"Pipeline saved successfully at: {file_path}")

Pipeline saved successfully at: /content/drive/MyDrive/Heart_Disease_Project/models/heart_disease_pipeline.pkl
