Importing libraries

In [3]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Loading the dataset 

In [4]:
# Load the feature-selected dataset and build a stratified train/test split.
DATASET_PATH = 'C:/MENNA_DATA/Heart_Disease_Project/dataset attributes/feature_selected_dataset.csv'

try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Fail the cell with a real exception instead of calling exit(): exit()
    # discards the traceback and ends the session rather than reporting an
    # error, so "Run All" gives no usable signal about what went wrong.
    raise FileNotFoundError(
        f"Error: 'feature_selected_dataset.csv' not found at '{DATASET_PATH}'."
    ) from err
print("Feature-selected dataset loaded.")

# 'target' is the label column; every remaining column is used as a feature.
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for testing, stratified on the label and seeded so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Feature-selected dataset loaded.


Baseline model (Random Forest)

In [18]:
# Fit an untuned random forest; its test accuracy is the reference point for
# the hyperparameter tuning that follows.
base_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred_base = base_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_base)

print("--- Baseline Random Forest Performance ---")
print(f"\tAccuracy: {baseline_accuracy:.4f}")

--- Baseline Random Forest Performance ---
	Accuracy: 0.8852


Hyperparameter Tuning

In [None]:
# 1-RandomizedSearchCV
# Candidate values for each random-forest hyperparameter.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

print("\n--- Running RandomizedSearchCV ---")

# Evaluate 100 sampled parameter combinations with 5-fold cross-validation on
# the training split; the seed keeps the sampled candidates reproducible.
rf_random = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters from RandomizedSearch:", rf_random.best_params_)


--- Running RandomizedSearchCV ---
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters from RandomizedSearch: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20, 'bootstrap': True}


In [16]:
# 2-Grid SearchCV
# Exhaustive grid: every combination of the values below is evaluated.
param_grid_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

print("\n--- Running GridSearchCV ---")

# 5-fold cross-validation over the full grid, fitted on the training split.
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print("Best parameters from GridSearchCV:", grid_search.best_params_)


--- Running GridSearchCV ---
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters from GridSearchCV: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [22]:
# Evaluation Report
# Evaluate the tuned model on the held-out split, then persist both the model
# and the text report.
# NOTE(review): only the RandomizedSearchCV winner is used here; the
# GridSearchCV result is never consulted - confirm this is intended.
best_model = rf_random.best_estimator_
y_pred_best = best_model.predict(X_test)

print("\n--- Final Model Performance after Hyperparameter Tuning ---")
report = classification_report(y_test, y_pred_best)
print(report)

model_path = 'C:/MENNA_DATA/Heart_Disease_Project/models/final_model.pkl'
filename = 'C:/MENNA_DATA/Heart_Disease_Project/results/final_evaluation_report.txt'

# Create the output folders up front so saving does not fail with
# FileNotFoundError when 'models/' or 'results/' does not exist yet.
for out_path in (model_path, filename):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, model_path)
print("\nOptimized model saved to 'models/final_model.pkl'")

# Explicit encoding keeps the report file identical across platforms.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(report)


--- Final Model Performance after Hyperparameter Tuning ---
              precision    recall  f1-score   support

           0       0.94      0.88      0.91        33
           1       0.87      0.93      0.90        28

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61


Optimized model saved to 'models/final_model.pkl'
