In [4]:
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive so the project files are reachable from the Colab runtime
drive.mount("/content/drive")

# Project data directory on Drive
project_path = "/content/drive/MyDrive/HeartProject/data"

# Load data: the three CSV files are read in the same order as they are unpacked.
X_scaled_df, X_final, y_frame = (
    pd.read_csv(os.path.join(project_path, csv_name))
    for csv_name in ("X_scaled.csv", "X_final.csv", "y_binary.csv")
)
# squeeze() collapses the target frame; the recorded output shows a 1-D shape of (303,).
y_binary = y_frame.squeeze()

print("✅ Data loaded successfully from Drive")
print("Shapes:", X_scaled_df.shape, X_final.shape, y_binary.shape)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Data loaded successfully from Drive
Shapes: (303, 18) (303, 6) (303,)


In [7]:
from sklearn.model_selection import train_test_split

# Settings shared by both splits: 20% held out, fixed seed, class ratio
# preserved through stratification on the target.
# NOTE(review): with the same seed and the same stratify target, both calls
# should select the same rows, so the *_all and *_final targets are expected
# to match -- confirm if this is relied on.
split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)

# Train-test split for ALL features
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_scaled_df, y_binary, **split_options
)

# Train-test split for FINAL features
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final, y_binary, **split_options
)



# STEP 6 - Hyperparameter Tuning

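In this step, we tune the hyperparameters of our models to try to improve on their baseline performance.  
We use **GridSearchCV** with 5-fold cross-validation for:  

- **K-Nearest Neighbors (KNN)**  
- **Random Forest Classifier**  
- **Support Vector Machine (SVM)**

This allows us to find the best parameter settings for each model, with the goal of achieving higher accuracy than the baseline models. Each search is run on both the full feature set and the reduced final feature set, and the best configuration is then scored on the held-out test split.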


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# Hyperparameter search spaces, one per model family.
# GridSearchCV tries every combination of the listed values.

# Random Forest: 2 * 4 * 2 * 2 = 32 combinations
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5],
    max_features=["sqrt", "log2"],
)

# K-Nearest Neighbors: 4 * 2 * 2 = 16 combinations
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

# Support Vector Machine: 3 * 2 * 2 = 12 combinations
# NOTE(review): per the SVC docs, gamma only applies to the rbf/poly/sigmoid
# kernels, so the linear-kernel combinations appear to be fitted twice with
# identical results -- confirm, and consider a list of per-kernel grids.
svm_params = dict(
    C=[0.1, 1, 10],
    kernel=["linear", "rbf"],
    gamma=["scale", "auto"],
)

# Function for tuning
def tune_models(X_train, y_train, X_test, y_test, label="ALL"):
    """Grid-search Random Forest, KNN and SVM on one feature set and report scores.

    For each model a ``GridSearchCV`` (5-fold, accuracy scoring, all cores) is
    fitted on the training data using the module-level grids ``rf_params``,
    ``knn_params`` and ``svm_params``. The best parameters, the best mean
    cross-validated accuracy and the accuracy of the refitted best estimator
    on the test data are printed.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and target used only for the reported test accuracy.
    label : str, default "ALL"
        Name of the feature set, shown in the printed section header.

    Returns
    -------
    dict
        Maps ``"RandomForest"``, ``"KNN"`` and ``"SVM"`` to a dict with keys
        ``"best_params"``, ``"cv_score"``, ``"test_accuracy"`` and ``"search"``
        (the fitted ``GridSearchCV``), so later cells can build summaries from
        computed values instead of retyping them.

    Notes
    -----
    The test accuracy is informational. Choosing between models based on it
    would leak test data into model selection; use the CV score for that.
    """
    print(f"\n=== Tuning with {label} Features ===")

    # (name in the "Best ..." lines, name in the test-accuracy line / result key,
    #  estimator, parameter grid). The two names differ for Random Forest only,
    #  which keeps the printed text identical to the earlier output.
    candidates = [
        ("RF", "RandomForest", RandomForestClassifier(random_state=42), rf_params),
        ("KNN", "KNN", KNeighborsClassifier(), knn_params),
        ("SVM", "SVM", SVC(), svm_params),
    ]

    results = {}
    for position, (short_name, full_name, estimator, param_grid) in enumerate(candidates):
        search = GridSearchCV(estimator, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        search.fit(X_train, y_train)
        test_accuracy = accuracy_score(y_test, search.best_estimator_.predict(X_test))

        # Every model after the first is separated from the previous one by a blank line.
        lead = "" if position == 0 else "\n"
        print(f"{lead}Best {short_name} Params:", search.best_params_)
        print(f"Best {short_name} Score (CV):", search.best_score_)
        print(f"{full_name} Test Accuracy:", test_accuracy)

        results[full_name] = {
            "best_params": search.best_params_,
            "cv_score": search.best_score_,
            "test_accuracy": test_accuracy,
            "search": search,
        }

    return results

# Run tuning for both cases: the full scaled feature set first, then the
# reduced final feature set.
for _label, _split in (
    ("ALL", (X_train_all, y_train_all, X_test_all, y_test_all)),
    ("FINAL", (X_train_final, y_train_final, X_test_final, y_test_final)),
):
    tune_models(*_split, label=_label)



=== Tuning with ALL Features ===
Best RF Params: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Best RF Score (CV): 0.8470238095238095
RandomForest Test Accuracy: 0.9016393442622951

Best KNN Params: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}
Best KNN Score (CV): 0.8097789115646258
KNN Test Accuracy: 0.9180327868852459

Best SVM Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best SVM Score (CV): 0.8385204081632655
SVM Test Accuracy: 0.8852459016393442

=== Tuning with FINAL Features ===
Best RF Params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
Best RF Score (CV): 0.8470238095238095
RandomForest Test Accuracy: 0.8524590163934426

Best KNN Params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best KNN Score (CV): 0.8140306122448979
KNN Test Accuracy: 0.8032786885245902

Best SVM Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best SVM Score (CV): 0.83

In [11]:
import pandas as pd

# === Final Accuracy Summary ===
# Test accuracies entered by hand, in display order.
# The three "(Tuned)" values match the ALL-features test accuracies printed by
# the tuning cell, rounded to three decimals.
# NOTE(review): the baseline, voting and stacking values are not computed in
# the visible cells -- presumably copied from other steps; confirm they are
# still current.
final_results = dict([
    ("Logistic Regression (Baseline)", 0.836),
    ("KNN (Baseline)", 0.918),
    ("Decision Tree (Baseline)", 0.869),
    ("Random Forest (Baseline)", 0.918),
    ("SVM (Baseline)", 0.852),
    ("Random Forest (Tuned)", 0.902),
    ("KNN (Tuned)", 0.918),
    ("SVM (Tuned)", 0.885),
    ("Voting (Hard)", 0.902),
    ("Voting (Soft)", 0.885),
    ("Stacking", 0.885),
])

# One row per model, in the insertion order of the mapping above.
summary_df = pd.DataFrame(
    {
        "Model": list(final_results.keys()),
        "Accuracy": list(final_results.values()),
    }
)

print("=== Final Model Accuracies ===")
display(summary_df)


=== Final Model Accuracies ===


Unnamed: 0,Model,Accuracy
0,Logistic Regression (Baseline),0.836
1,KNN (Baseline),0.918
2,Decision Tree (Baseline),0.869
3,Random Forest (Baseline),0.918
4,SVM (Baseline),0.852
5,Random Forest (Tuned),0.902
6,KNN (Tuned),0.918
7,SVM (Tuned),0.885
8,Voting (Hard),0.902
9,Voting (Soft),0.885
