In [1]:
from google.colab import drive
import pandas as pd
import os

# Make the Drive contents reachable under /content/drive
drive.mount("/content/drive")

# Project data directory on Drive
project_path = "/content/drive/MyDrive/HeartProject/data"


def _read_project_csv(filename):
    """Read one CSV file from the project data directory into a DataFrame."""
    return pd.read_csv(os.path.join(project_path, filename))


# Load the two feature matrices and the target.
X_scaled_df = _read_project_csv("X_scaled.csv")
X_final = _read_project_csv("X_final.csv")
# squeeze() turns the one-column target frame into a 1-D Series.
y_binary = _read_project_csv("y_binary.csv").squeeze()

print("✅ Data loaded successfully from Drive")
print("Shapes:", X_scaled_df.shape, X_final.shape, y_binary.shape)


Mounted at /content/drive
✅ Data loaded successfully from Drive
Shapes: (303, 18) (303, 6) (303,)


# Step 4: Supervised Learning Models

In this step, we train and evaluate multiple supervised learning models.  
To simplify the problem, the target variable `num` is converted into **binary classes** (loaded in this notebook as `y_binary`):
- 0 → Healthy (no disease)
- 1, 2, 3, 4 → Diseased

This is the standard approach in most research papers using the Cleveland dataset.  

We will then:
1. Train baseline models with mostly default parameters (Logistic Regression is given `solver="liblinear"` and `max_iter=1000`).
2. Apply **Hyperparameter Tuning (GridSearchCV)** to improve their performance.

Models tested:
- Logistic Regression
- K-Nearest Neighbors (KNN)
- Decision Tree
- Random Forest
- Support Vector Machine (SVM)


In [2]:
import os

# Destination folder in Google Drive; created if it does not exist yet.
drive_path = "/content/drive/MyDrive/HeartProject/data"
os.makedirs(drive_path, exist_ok=True)

# Write the final feature matrix and the binary target as CSV, without the index.
# NOTE(review): this is the same folder and the same file names that the loading
# cell reads X_final / y_binary from, so this step rewrites the files that were
# just loaded -- confirm it is still needed.
for out_name, out_obj in (("X_final.csv", X_final), ("y_binary.csv", y_binary)):
    out_obj.to_csv(os.path.join(drive_path, out_name), index=False)

print("✅ Saved X_final.csv and y_binary.csv in Google Drive")


✅ Saved X_final.csv and y_binary.csv in Google Drive


Binary Target Transformation

Baseline Models

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def evaluate_baselines(models, X_train, X_test, y_train, y_test, title):
    """Fit every model on the training split and report its test accuracy.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator. Each estimator is fit in place,
        so after the call the objects in ``models`` hold the fit produced by
        this call (a later call refits the same objects).
    X_train, X_test
        Feature matrices of the training and test splits.
    y_train, y_test
        Target values of the training and test splits.
    title : str
        Header line printed before the per-model results.

    Returns
    -------
    dict
        Maps each model name to its accuracy on the test split.
    """
    print(title)
    accuracies = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))
        accuracies[name] = acc
        print(f"{name}: {acc:.3f}")
    return accuracies


# Baseline estimators; seeds are fixed where the estimator accepts one.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="liblinear"),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Both experiments use the same split settings: 20% hold-out, fixed seed,
# stratified on the target.
split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)

# -----------------------
# 1. Using ALL features
# -----------------------
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_scaled_df, y_binary, **split_options
)
accuracies_all = evaluate_baselines(
    models, X_train_all, X_test_all, y_train_all, y_test_all,
    title="=== Baseline with ALL Features (X_scaled_df) ===",
)

# -----------------------
# 2. Using FINAL features
# -----------------------
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final, y_binary, **split_options
)
# The same estimator objects are refit here, so `models` ends up fitted on the
# FINAL-feature training split.
accuracies_final = evaluate_baselines(
    models, X_train_final, X_test_final, y_train_final, y_test_final,
    title="\n=== Baseline with FINAL Features (X_final) ===",
)


=== Baseline with ALL Features (X_scaled_df) ===
Logistic Regression: 0.836
KNN: 0.918
Decision Tree: 0.869
Random Forest: 0.918
SVM: 0.852

=== Baseline with FINAL Features (X_final) ===
Logistic Regression: 0.852
KNN: 0.885
Decision Tree: 0.738
Random Forest: 0.787
SVM: 0.869
