In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [3]:
df = pd.read_csv("../data/heart_selected.csv")
X = df.drop("target", axis=1)
y = df["target"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)






Train shape: (237, 10) Test shape: (60, 10)


In [4]:
baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train, y_train)

y_pred = baseline_rf.predict(X_test)
baseline_acc = accuracy_score(y_test, y_pred)

print("Baseline Random Forest Accuracy:", baseline_acc)



Baseline Random Forest Accuracy: 0.8166666666666667


In [5]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
rf_acc = accuracy_score(y_test, best_rf.predict(X_test))

print("Best Params (Random Forest):", grid_search.best_params_)
print("Random Forest Test Accuracy:", rf_acc)


Best Params (Random Forest): {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest Test Accuracy: 0.8333333333333334


In [6]:
param_dist = {
    "C": [0.1, 1, 10, 100],
    "gamma": [1, 0.1, 0.01, 0.001],
    "kernel": ["rbf", "linear"]
}

random_search = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_svm = random_search.best_estimator_
svm_acc = accuracy_score(y_test, best_svm.predict(X_test))

print("Best Params (SVM):", random_search.best_params_)
print("SVM Test Accuracy:", svm_acc)



Best Params (SVM): {'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
SVM Test Accuracy: 0.8666666666666667


In [7]:
print("\nModel Comparison:")
print(f"Baseline RF Accuracy: {baseline_acc:.3f}")
print(f"Tuned RF Accuracy: {rf_acc:.3f}")
print(f"Tuned SVM Accuracy: {svm_acc:.3f}")

# Pick best model
if rf_acc >= svm_acc:
    best_model = best_rf
    print("Selected Model: Random Forest")
else:
    best_model = best_svm
    print("Selected Model: SVM")



Model Comparison:
Baseline RF Accuracy: 0.817
Tuned RF Accuracy: 0.833
Tuned SVM Accuracy: 0.867
Selected Model: SVM


In [8]:
print("\nModel Comparison:")
print(f"Baseline RF Accuracy: {baseline_acc:.3f}")
print(f"Tuned RF Accuracy: {rf_acc:.3f}")
print(f"Tuned SVM Accuracy: {svm_acc:.3f}")

# Pick best model
if rf_acc >= svm_acc:
    best_model = best_rf
    print("Selected Model: Random Forest")
else:
    best_model = best_svm
    print("Selected Model: SVM")




Model Comparison:
Baseline RF Accuracy: 0.817
Tuned RF Accuracy: 0.833
Tuned SVM Accuracy: 0.867
Selected Model: SVM


In [9]:
import os       # for creating folders
import joblib   # for saving/loading models

# Make sure models folder exists
os.makedirs("models", exist_ok=True)

# Save best model
joblib.dump(best_model, "models/final_model.pkl")

print("Best model saved as models/final_model.pkl")


Best model saved as models/final_model.pkl


In [10]:
# Reload model to confirm
loaded_model = joblib.load("models/final_model.pkl")

sample = X.iloc[[0]]
print("Sample prediction:", loaded_model.predict(sample))


Sample prediction: [0]


In [11]:
print(X_train.columns.tolist())


['ca', 'thalach', 'oldpeak', 'cp_4.0', 'thal_7.0', 'age', 'chol', 'trestbps', 'exang', 'slope_2.0']
