In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [16]:
df = pd.read_csv("../data/heart_disease.csv")
X = df.drop("num", axis=1)
y = df["num"]     

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
param_grid = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=2000),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [5, 10, None],
            "min_samples_split": [2, 5, 10]
        }
    },
    "SVM": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf", "poly"]
        }
    }
}

In [20]:
best_models = {}
for name, mp in param_grid.items():
    print(f"\n Running GridSearch for {name}...")
    clf = GridSearchCV(mp["model"], mp["params"], cv=5, scoring="accuracy")
    clf.fit(X_train_scaled, y_train)
    
    print("Best Params:", clf.best_params_)
    y_pred = clf.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    
    best_models[name] = {
        "best_model": clf.best_estimator_,
        "accuracy": acc
    }


 Running GridSearch for Logistic Regression...
Best Params: {'C': 1, 'solver': 'lbfgs'}
Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        36
           1       0.50      0.22      0.31         9
           2       0.25      0.20      0.22         5
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.65        60
   macro avg       0.31      0.28      0.29        60
weighted avg       0.59      0.65      0.60        60


 Running GridSearch for Decision Tree...
Best Params: {'max_depth': 3, 'min_samples_split': 2}
Accuracy: 0.6333333333333333
              precision    recall  f1-score   support

           0       0.81      0.97      0.89        36
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         5
           3       0.19      0.43      0.26         7
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.6166666666666667
              precision    recall  f1-score   support

           0       0.80      0.92      0.86        36
           1       0.33      0.22      0.27         9
           2       0.00      0.00      0.00         5
           3       0.25      0.29      0.27         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        60
   macro avg       0.28      0.28      0.28        60
weighted avg       0.56      0.62      0.59        60


 Running GridSearch for SVM...
Best Params: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.6333333333333333
              precision    recall  f1-score   support

           0       0.78      1.00      0.88        36
           1       0.33      0.11      0.17         9
           2       0.14      0.20      0.17         5
           3       0.00      0.00      0.00         7
           4       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
best_model_name = max(best_models, key=lambda x: best_models[x]["accuracy"])
print("\n Best Model:", best_model_name)
print("Accuracy:", best_models[best_model_name]["accuracy"])


 Best Model: Logistic Regression
Accuracy: 0.65


In [22]:
joblib.dump('LogisticRegression', "../models/final_model.pkl")
joblib.dump('scaler', "../models/scaler.pkl")

['../models/scaler.pkl']