In [1]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load the pre-split feature/target CSVs. Each path is passed straight to
# read_csv so a name only ever refers to one kind of object (a DataFrame),
# never first a path string and then a frame.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this E: drive layout exists.
x_train_reduced = pd.read_csv('E:/Heart_Disease_Project/data/x_train_reduced.csv')
# Targets come back from read_csv as DataFrames (2-D), not 1-D Series.
y_train = pd.read_csv('E:/Heart_Disease_Project/data/y_train.csv')

x_test_reduced = pd.read_csv('E:/Heart_Disease_Project/data/x_test_reduced.csv')
y_test = pd.read_csv('E:/Heart_Disease_Project/data/y_test.csv')

# Search spaces for the randomized hyperparameter search. Every value is a
# plain list, so each grid is a finite set of discrete candidates.

# Logistic Regression: regularisation strength C, L2 penalty only, two solvers.
param_logreg = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=["l2"],
    solver=["lbfgs", "saga"],
)

# Decision Tree: depth limit (None = unlimited), split/leaf size floors,
# and the impurity criterion.
param_dt = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    criterion=["gini", "entropy"],
)

# Random Forest: ensemble size, per-tree depth and split/leaf floors,
# and whether trees are fit on bootstrap samples.
param_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# SVM: regularisation strength C, kernel family, and gamma setting.
param_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=["linear", "rbf", "poly"],
    gamma=["scale", "auto"],
)

In [2]:
# Dictionary of models + parameter grids
# Maps a display name to (unfitted estimator, search space for that estimator).
models_params = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), param_logreg),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), param_dt),
    "Random Forest": (RandomForestClassifier(random_state=42), param_rf),
    "SVM": (SVC(probability=True, random_state=42), param_svm)
}

best_models = {}  # display name -> best estimator found by the search
results = []      # one summary record (name, params, CV score) per model

# y_train comes from pd.read_csv and is therefore a 2-D single-column frame.
# Estimators expect a 1-D target and emit a DataConversionWarning on every
# fit otherwise, so flatten it once here. np.ravel is a no-op on data that
# is already 1-D.
y_train_1d = np.ravel(y_train)

for name, (model, params) in models_params.items():
    print(f"🔍 Tuning {name} ...")

    # Every grid value is a list, so the search space is finite. Asking for
    # more iterations than there are candidates only makes scikit-learn warn
    # and fall back to the full grid; cap n_iter up front instead.
    n_candidates = int(np.prod([len(values) for values in params.values()]))

    # RandomizedSearch for speed
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=min(20, n_candidates),
        scoring="f1_weighted",
        cv=5,
        random_state=42,
        n_jobs=-1
    )

    random_search.fit(x_train_reduced, y_train_1d)

    best_models[name] = random_search.best_estimator_
    results.append({
        "Model": name,
        "Best Params": random_search.best_params_,
        "Best CV Score": random_search.best_score_
    })

results_df = pd.DataFrame(results)
print("\n Hyperparameter Tuning Results:")
print(results_df)

# Save results
results_df.to_csv("tuning_results.csv", index=False)


🔍 Tuning Logistic Regression ...


  y = column_or_1d(y, warn=True)


🔍 Tuning Decision Tree ...
🔍 Tuning Random Forest ...


  return fit_method(estimator, *args, **kwargs)


🔍 Tuning SVM ...

 Hyperparameter Tuning Results:
                 Model                                        Best Params  \
0  Logistic Regression      {'solver': 'saga', 'penalty': 'l2', 'C': 100}   
1        Decision Tree  {'min_samples_split': 2, 'min_samples_leaf': 1...   
2        Random Forest  {'n_estimators': 100, 'min_samples_split': 5, ...   
3                  SVM     {'kernel': 'poly', 'gamma': 'scale', 'C': 100}   

   Best CV Score  
0       0.874134  
1       0.989027  
2       0.990246  
3       0.987807  


  y = column_or_1d(y, warn=True)


In [3]:
from sklearn.metrics import classification_report

# Score every tuned estimator on the test features and print its per-class
# precision / recall / F1 report.
for name in best_models:
    model = best_models[name]
    heading = f"\n📊 Final Evaluation for {name}"
    print(heading)
    y_pred = model.predict(x_test_reduced)
    report = classification_report(y_test, y_pred)
    print(report)



📊 Final Evaluation for Logistic Regression
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       101
           1       0.86      0.87      0.86       104

    accuracy                           0.86       205
   macro avg       0.86      0.86      0.86       205
weighted avg       0.86      0.86      0.86       205


📊 Final Evaluation for Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       101
           1       1.00      1.00      1.00       104

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205


📊 Final Evaluation for Random Forest
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       101
           1       1.00      0.97      0.99       104

    accuracy                           0.99       205
   macro

In [4]:
from sklearn.pipeline import Pipeline
import joblib

# Bundle the stored preprocessing step with the tuned Random Forest and
# persist the pair as a single scikit-learn Pipeline.
final_model = best_models["Random Forest"]

# joblib.load unpickles arbitrary objects; only point it at trusted files.
preprocessor = joblib.load("E:/Heart_Disease_Project/models/preprocessor.pkl")

# NOTE(review): the classifier was fit on x_train_reduced. Confirm that the
# loaded preprocessor outputs exactly that feature space, otherwise the saved
# pipeline will not reproduce the evaluation results - TODO verify.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", final_model),
]
final_pipeline = Pipeline(pipeline_steps)

# Write the combined pipeline to disk as a .pkl file.
# NOTE(review): the file is written relative to the working directory
# ("../models/"), while the message below prints "models/..." - confirm the
# intended location.
joblib.dump(final_pipeline, "../models/final_model.pkl")
print("💾 Final pipeline saved as models/final_model.pkl")


💾 Final pipeline saved as models/final_model.pkl
