In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load data
X = pd.read_csv("D:\AI & ML Sprints\data\selected_features.csv")
y = pd.read_csv("D:\AI & ML Sprints\data\cleaned_heart_disease.csv")['target']

# Define models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'saga']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5]
        }
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}

# Perform tuning and save best models
best_models = {}
for name, config in models.items():
    # Grid Search
    gs = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='accuracy'
    )
    gs.fit(X, y)
    
    # Save best model
    best_models[name] = gs.best_estimator_
    joblib.dump(gs.best_estimator_, f'D:/AI & ML Sprints/models/best_{name}.pkl')
    
    # Print results
    print(f"Best {name} params:", gs.best_params_)
    print(f"Best {name} accuracy:", gs.best_score_)
    print("----------")

# Compare with baseline performance
baseline_metrics = pd.read_csv('D:/AI & ML Sprints/results/supervised_metrics.csv')
print("\nBaseline vs Optimized Performance:\n", baseline_metrics)

Best LogisticRegression params: {'C': 1, 'solver': 'liblinear'}
Best LogisticRegression accuracy: 0.815191256830601
----------
Best RandomForest params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best RandomForest accuracy: 0.8053551912568306
----------
Best SVM params: {'C': 0.1, 'kernel': 'linear'}
Best SVM accuracy: 0.821639344262295
----------

Baseline vs Optimized Performance:
             Unnamed: 0  Accuracy  F1 Score   ROC AUC
0  Logistic Regression  0.868852  0.870968  0.946659
1        Decision Tree  0.704918  0.735294  0.700970
2        Random Forest  0.786885  0.786885  0.887931
3                  SVM  0.868852  0.870968  0.931573


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load preprocessing steps from earlier
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'trestbps', 'chol']),
        ('cat', OneHotEncoder(), ['cp', 'thal'])
    ])

# Create final pipeline with best model (e.g., LogisticRegression)
best_model = best_models['LogisticRegression']

final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])

# Save complete pipeline
joblib.dump(final_pipeline, 'D:/AI & ML Sprints/models/final_pipeline.pkl')

['D:/AI & ML Sprints/models/final_pipeline.pkl']