In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pickle


heart_dataset = pd.read_csv("../datasets/heart.csv")


x = heart_dataset.drop(columns=['target'], axis=1)
y = heart_dataset['target']


scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)


x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


model = LogisticRegression(random_state=42, max_iter=1000)


param_grid = [
    {'model__C': [0.01, 0.1, 1, 10, 100], 'model__penalty': ['l2'], 'model__solver': ['lbfgs', 'saga']},
    {'model__C': [0.01, 0.1, 1, 10, 100], 'model__penalty': ['l1'], 'model__solver': ['liblinear', 'saga']},
    {'model__C': [0.01, 0.1, 1, 10, 100], 'model__penalty': ['elasticnet'], 'model__solver': ['saga'], 'model__l1_ratio': [0.5]}
]


pipeline = Pipeline([
    ('oversample', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('model', model)
])


grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)


best_model = grid_search.best_estimator_


cv_scores = cross_val_score(best_model, x_train, y_train, cv=5)
print(f"Logistic Regression Cross-Validation Accuracy: {np.mean(cv_scores) * 100:.2f}%")


best_model.fit(x_train, y_train)


train_predictions = best_model.predict(x_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Logistic Regression Training Accuracy: {train_accuracy * 100:.2f}%")


test_predictions = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Logistic Regression Test Accuracy: {test_accuracy * 100:.2f}%")


filename = 'heart_model.sav'
pickle.dump(best_model, open(filename, 'wb'))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Logistic Regression Cross-Validation Accuracy: 83.47%
Logistic Regression Training Accuracy: 84.71%
Logistic Regression Test Accuracy: 86.89%
