In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor


from sklearn.metrics import recall_score, precision_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

## Загрузка данных

In [2]:
heart_df = pd.read_csv('./Data/heart.csv')
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## 1. Кодирование переменных

In [3]:
heart_df = pd.concat([heart_df, pd.get_dummies(heart_df['ChestPainType'])], axis=1).drop('ChestPainType', axis=1)
heart_df = pd.concat([heart_df, pd.get_dummies(heart_df['RestingECG'])], axis=1).drop('RestingECG', axis=1)
heart_df = pd.concat([heart_df, pd.get_dummies(heart_df['ST_Slope'])], axis=1).drop('ST_Slope', axis=1)
heart_df['Sex'] = LabelEncoder().fit_transform(heart_df['Sex'])
heart_df['ExerciseAngina'] = LabelEncoder().fit_transform(heart_df['ExerciseAngina'])

## 2. Разделение данных на train и test

In [4]:
X = heart_df.drop('HeartDisease', axis=1)
y = heart_df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 3. Обучение модели логистической регрессии без параметров

In [5]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

### 3.1 Основные метрики модели на кросс-валидации

In [6]:
default_log_reg = pd.DataFrame(
    cross_validate(log_reg_model, X_train, y_train, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])
).drop(['fit_time', 'score_time'], axis=1)

## 4. Оптимизация моделей через GridSearchCV 

In [7]:
parametrs = {
    'multi_class': ('auto', 'ovr', 'multinomial'),
    'l1_ratio': np.arange(0, 1.1, 0.1),
    'solver': ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga')
}

grid_search = GridSearchCV(log_reg_model, parametrs)
grid_search.fit(X_train, y_train)

### 4.1 Проверка метрик модели с оптимизированными параметрами на кросс валидации

In [8]:
GS_lof_reg = pd.DataFrame(cross_validate(log_reg_model.set_params(**grid_search.best_estimator_.get_params()), 
    X_train, 
    y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1']
)).drop(['fit_time', 'score_time'], axis=1)

## 5. Оптимизация моделей через RandomizedSearchCV 

In [9]:
parametrs = {
    'multi_class': ('auto', 'ovr', 'multinomial'),
    'l1_ratio': np.arange(0, 1.1, 0.1),
    'solver': ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga')
}

random_search = RandomizedSearchCV(log_reg_model, parametrs, random_state=42).fit(X_train, y_train)

### 4.1 Проверка метрик модели с оптимизированными параметрами на кросс валидации

In [10]:
RS_log_reg = pd.DataFrame(cross_validate(log_reg_model.set_params(**random_search.best_params_), 
    X_train, 
    y_train,
    cv=10,
    scoring=['accuracy', 'recall', 'precision', 'f1']
)).drop(['fit_time', 'score_time'], axis=1)

## Сравнение метрик

In [14]:
default_log_reg.mean()

test_accuracy     0.861033
test_recall       0.893264
test_precision    0.866897
test_f1           0.878196
dtype: float64

In [15]:
GS_lof_reg.mean()

test_accuracy     0.854165
test_recall       0.885947
test_precision    0.861208
test_f1           0.871727
dtype: float64

In [16]:
RS_log_reg.mean()

test_accuracy     0.854165
test_recall       0.885947
test_precision    0.861208
test_f1           0.871727
dtype: float64

> Если взять среднее значение метрик качества, то после оптимизации они показали себя чуть лучше