# Задание
**Цель:**  
Применить на практике алгоритмы по автоматической оптимизации параметров моделей машинного обучения.  
**Описание задания:**  
В домашнем задании нужно решить задачу классификации наличия болезни сердца у пациентов наиболее эффективно. Данные для обучения моделей необходимо загрузить самостоятельно с сайта. Целевая переменная – наличие болезни сердца (HeartDisease). Она принимает значения 0 или 1 в зависимости от отсутствия или наличия болезни соответственно. Подробное описание признаков можно прочесть в описании датасета на сайте. Для выполнения работы не обязательно вникать в медицинские показатели.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence and
# failed-fit warnings emitted by the model searches further down - confirm
# that is intended before trusting the search results.
warnings.filterwarnings("ignore")

### 1.Получите данные и загрузите их в рабочую среду.

In [None]:
# Load the heart-disease CSV into a DataFrame and preview it.
csv_path = '/content/drive/MyDrive/feml_data/heart.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### 2.Подготовьте датасет к обучению моделей:
a) Категориальные переменные переведите в цифровые значения. Можно использовать pd.get_dummies, preprocessing.LabelEncoder. Старайтесь не использовать для этой задачи циклы.


In [None]:
# Show dtypes and non-null counts; object-typed columns are the categorical
# ones that still need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Кодирование категориальных столбцов

In [None]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance. Every encoding cell below re-fits it, so after
# those cells run it only remembers the classes of the last column encoded.
le = LabelEncoder()

In [None]:
# Replace the 'Sex' labels with integer codes.
data['Sex'] = le.fit_transform(data['Sex'])

In [None]:
# Replace the 'ChestPainType' labels with integer codes.
data['ChestPainType'] = le.fit_transform(data['ChestPainType'])

In [None]:
# Replace the 'RestingECG' labels with integer codes.
data['RestingECG'] = le.fit_transform(data['RestingECG'])

In [None]:
# Replace the 'ExerciseAngina' labels with integer codes.
data['ExerciseAngina'] = le.fit_transform(data['ExerciseAngina'])

In [None]:
# Replace the 'ST_Slope' labels with integer codes.
data['ST_Slope'] = le.fit_transform(data['ST_Slope'])

In [None]:
# Inspect the youngest age, the oldest age and the distinct ages before binning.
ages = data['Age']
print(ages.min(), ages.max(), ages.unique(), sep='\n')

28
77
[40 49 37 48 54 39 45 58 42 38 43 60 36 44 53 52 51 56 41 32 65 35 59 50
 47 31 46 57 55 63 66 34 33 61 29 62 28 30 74 68 72 64 69 67 73 70 77 75
 76 71]


In [None]:
def age_category(row):
    """Return a copy of ``row`` with an added ``Age_cat`` label.

    The label is derived from ``row['Age']`` using the documented ranges:

    * ``0 <= Age < 40``    -> ``'young'``
    * ``40 <= Age < 60``   -> ``'adult'``
    * ``60 <= Age <= 100`` -> ``'elderly'``

    Any other value (negative, above 100, NaN) gets the sentinel ``100``,
    the same fallback value the function has always used.

    Parameters
    ----------
    row : mapping-like
        Must expose ``row['Age']`` and ``row.copy()``; in this notebook it is
        the ``pandas.Series`` handed over by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    Same type as ``row``
        A copy of the input carrying the extra ``'Age_cat'`` entry. The
        object passed in is left untouched, because mutating the row given
        by ``DataFrame.apply`` is not safe.
    """
    age = row['Age']
    # Both outer bounds are inclusive so that the ages 0 and 100 land in the
    # documented ranges instead of falling through to the sentinel.
    if 0 <= age < 40:
        label = 'young'
    elif 40 <= age < 60:
        label = 'adult'
    elif 60 <= age <= 100:
        label = 'elderly'
    else:
        label = 100

    labelled = row.copy()
    labelled['Age_cat'] = label
    return labelled

In [None]:
# Row-wise ``apply`` hands every row over as a single-dtype Series, so
# assigning the whole result back to ``data`` turned all integer columns
# (the target included) into floats. Only the derived label column is kept
# here, which leaves the dtypes of the existing columns intact.
data['Age_cat'] = data.apply(age_category, axis=1)['Age_cat']
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_cat
0,40.0,1.0,1.0,140.0,289.0,0.0,1.0,172.0,0.0,0.0,2.0,0.0,adult
1,49.0,0.0,2.0,160.0,180.0,0.0,1.0,156.0,0.0,1.0,1.0,1.0,adult
2,37.0,1.0,1.0,130.0,283.0,0.0,2.0,98.0,0.0,0.0,2.0,0.0,young
3,48.0,0.0,0.0,138.0,214.0,0.0,1.0,108.0,1.0,1.5,1.0,1.0,adult
4,54.0,1.0,2.0,150.0,195.0,0.0,1.0,122.0,0.0,0.0,2.0,0.0,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,1.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,1.0,adult
914,68.0,1.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,1.0,elderly
915,57.0,1.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,adult
916,57.0,0.0,1.0,130.0,236.0,0.0,0.0,174.0,0.0,0.0,1.0,1.0,adult


In [None]:
# Swap the raw age for one-hot indicator columns of its category.
data = pd.get_dummies(data.drop(columns=['Age']), columns=['Age_cat'])

In [None]:
# Display the current state of the prepared frame.
data

Unnamed: 0,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Age_cat_adult,Age_cat_elderly,Age_cat_young
0,1.0,1.0,140.0,289.0,0.0,1.0,172.0,0.0,0.0,2.0,0.0,1,0,0
1,0.0,2.0,160.0,180.0,0.0,1.0,156.0,0.0,1.0,1.0,1.0,1,0,0
2,1.0,1.0,130.0,283.0,0.0,2.0,98.0,0.0,0.0,2.0,0.0,0,0,1
3,0.0,0.0,138.0,214.0,0.0,1.0,108.0,1.0,1.5,1.0,1.0,1,0,0
4,1.0,2.0,150.0,195.0,0.0,1.0,122.0,0.0,0.0,2.0,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,1.0,3.0,110.0,264.0,0.0,1.0,132.0,0.0,1.2,1.0,1.0,1,0,0
914,1.0,0.0,144.0,193.0,1.0,1.0,141.0,0.0,3.4,1.0,1.0,0,1,0
915,1.0,0.0,130.0,131.0,0.0,1.0,115.0,1.0,1.2,1.0,1.0,1,0,0
916,0.0,1.0,130.0,236.0,0.0,0.0,174.0,0.0,0.0,1.0,1.0,1,0,0


### 3.Разделите выборку на обучающее и тестовое подмножество. 80% данных оставить на обучающее множество, 20% на тестовое.

In [None]:
# Feature matrix: every prepared column except the target.
feature_columns = [
    'Sex',
    'ChestPainType',
    'RestingBP',
    'Cholesterol',
    'FastingBS',
    'RestingECG',
    'MaxHR',
    'ExerciseAngina',
    'Oldpeak',
    'ST_Slope',
    'Age_cat_adult',
    'Age_cat_elderly',
    'Age_cat_young',
]
X = data[feature_columns]
y = data['HeartDisease']

# 80 % train / 20 % test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 4.Обучите модель логистической регрессии с параметрами по умолчанию.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Baseline: logistic regression with the library's default hyper-parameters.
model = LogisticRegression()

In [None]:
# Fit the baseline on the training split only.
model.fit(X_train, y_train)

LogisticRegression()

In [None]:
# Class predictions for the held-out test split.
predictions = model.predict(X_test)

In [None]:
# Mean accuracy on the training split.
model.score(X_train, y_train)

0.8596730245231607

In [None]:
# Mean accuracy on the test split.
model.score(X_test,y_test)

0.8369565217391305

In [None]:
 from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 of the baseline on the test split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.76      0.88      0.82        77
         1.0       0.91      0.80      0.85       107

    accuracy                           0.84       184
   macro avg       0.83      0.84      0.84       184
weighted avg       0.85      0.84      0.84       184



### 5.Подсчитайте основные метрики модели. Используйте следующие метрики и функцию:
`cross_validate(…, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])`

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 10-fold cross-validation of the current model on the full dataset, reporting
# the four metrics required by the assignment.
cv_metrics = ['accuracy', 'recall', 'precision', 'f1']
scores = cross_validate(model, X, y, cv=10, scoring=cv_metrics)
scores

{'fit_time': array([0.06245995, 0.04618764, 0.04535055, 0.05641603, 0.06167626,
        0.0542376 , 0.05097651, 0.07433176, 0.07358122, 0.06092048]),
 'score_time': array([0.00708103, 0.0068748 , 0.0135808 , 0.00682044, 0.00810194,
        0.00782609, 0.00791001, 0.0093534 , 0.00922608, 0.00895071]),
 'test_accuracy': array([0.7826087 , 0.88043478, 0.84782609, 0.88043478, 0.85869565,
        0.84782609, 0.86956522, 0.79347826, 0.76923077, 0.81318681]),
 'test_recall': array([0.70588235, 0.8627451 , 0.80392157, 0.8627451 , 0.96078431,
        0.98039216, 0.98039216, 0.80392157, 0.74      , 0.76      ]),
 'test_precision': array([0.87804878, 0.91666667, 0.91111111, 0.91666667, 0.81666667,
        0.79365079, 0.81967213, 0.82      , 0.82222222, 0.88372093]),
 'test_f1': array([0.7826087 , 0.88888889, 0.85416667, 0.88888889, 0.88288288,
        0.87719298, 0.89285714, 0.81188119, 0.77894737, 0.8172043 ])}

### 6.Оптимизируйте 3-4 параметра модели:


#### a) Используйте GridSearchCV.


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%time

param_grid = {'penalty':['l2', 'l1', 'elasticnet'],
              'max_iter': [100, 500, 1000, 1500, 2000],
            #   'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'multi_class': ['auto', 'ovr', 'multinomial']
}
model = LogisticRegression()
grid = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')

grid.fit(X,y)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.78 µs


GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'max_iter': [100, 500, 1000, 1500, 2000],
                         'multi_class': ['auto', 'ovr', 'multinomial'],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             scoring='accuracy')

In [None]:
# Best mean cross-validated accuracy found by the search, and the estimator
# configured with the winning parameters.
print(grid.best_score_)
print(grid.best_estimator_)

0.8343287147634975
LogisticRegression()


#### b) Используйте RandomizedSearchCV.


In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
%%time

param_grid = {'penalty':['l2', 'l1', 'elasticnet'],
              'max_iter': [100, 500, 1000, 1500, 2000],
              'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'multi_class': ['auto', 'ovr', 'multinomial']
}
model = LogisticRegression()
grid = RandomizedSearchCV(model, param_grid, cv=10, scoring='accuracy')

grid.fit(X,y)

CPU times: user 3.67 s, sys: 1.49 s, total: 5.15 s
Wall time: 3.59 s


RandomizedSearchCV(cv=10, estimator=LogisticRegression(),
                   param_distributions={'max_iter': [100, 500, 1000, 1500,
                                                     2000],
                                        'multi_class': ['auto', 'ovr',
                                                        'multinomial'],
                                        'penalty': ['l2', 'l1', 'elasticnet'],
                                        'solver': ['lbfgs', 'liblinear',
                                                   'newton-cg',
                                                   'newton-cholesky', 'sag',
                                                   'saga']},
                   scoring='accuracy')

In [None]:
# Best mean cross-validated accuracy among the sampled candidates, and the
# estimator configured with the winning parameters.
print(grid.best_score_)
print(grid.best_estimator_)

0.8321548017200191
LogisticRegression(max_iter=1500, solver='liblinear')


#### c) *Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.


In [None]:
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import uniform

In [None]:
# Ridge is a regressor whose default score is R^2, which cannot be compared
# with the accuracy of the classifiers below; RidgeClassifier is the
# classification counterpart and accepts the same alpha / solver options.
from sklearn.linear_model import RidgeClassifier

# Each entry: short display name, estimator instance, and the parameter space
# RandomizedSearchCV samples from.
models = [
    {
        'name': 'NB',
        'model': BernoulliNB(),
        'params': {'alpha': uniform(loc=0, scale=4)},
    },
    {
        'name': 'Lr',
        # The default lbfgs solver rejects the l1 penalty; liblinear
        # supports both l1 and l2, so every sampled candidate can be fitted.
        'model': LogisticRegression(solver='liblinear'),
        'params': {'C': [0.1, 0.2, 0.3, 0.5, 0.7, 1], 'penalty': ['l1', 'l2']},
    },
    {
        'name': 'R',
        'model': RidgeClassifier(),
        'params': {
            'alpha': uniform(loc=0, scale=4),
            'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        },
    },
    {
        'name': 'SVC',
        'model': SVC(),
        'params': {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        },
    },
    {
        'name': 'RF',
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 25, 50, 100, 150, 200],
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, 9, 11],
        },
    },
    {
        'name': 'KN',
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': list(range(1, 30)),
            'weights': ['uniform', 'distance'],
            'p': [1, 2, 3],
        },
    },
    {
        'name': 'DT',
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, 9, 11]},
    },
]

# One fitted search per model, stored as (name, fitted_search). The seed
# makes the sampled candidates reproducible between runs.
res = [
    (
        spec['name'],
        RandomizedSearchCV(
            spec['model'], spec['params'], cv=10, random_state=42
        ).fit(X_train, y_train),
    )
    for spec in models
]

#### d) Повторите п. 5 после каждого итогового изменения параметров.

In [None]:
# One line per model: name, best mean CV score, and the parameters behind it.
for name, search in res:
    print(name, search.best_score_, search.best_params_)

NB 0.8119585338763422 {'alpha': 1.126482383562423}
Lr 0.855516475379489 {'penalty': 'l2', 'C': 0.5}
R 0.5083162854858131 {'alpha': 3.2945490227976677, 'solver': 'sparse_cg'}
SVC 0.8595520177711959 {'kernel': 'linear', 'gamma': 'scale'}
RF 0.8758793039614957 {'n_estimators': 200, 'max_depth': 11, 'criterion': 'gini'}
KN 0.7396519807478711 {'weights': 'distance', 'p': 1, 'n_neighbors': 10}
DT 0.8472972972972974 {'max_depth': 3, 'criterion': 'entropy'}


Вывод: лучший результат показала модель RandomForestClassifier — средняя accuracy на 10-кратной кросс-валидации (на обучающей выборке) составила 0.8758793039614957.

параметры - {'n_estimators': 200, 'max_depth': 11, 'criterion': 'gini'}