### EXAMPLES
### Категориальные признаки для LIME
Понадобится для пункта 4 первой части проекта.
### Пример использования пайплайна препроцессинга при обучении модели
Понадобится для пункта 2 первой части проекта.

### Необходимые модули

In [1]:
import numpy as np
import pandas as pd
import dill
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and data-conversion warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

### Загрузка пайплайнов препроцессинга

In [2]:
# Restore the serialized preprocessing pipeline that one-hot encodes the
# categorical columns. dill can execute arbitrary code while loading, so only
# open files that come from a trusted source.
with open('../prep_with_cat.dill', mode='rb') as pipeline_file:
    prep_cat = dill.load(pipeline_file)

In [3]:
# Restore the second serialized preprocessing pipeline (presumably the variant
# that does not encode categorical columns, judging by the file name - confirm).
# dill can execute arbitrary code while loading; only open trusted files.
with open('../prep_without_cat.dill', mode='rb') as pipeline_file:
    prep_without_cat = dill.load(pipeline_file)

### Загрузка данных

In [4]:
# CSV locations, relative to this notebook: features first, target second.
TRAIN_DATA_PATH, TRAIN_TARGET_PATH = '../data/X_train.csv', '../data/y_train.csv'
TEST_DATA_PATH, TEST_TARGET_PATH = '../data/X_test.csv', '../data/y_test.csv'

In [5]:
# Read the training features and target; the *_base frames stay untouched while
# the working copies (train, train_target) are the ones used further down.
train_base, train_target_base = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TRAIN_TARGET_PATH)
)
train, train_target = train_base.copy(), train_target_base.copy()
display(train.shape, train_target.shape)

(641, 11)

(641, 1)

In [6]:
# Read the hold-out features and target; the *_base frames stay untouched while
# the working copies (test, test_target) are the ones used further down.
test_base, test_target_base = (
    pd.read_csv(csv_path) for csv_path in (TEST_DATA_PATH, TEST_TARGET_PATH)
)
test, test_target = test_base.copy(), test_target_base.copy()
display(test.shape, test_target.shape)

(276, 11)

(276, 1)

### Список категориальных признаков до кодирования — для LIME

In [7]:
# Preview the first three raw rows to see which columns hold string categories.
train.head(n=3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,54,F,NAP,108,267,0,LVH,167,N,0.0,Up
1,55,M,ASY,120,226,0,LVH,127,Y,1.7,Down
2,62,F,ASY,160,164,0,LVH,145,N,6.2,Down


In [8]:
# Names of the categorical (object-dtype) columns in the raw, un-encoded frame.
cat_features = train.select_dtypes(include=[object]).columns.to_list()
# Positional indices of those columns. They are derived from the frame rather
# than hard-coded so that names and positions cannot drift apart if the column
# set or order of the input data changes.
cat_features_indices = [train.columns.get_loc(column) for column in cat_features]
print(cat_features)
print(cat_features_indices)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
[1, 2, 6, 8, 10]


### Список категориальных признаков после кодирования

In [9]:
# Fitted ColumnTransformer inside the loaded preprocessing pipeline and the
# one-hot encoder that lives in its second ('cat') branch.
fitted_preprocessor = prep_cat.named_steps['preprocessor']
fitted_ohe = fitted_preprocessor.transformers_[1][1].named_steps['ohe']

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out and fall back only for older versions.
if hasattr(fitted_ohe, 'get_feature_names_out'):
    ohe_cat_features = fitted_ohe.get_feature_names_out(cat_features)
else:
    ohe_cat_features = fitted_ohe.get_feature_names(cat_features)

# In the transformed matrix the encoded columns come right after the columns of
# the first ('num') branch, which emits one scaled column per input column, so
# the offset is the number of columns handled by that branch.
num_feature_count = len(fitted_preprocessor.transformers_[0][2])
ohe_cat_features_indices = list(
    range(num_feature_count, num_feature_count + len(ohe_cat_features))
)
print(ohe_cat_features)
print(ohe_cat_features_indices)

['Sex_F' 'Sex_M' 'ChestPainType_ASY' 'ChestPainType_ATA'
 'ChestPainType_NAP' 'ChestPainType_TA' 'RestingECG_LVH'
 'RestingECG_Normal' 'RestingECG_ST' 'ExerciseAngina_N' 'ExerciseAngina_Y'
 'ST_Slope_Down' 'ST_Slope_Flat' 'ST_Slope_Up']
[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


### Модель для примера: LogisticRegression

In [10]:
# End-to-end model: the loaded preprocessing pipeline followed by a
# logistic-regression estimator with a fixed random state.
classifier = Pipeline([
    ('preprocessing', prep_cat),
    ('classifier', LogisticRegression(random_state=0)),
])

# train_target is a one-column DataFrame; scikit-learn expects a 1-D target and
# otherwise emits a DataConversionWarning (hidden here by the global warnings
# filter), so flatten it explicitly before fitting.
classifier.fit(train, train_target.values.ravel())

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('minmax',
                                                                                    MinMaxScaler())]),
                                                                   ['Age',
                                                                    'RestingBP',
                                                                    'Cholesterol',
                                                                    'FastingBS',
                                                                    'MaxHR',
                                                                    'Oldpeak']),
                                                                  ('cat',
                                                                   Pipeline(steps=[('ohe',
  

In [11]:
# Score the hold-out set: hard class labels and per-class probabilities.
y_pred = classifier.predict(test)
y_pred_proba = classifier.predict_proba(test)

In [12]:
# ROC AUC is computed from the second column of predict_proba (the probability
# of the class listed second in classifier.classes_); F1 uses the hard labels.
second_class_proba = y_pred_proba[:, 1]
print(f'ROC_AUC: {roc_auc_score(test_target, second_class_proba)}')
print(f'F1 SCORE: {f1_score(test_target, y_pred)}')

ROC_AUC: 0.9447367022689834
F1 SCORE: 0.8686868686868686


### Сохранение обученной модели

In [13]:
# Persist the fitted pipeline (preprocessing + estimator) next to the notebook.
with open("example_model_LR_example.dill", mode="wb") as model_file:
    dill.dump(classifier, model_file)