### EXAMPLES

### Необходимые модули

In [1]:
import numpy as np
import pandas as pd
import dill
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings from
# libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')

### Загрузка пайплайнов препроцессинга

In [2]:
# Restore the preprocessing pipeline stored in 'prep_with_cat.dill'.
# dill (like pickle) can execute arbitrary code while loading, so only open
# files that come from a trusted source.
with open('prep_with_cat.dill', mode='rb') as prep_cat_file:
    prep_cat = dill.load(prep_cat_file)

In [3]:
# Restore the preprocessing pipeline stored in 'prep_without_cat.dill'.
# dill (like pickle) can execute arbitrary code while loading, so only open
# files that come from a trusted source.
with open('prep_without_cat.dill', mode='rb') as prep_without_cat_file:
    prep_without_cat = dill.load(prep_without_cat_file)

### Загрузка данных

In [4]:
# Locations of the feature (X_*) and target (y_*) CSV files for each split,
# given relative to the notebook's working directory.
TRAIN_DATA_PATH, TRAIN_TARGET_PATH = 'data/X_train.csv', 'data/y_train.csv'
TEST_DATA_PATH, TEST_TARGET_PATH = 'data/X_test.csv', 'data/y_test.csv'
VALID_DATA_PATH, VALID_TARGET_PATH = 'data/X_valid.csv', 'data/y_valid.csv'

In [5]:
# Read the training features and target; the frames as read are kept in
# *_base, while `train` / `train_target` are independent working copies.
train_base, train_target_base = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TRAIN_TARGET_PATH)
)
train, train_target = train_base.copy(), train_target_base.copy()
display(train.shape, train_target.shape)

(577, 11)

(577, 1)

In [6]:
# Read the test features and target; the frames as read are kept in
# *_base, while `test` / `test_target` are independent working copies.
test_base, test_target_base = (
    pd.read_csv(csv_path) for csv_path in (TEST_DATA_PATH, TEST_TARGET_PATH)
)
test, test_target = test_base.copy(), test_target_base.copy()
display(test.shape, test_target.shape)

(248, 11)

(248, 1)

In [7]:
# Read the validation features and target; the frames as read are kept in
# *_base, while `valid` / `valid_target` are independent working copies.
valid_base, valid_target_base = (
    pd.read_csv(csv_path) for csv_path in (VALID_DATA_PATH, VALID_TARGET_PATH)
)
valid, valid_target = valid_base.copy(), valid_target_base.copy()
display(valid.shape, valid_target.shape)

(92, 11)

(92, 1)

### Список категориальных признаков до кодирования — для LIME

In [8]:
# Peek at the first three rows to see the raw (not yet encoded) columns.
train.head(n=3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,56,M,ASY,137,282,1,Normal,126,Y,1.2,Flat
1,57,F,ASY,128,303,0,LVH,159,N,0.0,Up
2,59,M,ASY,124,160,0,Normal,117,Y,1.0,Flat


In [9]:
# Categorical (object-dtype) columns of the raw training frame.
cat_features = train.select_dtypes(include=[object]).columns.to_list()
# Positional indices of those columns, derived from the frame itself instead of
# a hand-maintained list, so they stay correct if the column order changes.
cat_features_indices = [train.columns.get_loc(name) for name in cat_features]
print(cat_features)
print(cat_features_indices)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
[1, 2, 6, 8, 10]


### Список категориальных признаков после кодирования

In [10]:
# The fitted ColumnTransformer lists the numeric transformer first and the
# categorical (one-hot) transformer second.
column_transformer = prep_cat.named_steps['preprocessor']
numeric_columns = column_transformer.transformers_[0][2]
ohe_encoder = column_transformer.transformers_[1][1].named_steps['ohe']

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older versions.
if hasattr(ohe_encoder, 'get_feature_names_out'):
    ohe_cat_features = ohe_encoder.get_feature_names_out(cat_features)
else:
    ohe_cat_features = ohe_encoder.get_feature_names(cat_features)

# The one-hot columns come right after the numeric block in the transformed
# matrix (assumes the numeric step keeps one output column per input column,
# as the MinMaxScaler step does), so derive the indices instead of hard-coding.
first_ohe_index = len(numeric_columns)
ohe_cat_features_indices = list(
    range(first_ohe_index, first_ohe_index + len(ohe_cat_features))
)
print(ohe_cat_features)
print(ohe_cat_features_indices)

['Sex_F' 'Sex_M' 'ChestPainType_ASY' 'ChestPainType_ATA'
 'ChestPainType_NAP' 'ChestPainType_TA' 'RestingECG_LVH'
 'RestingECG_Normal' 'RestingECG_ST' 'ExerciseAngina_N' 'ExerciseAngina_Y'
 'ST_Slope_Down' 'ST_Slope_Flat' 'ST_Slope_Up']
[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


### Модель для примера LogisticRegression

In [11]:
# Example model: the loaded preprocessing pipeline followed by a logistic
# regression with a fixed seed.
classifier = Pipeline([
    ('preprocessing', prep_cat),
    ('classifier', LogisticRegression(random_state=0)),
])

# `train_target` is a single-column DataFrame of shape (n_samples, 1); the
# estimator expects a 1-D target, so flatten it explicitly instead of relying
# on an implicit conversion whose warning is silenced in this notebook.
classifier.fit(train, train_target.to_numpy().ravel())

Pipeline(steps=[('preprocessing',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('minmax',
                                                                                    MinMaxScaler())]),
                                                                   ['Age',
                                                                    'RestingBP',
                                                                    'Cholesterol',
                                                                    'FastingBS',
                                                                    'MaxHR',
                                                                    'Oldpeak']),
                                                                  ('cat',
                                                                   Pipeline(steps=[('ohe',
  

In [12]:
# Per-class probabilities and hard class labels for the test split.
y_pred_proba, y_pred = classifier.predict_proba(test), classifier.predict(test)

In [13]:
# Second column of predict_proba — the score for the class at classes_[1]
# (presumably the positive label; verify against the target encoding).
positive_class_proba = y_pred_proba[:, 1]
roc_auc = roc_auc_score(test_target, positive_class_proba)
print(f'ROC_AUC: {roc_auc}')

f1 = f1_score(test_target, y_pred)
print(f'F1 SCORE: {f1}')

ROC_AUC: 0.9215492865127901
F1 SCORE: 0.8754448398576512


### Сохранение обученной модели

In [14]:
# Serialize the fitted pipeline (preprocessing + classifier) to disk with dill.
with open("model_LR_example.dill", mode="wb") as model_file:
    dill.dump(classifier, model_file)