## Курсовой проект по курсу "Интерпретируемый ИИ и майнинг данных"
### Часть 1: Работа с табличными данными
### Этап 2: Preprocessing

## Шаг 1: Подготовка инструментов

### 1.1 Необходимые модули

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import dill

import warnings
warnings.filterwarnings('ignore')

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Identity transformer: hands the selected columns on unchanged.

    Intended as a placeholder pipeline step for columns that should be
    carried through without any processing.

    Parameters
    ----------
    key : object, default=None
        Stored on the instance as ``self.key``; neither ``fit`` nor
        ``transform`` reads it.
    """

    def __init__(self, key=None):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received (same object, no copy)."""
        return X

### 1.2 Загрузка данных

In [3]:
# Relative path of the CSV file that is loaded as the input data set.
FULL_DATA_PATH = 'data/X_full.csv'

In [4]:
# Keep the raw table untouched in ``df_base`` and work on an independent copy.
df_base = pd.read_csv(FULL_DATA_PATH)
df = df_base.copy(deep=True)

# Show (n_rows, n_columns) of the working frame.
display(df.shape)

(917, 11)

## Шаг 2: Подготовка пайплайнов

### 2.1 С обработкой категориальных признаков

In [5]:
# Numeric columns are rescaled with MinMaxScaler.
num_features = df.select_dtypes(include=[np.number]).columns.to_list()
num_transformer = Pipeline(steps=[
    ('minmax', MinMaxScaler())])

# Object-dtype (string) columns are one-hot encoded.
# The builtin ``object`` is used instead of the ``np.object`` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
cat_features = df.select_dtypes(include=[object]).columns.to_list()
cat_transformer = Pipeline(steps=[
    # handle_unknown='ignore': categories unseen during fit do not raise
    # at transform time.
    ('ohe', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

preprocessing = Pipeline(steps=[('preprocessor', preprocessor)])
preprocessing.fit(df)

# Look the fitted encoder up by name rather than by position in transformers_.
ohe = preprocessing.named_steps['preprocessor'].named_transformers_['cat'].named_steps['ohe']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
ohe_feature_names(cat_features)

array(['Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA',
       'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_LVH',
       'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N',
       'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat',
       'ST_Slope_Up'], dtype=object)

### 2.2 Без обработки категориальных признаков

In [6]:
# Numeric columns are rescaled with MinMaxScaler.
num_features = df.select_dtypes(include=[np.number]).columns.to_list()
num_transformer = Pipeline(steps=[
    ('minmax', MinMaxScaler())])

# Object-dtype (string) columns are passed through unchanged by ColumnSelector.
# The builtin ``object`` is used instead of the ``np.object`` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
cat_features = df.select_dtypes(include=[object]).columns.to_list()
cat_transformer = Pipeline(steps=[
    ('select', ColumnSelector())])

preprocessor_without_cat = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

preprocessing_without_cat = Pipeline(steps=[('preprocessor', preprocessor_without_cat)])
preprocessing_without_cat.fit(df)

# Show which columns were treated as categorical.
cat_features

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

### 2.3 Сохраним полученные пайплайны

In [7]:
# Serialize the fitted pipeline that one-hot encodes the categorical features.
with open(file='prep_with_cat.dill', mode='wb') as f1:
    dill.dump(obj=preprocessing, file=f1)

In [8]:
# Serialize the fitted pipeline that leaves the categorical features as they are.
# NOTE(review): dill (not pickle) is presumably needed because this pipeline
# embeds the notebook-defined ColumnSelector class - confirm.
with open(file='prep_without_cat.dill', mode='wb') as f2:
    dill.dump(obj=preprocessing_without_cat, file=f2)