# Predefines

In [1]:
# For watermark to work, uncomment the line below or install the library manually from the console
# !pip install watermark
%load_ext watermark

In [2]:
# Print the Python/IPython versions, the listed package versions, machine details and the git hash.
# NOTE(review): the saved output reports `sklearn : 0.0`, which does not look like a real
# scikit-learn release number — confirm which installed distribution is being queried.
%watermark -v -m -p numpy,matplotlib,pandas,sklearn,seaborn -g

Python implementation: CPython
Python version       : 3.10.7
IPython version      : 8.5.0

numpy     : 1.23.4
matplotlib: 3.6.0
pandas    : 1.5.1
sklearn   : 0.0
seaborn   : 0.12.1

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

Git hash: f2716f75e9545c52f0049a4d989efaf478ff53b5



# Modules

In [10]:
# Future Python versions compatibility
from __future__ import division

# Warnings Ignore
import warnings
warnings.filterwarnings("ignore")

# System libraries
import os
import random

# Data manipulation libraries
import pandas as pd
import numpy as np

# Data transformation libraries
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

# Functions and Classes

In [8]:
def seed_all(seed):
    """Seed the random number sources used here so repeated runs give the same results.

    Seeds Python's ``random`` module and NumPy's global generator, then stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Note: the interpreter reads ``PYTHONHASHSEED`` at start-up, so writing it
    here affects processes launched afterwards rather than string hashing in
    the already-running kernel.
    """
    # Both seeders take the same integer; apply them in a fixed order.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


# Notebook-wide seed, applied immediately.
seed = 42
seed_all(seed)

# Preprocessing

In [47]:
# Raw data is read from "data/raw" under the parent of the current working directory.
raw_data_filename = "heart_cleveland_upload.csv"
data_path = os.path.join(os.path.split(os.getcwd())[0], "data/raw")

In [48]:
# Read the raw CSV into a DataFrame and display it.
raw_data = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, raw_data_filename),
)
raw_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [49]:
# Split the frame into the target column (y) and the remaining feature columns (X).
y = raw_data["condition"]
X = raw_data.drop(columns="condition")

In [50]:
# Number of distinct values per feature column; the next cell thresholds these counts
# to separate categorical from numerical features.
X.nunique()

age          41
sex           2
cp            4
trestbps     50
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            4
thal          3
dtype: int64

In [61]:
# Cardinality cut-off: a column with at most this many distinct values is treated
# as categorical, anything above it as numerical.
CAT_MAX_UNIQUE = 10

# Count distinct values once and reuse the result for both groups; the resulting
# lists keep the column order of X.
n_unique = X.nunique()
CAT_FEATURES = n_unique[n_unique <= CAT_MAX_UNIQUE].index.tolist()
NUMERICAL_FEATURES = n_unique[n_unique > CAT_MAX_UNIQUE].index.tolist()

In [64]:
# Columns classified as categorical by the distinct-value threshold.
CAT_FEATURES

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [63]:
# Columns classified as numerical by the distinct-value threshold.
NUMERICAL_FEATURES

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [66]:
# Sanity check: the two feature groups together account for every column of X (by count).
assert X.shape[1] == len(CAT_FEATURES + NUMERICAL_FEATURES)

In [68]:
# One-hot encode the categorical columns; the remaining columns are carried over unchanged.
encoder = OneHotEncoder()
encoder.fit(X[CAT_FEATURES])

X_transform = pd.concat(
    (
        # Non-categorical columns, as they are in X.
        X.drop(CAT_FEATURES, axis=1),
        # Indicator columns produced by the encoder, converted to a dense array.
        pd.DataFrame(
            data=encoder.transform(X[CAT_FEATURES]).toarray(),
            columns=encoder.get_feature_names_out(),
            # Reuse X's row labels so the column-wise concat aligns row-for-row
            # even when X does not carry the default 0..n-1 index.
            index=X.index,
        ),
    ),
    axis=1,
)
X_transform

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,thal_0,thal_1,thal_2
0,69,160,234,131,0.1,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,69,140,239,151,1.8,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,66,150,226,114,2.6,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,65,138,282,174,1.4,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,64,110,211,144,1.8,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,152,223,181,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
293,39,118,219,140,1.2,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
294,35,120,198,130,1.6,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
295,35,138,183,182,1.4,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
