# 클리브랜드 심장병 데이터셋을 이용.
- 전처리
- 학습
- 평가
파이프라인 만들기.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# % ======================================================================
# % John Gennari
# % 3/13/90
# %
# % This is Dr. Detrano's database modified to be a real MIXED dataset.
# %
# % Attributes: 8 symbolic, 6 numeric.
# %  Age; sex; chest pain type (angina, abnang, notang, asympt)
# %  Trestbps (resting blood pres); cholesteral; fasting blood sugar < 120
# %  (true or false); resting ecg (norm, abn, hyper); max heart rate; 
# %  exercise induced angina (true or false); oldpeak; slope (up, flat, down)
# %  number of vessels colored (???); thal (norm, fixed, rever). Finally, the
# %  class is either healthy (buff) or with heart-disease (sick).
# %
# % Original atts: 
# %   age; sex (1,0); cp (1-4); trestbps; chol; fbs (1,0); restecg (0,1,2); 
# %   thalach; exang (1,0); oldpeak; slope (1,2,3); ca; thal (3,6,7);
# %   class att: 0 is healthy, 1,2,3,4 is sick.
# % ======================================================================

cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'class', 'class_num']
continues_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
target_col_1 = ['class']
target_col_2 = ['class_num']

## 데이터셋 확인

In [7]:
cleve = pd.read_csv(r'./cleve.mod', sep='\s+', header=None, names=cols, na_values='?', index_col=False)

In [8]:
cleve.shape

(303, 15)

In [22]:
cleve.info()

<class 'pandas.core.frame.DataFrame'>
Index: 296 entries, 0 to 302
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        296 non-null    float64
 1   sex        296 non-null    object 
 2   cp         296 non-null    object 
 3   trestbps   296 non-null    float64
 4   chol       296 non-null    float64
 5   fbs        296 non-null    object 
 6   restecg    296 non-null    object 
 7   thalach    296 non-null    float64
 8   exang      296 non-null    object 
 9   oldpeak    296 non-null    float64
 10  slope      296 non-null    object 
 11  ca         296 non-null    float64
 12  thal       296 non-null    object 
 13  class      296 non-null    object 
 14  class_num  296 non-null    object 
dtypes: float64(6), object(9)
memory usage: 37.0+ KB


In [9]:
cleve.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class,class_num
0,63.0,male,angina,145.0,233.0,true,hyp,150.0,fal,2.3,down,0.0,fix,buff,H
1,67.0,male,asympt,160.0,286.0,fal,hyp,108.0,true,1.5,flat,3.0,norm,sick,S2
2,67.0,male,asympt,120.0,229.0,fal,hyp,129.0,true,2.6,flat,2.0,rev,sick,S1
3,37.0,male,notang,130.0,250.0,fal,norm,187.0,fal,3.5,down,0.0,norm,buff,H
4,41.0,fem,abnang,130.0,204.0,fal,hyp,172.0,fal,1.4,up,0.0,norm,buff,H


In [10]:
cleve[categorical_cols].head()

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,thal
0,male,angina,true,hyp,fal,down,fix
1,male,asympt,fal,hyp,true,flat,norm
2,male,asympt,fal,hyp,true,flat,rev
3,male,notang,fal,norm,fal,down,norm
4,fem,abnang,fal,hyp,fal,up,norm


In [11]:
cleve[continues_cols].head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca
0,63.0,145.0,233.0,150.0,2.3,0.0
1,67.0,160.0,286.0,108.0,1.5,3.0
2,67.0,120.0,229.0,129.0,2.6,2.0
3,37.0,130.0,250.0,187.0,3.5,0.0
4,41.0,130.0,204.0,172.0,1.4,0.0


In [12]:
cleve[target_col_1 + target_col_2].head()

Unnamed: 0,class,class_num
0,buff,H
1,sick,S2
2,sick,S1
3,buff,H
4,buff,H


In [13]:
cleve[target_col_1].value_counts(), cleve[target_col_2].value_counts()

(class
 buff     165
 sick     138
 Name: count, dtype: int64,
 class_num
 H            165
 S1            54
 S2            36
 S3            35
 S4            13
 Name: count, dtype: int64)

In [14]:
# 결측치 확인
cleve.isnull().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           5
thal         2
class        0
class_num    0
dtype: int64

In [16]:
# 결측치 제거
cleve = cleve.dropna()

In [21]:
cleve.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class,class_num
0,63.0,male,angina,145.0,233.0,true,hyp,150.0,fal,2.3,down,0.0,fix,buff,H
1,67.0,male,asympt,160.0,286.0,fal,hyp,108.0,true,1.5,flat,3.0,norm,sick,S2
2,67.0,male,asympt,120.0,229.0,fal,hyp,129.0,true,2.6,flat,2.0,rev,sick,S1
3,37.0,male,notang,130.0,250.0,fal,norm,187.0,fal,3.5,down,0.0,norm,buff,H
4,41.0,fem,abnang,130.0,204.0,fal,hyp,172.0,fal,1.4,up,0.0,norm,buff,H


In [26]:
# ca칼럼은 int형으로, fbs, exang는 bool형으로 변환
import numpy as np
cleve['ca'] = cleve['ca'].astype(np.int64)
cleve['fbs'] = cleve['fbs'] == 'true'
cleve['exang'] = cleve['exang'] == 'true'

In [27]:
cleve.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,class,class_num
0,63.0,male,angina,145.0,233.0,True,hyp,150.0,False,2.3,down,0,fix,buff,H
1,67.0,male,asympt,160.0,286.0,False,hyp,108.0,True,1.5,flat,3,norm,sick,S2
2,67.0,male,asympt,120.0,229.0,False,hyp,129.0,True,2.6,flat,2,rev,sick,S1
3,37.0,male,notang,130.0,250.0,False,norm,187.0,False,3.5,down,0,norm,buff,H
4,41.0,fem,abnang,130.0,204.0,False,hyp,172.0,False,1.4,up,0,norm,buff,H


In [28]:
# 데이터 분할
X = cleve.drop(target_col_1 + target_col_2, axis=1)
y = cleve[target_col_2] # class_num만 사용

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1473032201,
                                                    )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((236, 13), (60, 13), (236, 1), (60, 1))

In [29]:
# 타겟 데이터 라벨링
from sklearn.preprocessing import LabelEncoder
target_le = LabelEncoder()
y_train = target_le.fit_transform(y_train)
y_test = target_le.transform(y_test)
target_le.classes_

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array(['H', 'S1', 'S2', 'S3', 'S4'], dtype=object)

In [30]:
# 파이프라인에 사용할 모듈 만들기.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
# 라벨링할 데이터, 원핫인코딩할 데이터, 스케일링할 데이터를 나누어서 파이프라인에 넣어준다.
cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'class', 'class_num']
continues_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
target_col_1 = ['class']
target_col_2 = ['class_num']

scale_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
ohe_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']
label_cols = ['sex'] #, 'fbs', 'exang'] -> bool형은 내부적으로 0,1 이므로 라벨링할 필요 없음.

In [None]:
# 사용자 정의 변환기 구현
from sklearn.base import BaseEstimator, TransformerMixin

class OheColsTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, cols):
        self.cols = cols    # ohe_cols
    
    def fit(self, X):
        ohe_dict = {}
        result_cols_name = []
        # cols에 있는 칼럼들을 원핫인코딩 객체를 생성하여 ohe에 저장
        for col in self.cols:
            ohe = OneHotEncoder(sparse=False)
            ohe.fit(X[[col]])    # ohe.fit(X[[col]]) -> 2차원으로 넣어줘야함.
            ohe_dict[col] = ohe # ohe 객체를 저장
            result_cols_name += list(ohe.get_feature_names_out())
        self.ohes = ohe_dict
        self.result_cols_name = result_cols_name
        return self
    
    def transform(self, X):
        # X를 복사하여 X_ohe에 저장
        X_ohe = X.copy()
        # ohe_cols에 있는 칼럼들을 원핫인코딩
        for col in self.cols:
            ohe = self.ohes[col]
            # 원핫인코딩 후 칼럼명을 지정하여 저장
            X_ohe = pd.concat([X_ohe, pd.DataFrame(ohe.transform(X[[col]]), columns=ohe.get_feature_names([col]))], axis=1)
            # 원핫인코딩된 칼럼 삭제
            X_ohe = X_ohe.drop(col, axis=1)
    
        