### 라이브러리 불러오기

In [14]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

from PipeModule import PipeProcess

### 데이터 불러오기

- Kaggle Titanic dataset (`data/train.csv`, `data/test.csv`)

In [2]:
# Read the Titanic train/test CSV files from the relative data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Split the training frame into the 'Survived' target and the remaining
# feature columns.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])

In [3]:
# Preview the first rows of the training DataFrame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 적용 변수 분리

In [8]:
# Continuous (numeric) features
num_attribs = ['Age', 'Fare']

# Categorical features
cat_attribs = [
    'Pclass', 'Sex', 'SibSp', 'Parch',
    'Ticket', 'Cabin', 'Embarked',
]

# User-defined (custom-handled) features
age_attribs = ['Age']

### 적용 함수 생성

In [35]:
# A user-defined function used inside a pipeline has to be written in
# estimator form (a class exposing fit/transform), as below.

class AgeTrans(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with 0 and cast the column to ``int``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only rewrites the ``Age`` column of the frame it is given.
    """

    def __init__(self):  # no *args or **kwargs
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a NaN-free, integer ``Age`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column.
        y : ignored
            Accepted only for API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which missing ``Age`` values are replaced by 0
            and the column is cast to ``int`` (fractional ages are truncated
            by the cast).
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()
        # Assign the result back explicitly instead of calling
        # ``X['Age'].fillna(0, inplace=True)``: that form is a chained
        # in-place update, which pandas deprecates and which leaves ``X``
        # untouched under Copy-on-Write, so the integer cast below could
        # then fail on the remaining NaN values.
        X['Age'] = X['Age'].fillna(0).astype(int)
        return X


In [23]:
# Preprocessing steps applied to the continuous features
num_fun = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
]

# Preprocessing steps applied to the categorical features
cat_fun = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore')),
]

# Preprocessing steps applied to the Age feature (user-defined transformer)
Age_fun = [('AGE_1', AgeTrans())]

### 입력변수

In [24]:
# Steps fed to the pipeline: name -> [list of (step name, transformer),
# list of columns the steps are applied to]
steps = {
    'age_pipe': [Age_fun, age_attribs],
    'num_pipe': [num_fun, num_attribs],
    'cat_pipe': [cat_fun, cat_attribs],
}

# Input model
model = RandomForestClassifier(random_state=100)

# Grid-search inputs: cross-validation splitter and parameter grid
cv = KFold(n_splits=3, shuffle=True, random_state=1)

param_grid = {
    'preprocess__num_pipe__imputer__strategy': [
        "mean", "median", "most_frequent", "constant",
    ],
    'model__n_estimators': [10, 100],
    'model__max_depth': [6, 8, 10, 12],
    'model__min_samples_leaf': [8, 12, 18],
    'model__min_samples_split': [8, 16, 20],
}

### 파이프라인 만들기

In [25]:
# Create the PipeProcess object from the step definitions, the model,
# the grid-search parameter grid and the cross-validation splitter.
pipe_object=PipeProcess(steps,model,param_grid,cv)

In [26]:
# Get the pipeline from the PipeProcess object.
# NOTE(review): what get_pipe() builds (e.g. whether it wraps a grid search)
# is defined in PipeModule and is not visible here -- confirm there.
pipe1=pipe_object.get_pipe()

### fit & predict

In [27]:
# Fit the pipeline on the training features and labels; whatever fit()
# returns is kept in `pipeline`.
pipeline=pipe1.fit(X_train,y_train)

In [28]:
# Predict on the same training features the pipeline was fitted on.
pipe1.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

### 파이프라인 Save

In [30]:
# Save the fitted pipeline under the 'save_pipe/pipe1' path; the cell output
# reports 'save_pipe/pipe1.pkl'.
PipeProcess.save_pipe(pipe1,'save_pipe/pipe1')

save_pipe/pipe1.pkl


### 파이프라인 Load

In [31]:
# Load a pipeline back from the 'save_pipe/pipe1' path.
loaded_pipe=PipeProcess.load_pipe('save_pipe/pipe1')

In [36]:
# Predict with the reloaded pipeline on the test DataFrame.
loaded_pipe.predict(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,