#### Pipeline / ColumnTransformer - 회귀

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge, Lasso 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the carseat data: 'Sales' is the regression target, every other column is a feature.
carseat = pd.read_csv('./data/carseat.csv')
y = carseat['Sales']
X = carseat.drop(columns=['Sales'])

# Hold out 20% of the rows for a final check; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
# handle_unknown='ignore': a category that is absent from a training fold is encoded
# as all zeros instead of raising, so a CV fit cannot silently fail and score NaN.
oh_enc = OneHotEncoder(handle_unknown='ignore')

# Route columns by dtype: object columns are one-hot encoded, all others are standardized.
obj_col = X.dtypes[X.dtypes == 'object'].index
num_col = X.dtypes[X.dtypes != 'object'].index

ct = ColumnTransformer([('scaler', scaler, num_col),
                        ('oh_enc', oh_enc, obj_col)])
poly = PolynomialFeatures(degree=2, include_bias=False)

# The final step is named 'linear' but acts as a placeholder: the grid below swaps in
# different regressors through the 'linear' key.
pipe = Pipeline([('ct', ct),
                 ('poly', poly),
                 ('linear', LinearRegression())])

params = [
    # Regularized linear models: polynomial degree x penalty strength.
    {'poly__degree': [2, 3, 4],
     'linear': [Ridge(), Lasso()],
     'linear__alpha': [1, 0.1, 0.01, 10, 100]},

    # Plain least squares: only the polynomial degree is tuned.
    {'poly__degree': [2, 3, 4],
     'linear': [LinearRegression()]},

    # Random forest: degree is pinned to 1 so no polynomial terms are added.
    # random_state is fixed so candidate scores are reproducible between runs.
    {'poly__degree': [1],
     'linear': [RandomForestRegressor(random_state=0)],
     'linear__max_depth': [None, 3, 5, 7],
     'linear__min_samples_leaf': [1, 3, 5]},
]

# 5-fold CV scored by negative RMSE (higher is better, i.e. closer to zero).
grid = GridSearchCV(pipe, param_grid=params,
                    scoring='neg_root_mean_squared_error', cv=5, verbose=True)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [12]:
# Persist the fitted model so it can be reloaded later with its full functionality (joblib).
# joblib stores Python objects as binary files, similar to pickle; despite the '.h5'
# file name, the file is written by joblib.dump.
import joblib

model = grid.best_estimator_
joblib.dump(value=model, filename='model.h5')

['model.h5']

In [None]:
# Reload the persisted pipeline and predict on the held-out features.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
model2 = joblib.load('model.h5')
model2.predict(X_test)    # predict() requires a feature table with the columns used for fitting

In [None]:
# The saved model can also be loaded and used from a different file:
import joblib
model = joblib.load('model.h5')
model.predict(X_test)    # pass a feature table with the same columns used for fitting

# Other objects can be saved the same way, but an estimator that has not been fitted
# is of no use after loading; saving too many models can also cause issues such as
# slow speed (example below).
# NOTE(review): rf_reg is not defined in this section, and this call overwrites the
# pipeline previously saved to 'model.h5' - confirm both are intended.
joblib.dump(rf_reg, 'model.h5')