<font color="#CC3D3D"><p>
# ML Pipeline: Hyperparameter Tuning using Pipeline+Optuna

<font color="blue"><p>
#### 모형개발 절차
1. 수치형 피처
 - 결측값처리: SimpleImputer(strategy=`???`)
 - 이상값처리: FunctionTransformer()
 - 스케일링: StandardScaler()
2. 범주형 피처
 - 결측값처리: SimpleImputer(strategy="most_frequent")
 - 인코딩: OneHotEncoder(handle_unknown="ignore")
 - 차원축소: `MyPCATransformer()` # Custom PCA    
3. 공통
 - Feature Selection: SelectPercentile(percentile=`???`)
 - Modeling: Logistic Regression(C=`???`)
 - Hyperparameter Optimization: `OptunaSearchCV`

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder  # scikit-learn과 호환됨
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import set_config

import optuna

#### Load data

In [2]:
# Read the training CSV from the working directory and preview the first rows.
csv_path = 'allstate_train.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10000000,1,0,0,08:35,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,2,633
1,10000000,2,0,0,08:38,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630
2,10000000,3,0,0,08:38,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630
3,10000000,4,0,0,08:39,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630
4,10000000,5,0,0,11:55,IN,10001,2,0,2,...,1.0,2.0,1,0,2,2,1,2,1,630


#### 수치형/범주형 피처 분리 & 학습/평가 데이터 분할

In [3]:
# Columns routed to the numeric and to the categorical preprocessing branches.
numeric_features = [
    'group_size', 'car_age', 'age_oldest', 'age_youngest', 'duration_previous', 'cost',
]
categorical_features = [
    'day', 'homeowner', 'car_value', 'risk_factor', 'married_couple', 'C_previous', 'state', 'shopping_pt',
]

# Stratified hold-out split on the target column. Note that test_size=0.9
# leaves only 10% of the rows for training.
target = data['record_type']
features = data[numeric_features + categorical_features]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.9, stratify=target, random_state=0
)

####  파이프라인 구축: 수치형과 범주형 피처를 다르게 처리할 수 있는 ColumnTransformer를 활용

In [4]:
# One of the simplest ways to handle outliers: clip every column to its own
# 5th-95th percentile range.
def remove_outlier(X):
    """Clip each column of ``X`` to its 5% and 95% quantiles.

    Parameters
    ----------
    X : array-like accepted by ``pd.DataFrame``
        Input data; each column is treated independently.

    Returns
    -------
    numpy.ndarray
        The clipped values, with the same shape as ``X``.

    Notes
    -----
    The quantiles are computed from the data passed in on every call, so
    nothing is remembered between calls.
    """
    frame = pd.DataFrame(X)

    def _clip_column(column):
        # Bounds come from this column only.
        lower = column.quantile(.05)
        upper = column.quantile(.95)
        return column.clip(lower, upper)

    clipped = frame.apply(_clip_column, axis=0)
    return clipped.values

In [22]:
# Custom PCA transformer that chooses the number of components automatically.
class MyPCATransformer(TransformerMixin, BaseEstimator):
    """PCA whose dimensionality is derived from a cumulative explained-variance target.

    Parameters
    ----------
    sum_explained_variance : float, default=0.99
        Cumulative explained-variance ratio that the retained components
        must reach.

    Attributes
    ----------
    num_d : int
        Number of components kept; set by ``fit``.
    pca : PCA
        The fitted PCA with ``num_d`` components; set by ``fit``.
    """

    # Runs when the transformer is constructed, i.e. MyPCATransformer(...).
    def __init__(self, sum_explained_variance=0.99):
        # Only store the argument; all work happens in fit().
        self.sum_explained_variance = sum_explained_variance

    # Runs when fit() is called on the transformer.
    def fit(self, X, y=None):
        """Choose the number of components from ``X`` and fit the final PCA.

        Parameters
        ----------
        X : array-like with a two-element ``shape``
            Training data.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # PCA cannot extract more components than min(n_samples, n_features),
        # so bound the exploratory fit by both axes, not just the column count.
        max_d = min(X.shape)
        full_pca = PCA(n_components=max_d).fit(X)
        # Cumulative share of variance explained by the first k components.
        cumsum = np.cumsum(full_pca.explained_variance_ratio_)
        reached = cumsum >= self.sum_explained_variance
        if reached.any():
            # Smallest k whose cumulative ratio meets the target.
            self.num_d = int(np.argmax(reached)) + 1
        else:
            # np.argmax on an all-False mask returns 0, which would look like
            # "one component is enough"; keep every component instead.
            self.num_d = max_d
        self.pca = PCA(n_components=self.num_d)
        self.pca.fit(X)
        return self

    # Runs when transform() is called on the transformer.
    def transform(self, X):
        """Project ``X`` onto the components selected during ``fit``."""
        return self.pca.transform(X)
    
# 피처별로 ohe 하고 pca 하는 편이 성능이 좋을 수 있다 -> 파이프라인에서는 그렇게는 못한다

In [14]:
# Instantiate the custom transformer with a 0.95 cumulative explained-variance target.
pca = MyPCATransformer(sum_explained_variance=0.95)

객체 생성 완료


In [15]:
# Show the constructor argument that __init__ stored on the instance.
pca.sum_explained_variance

0.95

In [17]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Use whichever keyword the installed version
# exposes so the encoder returns a dense array (needed by the PCA step).
_ohe_dense_kwarg = "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"

# Numeric branch: impute -> clip outliers -> standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # FunctionTransformer wraps a plain function so it can be used as a
        # pipeline step that scikit-learn does not provide out of the box.
        ("outlier", FunctionTransformer(remove_outlier)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: impute -> one-hot encode (dense) -> reduce with custom PCA.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", **{_ohe_dense_kwarg: False})),
        ("pca", MyPCATransformer()),
    ]
)

# Apply each branch to its own column list and concatenate the results.
column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Shared steps applied to the combined feature matrix.
preprocessor = Pipeline(
    steps=[
        ("column", column_transformer),
        ("selector", SelectPercentile(percentile=50)),
    ]
)

# Full model: preprocessing followed by the classifier.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

객체 생성 완료


In [18]:
# Switch estimator display to the diagram representation, then show the pipeline
# as the cell output.
set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

#### 파이프라인을 통한 모형 학습

In [19]:
# Fit the whole pipeline on the training split, then report ROC AUC on the
# held-out split using the second probability column.
model.fit(X_train, y_train)
test_proba = model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_proba)
print("model score: %.3f" % test_auc)

객체 생성 완료
fitting
transform
transform
model score: 0.841


#### `파이프라인+Optuna`를 통한 하이퍼파라미터 최적화

In [20]:
%%time

param_distributions = {
    "preprocessor__column__num__imputer__strategy": optuna.distributions.CategoricalDistribution(["mean", "median"]), #범주형
    "preprocessor__selector__percentile": optuna.distributions.IntDistribution(50,100,step=10), #정수형
    "classifier__C": optuna.distributions.FloatDistribution(0.01, 100), #실수형
}

optuna_search = optuna.integration.OptunaSearchCV(model, param_distributions, cv=5, scoring='roc_auc', n_trials=20, 
                                                  study=optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize"))
optuna_search.fit(X_train, y_train)

[32m[I 2022-11-04 15:25:00,458][0m A new study created in memory with name: no-name-ad896b65-530a-49e5-8e55-e81cffb946ff[0m


객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:25:15,392][0m Trial 0 finished with value: 0.8427228117486727 and parameters: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__selector__percentile': 70, 'classifier__C': 84.47916547066717}. Best is trial 0 with value: 0.8427228117486727.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:25:31,347][0m Trial 1 finished with value: 0.8426591422139122 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 90, 'classifier__C': 82.58701698295371}. Best is trial 0 with value: 0.8427228117486727.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:25:47,581][0m Trial 2 finished with value: 0.8427546940089862 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 20.928120190497786}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:26:02,220][0m Trial 3 finished with value: 0.8426879325879109 and parameters: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__selector__percentile': 60, 'classifier__C': 97.8625922328899}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:26:17,557][0m Trial 4 finished with value: 0.8426529941957505 and parameters: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__selector__percentile': 90, 'classifier__C': 27.4146339666995}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:26:33,232][0m Trial 5 finished with value: 0.8426640677771099 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 90, 'classifier__C': 33.617833892588656}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:26:47,880][0m Trial 6 finished with value: 0.8422824869846315 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 50, 'classifier__C': 25.250111080949594}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:27:03,249][0m Trial 7 finished with value: 0.8426228114642397 and parameters: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__selector__percentile': 80, 'classifier__C': 60.384415858894926}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:27:17,352][0m Trial 8 finished with value: 0.8422755477209989 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 50, 'classifier__C': 89.04225222857315}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:27:33,051][0m Trial 9 finished with value: 0.8427293245482715 and parameters: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__selector__percentile': 100, 'classifier__C': 57.694380925009284}. Best is trial 2 with value: 0.8427546940089862.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:27:48,840][0m Trial 10 finished with value: 0.8436816325698281 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 0.16209216634076995}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:28:04,887][0m Trial 11 finished with value: 0.842917782245021 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 1.4752694528297232}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:28:20,800][0m Trial 12 finished with value: 0.8430007781128838 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 0.9531599168813898}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:28:35,582][0m Trial 13 finished with value: 0.8429894744518137 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 80, 'classifier__C': 0.6270256636639941}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:28:51,557][0m Trial 14 finished with value: 0.8427667306215565 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 11.166801403933569}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:29:07,566][0m Trial 15 finished with value: 0.8426634056408568 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 90, 'classifier__C': 38.36648105668739}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:29:23,335][0m Trial 16 finished with value: 0.8427571973613517 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 70, 'classifier__C': 10.965311656168248}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:29:39,043][0m Trial 17 finished with value: 0.8426397082104667 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 80, 'classifier__C': 45.20993428563923}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:29:54,643][0m Trial 18 finished with value: 0.8427617691161341 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 13.127870667622197}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform
transform
객체 생성 완료
객체 생성 완료
fitting
transform


[32m[I 2022-11-04 15:30:10,332][0m Trial 19 finished with value: 0.8426713603541325 and parameters: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 90, 'classifier__C': 16.5372229623126}. Best is trial 10 with value: 0.8436816325698281.[0m


transform
객체 생성 완료
객체 생성 완료
fitting
transform
CPU times: total: 12min 17s
Wall time: 5min 13s


In [21]:
# Report the best hyperparameters, their cross-validated score, and the ROC AUC
# on the held-out split. The label names the Optuna search that produced the
# estimator (it previously said "grid search").
print(f"Best params: {optuna_search.best_params_}")
print(f"Internal CV score: {optuna_search.best_score_:.3f}")
print("Test score from Optuna search: %.3f" % roc_auc_score(y_test, optuna_search.predict_proba(X_test)[:,1]))

Best params: {'preprocessor__column__num__imputer__strategy': 'median', 'preprocessor__selector__percentile': 100, 'classifier__C': 0.16209216634076995}
Internal CV score: 0.844
transform
Test score from grid search: 0.842


<font color="#CC3D3D"><p>
# End