## 머신러닝 Pipeline + Optuna
- 이전 강의들에서 결측치 처리, 모델 학습 등을 진행하였다. 
- 이때 Pipeline은 결측치 처리, 스케일링, 모델 학습, 하이퍼파라미터 튜닝 등의 단계를 최소한의 코드로 쉽게 연결할 수 있도록 도와준다. 
- 본 예제에서는 scikit-learn pipeline에 대해 학습할 예정이다. 

## 필수 라이브러리 불러오기
- 본 튜토리얼에 적합한 주요 라이브러리들을 불러온다. 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import set_config
import optuna
import mlflow

import xgboost
from xgboost import XGBRegressor

# Report the installed version of every core library used in this notebook.
for module in (np, pd, sns, sklearn, optuna, xgboost):
    print(f"{module.__name__} version {module.__version__}")

numpy version 1.23.1
pandas version 1.4.3
seaborn version 0.11.2
sklearn version 1.1.1
optuna version 3.0.0b1
xgboost version 1.6.1


## 데이터 불러오기
- 본 실습을 위해 간단한 데이터를 불러온다. 

In [2]:
# Directory that holds the bike-sharing-demand CSV files.
# NOTE(review): machine-specific absolute Windows path; adjust before running elsewhere.
# Alternative location used previously:
#   "C:\\Users\\human\\Desktop\\mlops_tutorial\\data\\bike-sharing-demand\\"
DATA_PATH = "C:\\Users\\j2hoo\\OneDrive\\Desktop\\mlops_tutorial\\data\\bike-sharing-demand\\"

# Read the training set, the test set and the sample submission in one pass.
train_df, test, submission = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

# Last expression of the cell: the notebook displays the three shapes.
train_df.shape, test.shape, submission.shape

((10886, 12), (6493, 9), (6493, 2))

## 데이터 전처리
- 데이터 전처리는 기존 강의와 비슷하게 진행한다. 
- 단, 범주형 변수는 여기서 인코딩하지 않고 그대로 둔다. (원-핫 인코딩은 이후 파이프라인에서 수행한다.) 

In [3]:
# Drop the training rows whose weather code is 4 before separating the target.
# NOTE(review): presumably removed because that category is too rare — confirm.
train_df = train_df[train_df['weather'] != 4]

# Keep the target separately.
y = train_df['count']

# Remove the target column from the features.
train_df = train_df.drop(['count'], axis=1)

# Stack train and test so both receive identical feature engineering.
all_df = pd.concat([train_df, test])

# Derive calendar features from the timestamp column.
all_df['date'] = pd.to_datetime(all_df['datetime'])
all_df['year'] = all_df['date'].dt.year
all_df['month'] = all_df['date'].dt.month
all_df['day'] = all_df['date'].dt.day
all_df['hour'] = all_df['date'].dt.hour
all_df['weekday'] = all_df['date'].dt.day_name()

# Replace numeric category codes with labels ahead of one-hot encoding.
season_num = [1, 2, 3, 4]
season_str = ['Spring', 'Summer', 'Fall', 'Winter']
all_df['season'] = all_df['season'].replace(season_num, season_str)

weather_num = [1, 2, 3, 4]
weather_str = ['Clear', 'Few Clouds', 'Light Snow, Rain', 'Heavy Snow, Rain']
# Weather codes must be mapped with the weather labels, not the season labels.
all_df['weather'] = all_df['weather'].replace(weather_num, weather_str)

# Columns that are not used as model inputs; the raw timestamp columns are
# dropped once the calendar features have been derived from them.
del_features = ['casual', 'registered', 'datetime', 'date', 'windspeed', 'month', 'atemp']
all_df = all_df.drop(del_features, axis=1)

all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17378 entries, 0 to 6492
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      17378 non-null  object 
 1   holiday     17378 non-null  int64  
 2   workingday  17378 non-null  int64  
 3   weather     17378 non-null  object 
 4   temp        17378 non-null  float64
 5   humidity    17378 non-null  int64  
 6   year        17378 non-null  int64  
 7   day         17378 non-null  int64  
 8   hour        17378 non-null  int64  
 9   weekday     17378 non-null  object 
dtypes: float64(1), int64(6), object(3)
memory usage: 1.5+ MB


## 데이터셋 분리

In [4]:
# all_df was built as [train rows, test rows]; the first len(y) rows are the
# training rows and everything after them is the test set.
n_train_rows = len(y)
train = all_df.iloc[:n_train_rows, :]
test = all_df.iloc[n_train_rows:, :]
train.shape, test.shape

((10885, 10), (6493, 10))

In [5]:
# Hold out 30% of the training rows for validation (shuffled, fixed seed).
split_options = {"test_size": 0.3, "shuffle": True, "random_state": 42}
X_train, X_valid, y_train, y_valid = train_test_split(train, y, **split_options)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((7619, 10), (3266, 10), (7619,), (3266,))

## 데이터 전처리 파이프라인 만들기
- (1) 수치형 변수는 StandardScaler() 클래스로 표준화한 뒤 PCA()를 적용한다. 
- (2) 범주형 변수는 OneHotEncoder()로 원-핫 인코딩하는 데이터 전처리 파이프라인을 만든다. 

In [6]:
# (1) Numeric columns: standardize, then apply PCA.
num_features = ["temp", "humidity", "hour"]
numeric_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
]
num_transformer = Pipeline(steps=numeric_steps)

# (2) Categorical columns: one-hot encode; categories unseen during fit are
# ignored instead of raising (handle_unknown="ignore").
cat_features = ["season", "weather", "weekday"]
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# (3) Route each column group to its transformer.
# NOTE(review): columns not listed above (holiday, workingday, year, day) are
# not passed to any transformer; with ColumnTransformer's default `remainder`
# they are dropped — confirm this is intended.
column_routes = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## Optuna
- Optuna를 적용하기 전에, 비교를 위해 그리드 서치(GridSearchCV)를 먼저 복습한다.

In [8]:
# Render estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram')

# (4) Full model: preprocessing followed by an XGBoost regressor.
model_steps = [
    ("preprocessor", preprocessor),
    ("xgbrg", XGBRegressor()),
]
pipe_reg_model = Pipeline(steps=model_steps)

# (5) Grid search over the regressor's hyperparameters; the "xgbrg__" prefix
# addresses the pipeline step of that name.
param_grid = dict(
    xgbrg__max_depth=[2, 7, 10],
    xgbrg__n_estimators=[10, 500],
)

gridCV = GridSearchCV(estimator=pipe_reg_model, param_grid=param_grid, cv=2)

# Fit on the log-transformed target.
log_y = np.log(y_train)
gridCV.fit(X_train, log_y)

## 평가지표 구현

In [9]:
def rmsle(y, y_, convertExp=True):
    """Return the root mean squared logarithmic error between two value sets.

    Args:
        y: Actual values (scalar or array-like).
        y_: Predicted values (scalar or array-like), positionally aligned
            with ``y``.
        convertExp: When True, both inputs are treated as log-scale values
            and are exponentiated before the error is computed.

    Returns:
        The square root of the mean squared difference of ``log(v + 1)``
        between actual and predicted values.
    """
    # Work on plain float arrays so pandas objects are compared by position
    # and scalars are accepted as well.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)

    # Undo the log transform of the inputs when requested.
    if convertExp:
        actual = np.exp(actual)
        predicted = np.exp(predicted)

    # log(v + 1); NaN results (e.g. from v < -1) are replaced with 0.
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))

    squared_diff = (log_actual - log_predicted) ** 2
    return np.sqrt(np.mean(squared_diff))

## Optuna
- Pipeline과 Optuna를 MLflow와 연동하는 코드를 작성한다. 
    + 참고 : https://hvy-optuna.readthedocs.io/en/latest/reference/generated/optuna.integration.MLflowCallback.html
    

In [14]:
from optuna.integration.mlflow import MLflowCallback

# (1) Numeric columns: StandardScaler followed by PCA.
num_features = ["temp", "humidity", "hour"]
num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
])

# (2) Categorical columns: one-hot encoding that ignores unseen categories.
cat_features = ["season", "weather", "weekday"]
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# (3) Combine both branches into a single column-wise preprocessor.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

# Optuna -> MLflow bridge: the study's objective value is recorded under the
# metric name 'rmsle', using 'mlruns' as the tracking URI.
mlflow_callback_options = {"tracking_uri": 'mlruns', "metric_name": 'rmsle'}
mlflc = MLflowCallback(**mlflow_callback_options)

# Optuna 모델 정의
def create_model(trial):
    """Build the preprocessing + XGBoost pipeline for one Optuna trial.

    Samples ``eta``, ``max_depth`` and ``subsample`` from the trial, logs
    them as MLflow parameters, and returns the unfitted pipeline.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted ``Pipeline`` chaining the module-level ``preprocessor``
        with an ``XGBRegressor`` configured with the sampled values.
    """
    # suggest_float replaces the deprecated suggest_uniform (which emits a
    # FutureWarning on every trial); the sampling ranges are unchanged.
    params = {
        'eta': trial.suggest_float('eta', 0.0001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
    }

    # (4) Model definition: shared preprocessor + regressor with a fixed seed.
    pipe_reg_model = Pipeline(
        steps=[("preprocessor", preprocessor), ("xgbrg", XGBRegressor(**params, random_state=42))]
    )

    # Record the sampled hyperparameters in the current MLflow run.
    for param_name, param_value in params.items():
        mlflow.log_param(param_name, param_value)

    return pipe_reg_model

@mlflc.track_in_mlflow()
def objective(trial):
    """Train one trial's pipeline and return its validation RMSLE.

    The model is fitted on the log-transformed training target and scored on
    the log-transformed validation target. The R2 score and the fitted model
    are logged to MLflow from inside this function.

    Args:
        trial: Optuna trial passed through to ``create_model``.

    Returns:
        The RMSLE on the validation split (the value the study optimizes).
    """
    # Log-transform the targets.
    log_y_train = np.log(y_train)
    log_y_valid = np.log(y_valid)

    model = create_model(trial)
    model.fit(X_train, log_y_train)

    # Predictions are on the log scale.
    preds = model.predict(X_valid)

    # Compute each metric once and reuse it for printing, logging and return.
    rmsle_score = rmsle(log_y_valid, preds, True)
    r2 = r2_score(log_y_valid, preds)

    print("RMSLE 값:", rmsle_score)
    print("r2 Score 값:", r2)
    mlflow.log_metric("r2_score", r2)

    mlflow.sklearn.log_model(model, 'xgboost')

    # No explicit mlflow.end_run(): the run opened by track_in_mlflow is
    # closed by the decorator once this function returns.
    return rmsle_score


  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()


In [15]:
study = optuna.create_study(study_name='bike_sharing_regression', direction='minimize')
study.optimize(objective, n_trials=10, callbacks=[mlflc])

[32m[I 2022-08-23 00:01:51,431][0m A new study created in memory with name: bike_sharing_regression[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.6432171285074844
r2 Score 값: 0.7891103005611851


[32m[I 2022-08-23 00:01:55,966][0m Trial 0 finished with value: 0.6432171285074844 and parameters: {'eta': 0.03617187314262316, 'max_depth': 6, 'subsample': 0.8805333181585354}. Best is trial 0 with value: 0.6432171285074844.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.8678009800006087
r2 Score 값: 0.6147135119266067


[32m[I 2022-08-23 00:02:00,088][0m Trial 1 finished with value: 0.8678009800006087 and parameters: {'eta': 0.07262202509710443, 'max_depth': 2, 'subsample': 0.6929956484119242}. Best is trial 0 with value: 0.6432171285074844.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.49204755938428263
r2 Score 값: 0.8737429441732658


[32m[I 2022-08-23 00:02:04,874][0m Trial 2 finished with value: 0.49204755938428263 and parameters: {'eta': 0.07551678742246395, 'max_depth': 10, 'subsample': 0.5106844705557261}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5573186634588694
r2 Score 값: 0.8400836963748929


[32m[I 2022-08-23 00:02:09,428][0m Trial 3 finished with value: 0.5573186634588694 and parameters: {'eta': 0.057269515790783605, 'max_depth': 7, 'subsample': 0.7067330641711635}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5988585827919581
r2 Score 값: 0.8157383543399295


[32m[I 2022-08-23 00:02:14,067][0m Trial 4 finished with value: 0.5988585827919581 and parameters: {'eta': 0.06323502627469954, 'max_depth': 6, 'subsample': 0.5837340444193186}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.7381309091014998
r2 Score 값: 0.721514906668294


[32m[I 2022-08-23 00:02:18,017][0m Trial 5 finished with value: 0.7381309091014998 and parameters: {'eta': 0.05195718502482184, 'max_depth': 4, 'subsample': 0.7968614890042334}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5030746706919359
r2 Score 값: 0.8679919656955828


[32m[I 2022-08-23 00:02:22,761][0m Trial 6 finished with value: 0.5030746706919359 and parameters: {'eta': 0.07472418935559826, 'max_depth': 9, 'subsample': 0.7737838104723862}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.6686598625750326
r2 Score 값: 0.7711130548581518


[32m[I 2022-08-23 00:02:27,050][0m Trial 7 finished with value: 0.6686598625750326 and parameters: {'eta': 0.057320598679274797, 'max_depth': 5, 'subsample': 0.7094518213948884}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.9115814462645843
r2 Score 값: 0.576689158473394


[32m[I 2022-08-23 00:02:31,365][0m Trial 8 finished with value: 0.9115814462645843 and parameters: {'eta': 0.03871960576424438, 'max_depth': 2, 'subsample': 0.660727769278395}. Best is trial 2 with value: 0.49204755938428263.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 1.1803483110014465
r2 Score 값: 0.3366405074264698


[32m[I 2022-08-23 00:02:36,354][0m Trial 9 finished with value: 1.1803483110014465 and parameters: {'eta': 0.014560773048376231, 'max_depth': 9, 'subsample': 0.537909061324295}. Best is trial 2 with value: 0.49204755938428263.[0m


- 이번에는 여러 모델(DecisionTree, RandomForest, XGBoost)을 탐색하는 소스코드를 MLflow에 추가해본다. 

In [11]:
# 

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()


In [None]:
# 

[32m[I 2022-07-27 00:10:50,408][0m A new study created in memory with name: bike_sharing_regression[0m


RMSLE 값 : 0.778078099833077
r2 Score 값: 0.6909895046767656


[32m[I 2022-07-27 00:10:53,173][0m Trial 0 finished with value: 0.778078099833077 and parameters: {'regressor': 'DecisionTree', 'max_depth': 7}. Best is trial 0 with value: 0.778078099833077.[0m


RMSLE 값 : 0.8194296708318933
r2 Score 값: 0.6576367178825051


[32m[I 2022-07-27 00:10:56,222][0m Trial 1 finished with value: 0.8194296708318933 and parameters: {'regressor': 'DecisionTree', 'max_depth': 6}. Best is trial 0 with value: 0.778078099833077.[0m


RMSLE 값 : 0.6488401161002054
r2 Score 값: 0.7831433481751126


[32m[I 2022-07-27 00:11:58,741][0m Trial 2 finished with value: 0.6488401161002054 and parameters: {'regressor': 'RandomForest', 'max_depth': 8, 'n_estimators': 2982}. Best is trial 2 with value: 0.6488401161002054.[0m
  'eta':trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample':trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값 : 0.8702494097122431
r2 Score 값: 0.6121810041947033


[32m[I 2022-07-27 00:12:03,450][0m Trial 3 finished with value: 0.8702494097122431 and parameters: {'regressor': 'XGBoost', 'eta': 0.07960817475292989, 'max_depth': 2, 'subsample': 0.860632550262914}. Best is trial 2 with value: 0.6488401161002054.[0m
  'eta':trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample':trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값 : 0.6226168520695903
r2 Score 값: 0.8005569363914146


[32m[I 2022-07-27 00:12:08,478][0m Trial 4 finished with value: 0.6226168520695903 and parameters: {'regressor': 'XGBoost', 'eta': 0.044210080735759875, 'max_depth': 6, 'subsample': 0.8119700490366031}. Best is trial 4 with value: 0.6226168520695903.[0m
  'eta':trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample':trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값 : 0.9639943675697076
r2 Score 값: 0.5386467178266348


[32m[I 2022-07-27 00:12:12,675][0m Trial 5 finished with value: 0.9639943675697076 and parameters: {'regressor': 'XGBoost', 'eta': 0.022215842063315394, 'max_depth': 3, 'subsample': 0.8813381579107818}. Best is trial 4 with value: 0.6226168520695903.[0m


RMSLE 값 : 0.590432402979249
r2 Score 값: 0.8199675288759712


[32m[I 2022-07-27 00:12:23,667][0m Trial 6 finished with value: 0.590432402979249 and parameters: {'regressor': 'RandomForest', 'max_depth': 10, 'n_estimators': 251}. Best is trial 6 with value: 0.590432402979249.[0m


RMSLE 값 : 0.690214605972267
r2 Score 값: 0.7550400563793147


[32m[I 2022-07-27 00:12:34,826][0m Trial 7 finished with value: 0.690214605972267 and parameters: {'regressor': 'RandomForest', 'max_depth': 7, 'n_estimators': 383}. Best is trial 6 with value: 0.590432402979249.[0m


RMSLE 값 : 0.5874724703430075
r2 Score 값: 0.8216987962566236


[32m[I 2022-07-27 00:13:25,051][0m Trial 8 finished with value: 0.5874724703430075 and parameters: {'regressor': 'RandomForest', 'max_depth': 10, 'n_estimators': 2490}. Best is trial 8 with value: 0.5874724703430075.[0m
  'eta':trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample':trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값 : 0.916990108730916
r2 Score 값: 0.590819903780663


[32m[I 2022-07-27 00:13:28,238][0m Trial 9 finished with value: 0.916990108730916 and parameters: {'regressor': 'XGBoost', 'eta': 0.019073739937408372, 'max_depth': 6, 'subsample': 0.6142990449920527}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.5876992853278866
r2 Score 값: 0.8215604271013629


[32m[I 2022-07-27 00:14:21,170][0m Trial 10 finished with value: 0.5876992853278866 and parameters: {'regressor': 'RandomForest', 'max_depth': 10, 'n_estimators': 2670}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.587680089320982
r2 Score 값: 0.821559037973576


[32m[I 2022-07-27 00:15:17,543][0m Trial 11 finished with value: 0.587680089320982 and parameters: {'regressor': 'RandomForest', 'max_depth': 10, 'n_estimators': 2850}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.6152751050606429
r2 Score 값: 0.8047443238627244


[32m[I 2022-07-27 00:15:54,773][0m Trial 12 finished with value: 0.6152751050606429 and parameters: {'regressor': 'RandomForest', 'max_depth': 9, 'n_estimators': 2058}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.6152463130127642
r2 Score 값: 0.8047635694811344


[32m[I 2022-07-27 00:16:33,520][0m Trial 13 finished with value: 0.6152463130127642 and parameters: {'regressor': 'RandomForest', 'max_depth': 9, 'n_estimators': 2125}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.8329563222472344
r2 Score 값: 0.6453369127701732


[32m[I 2022-07-27 00:16:56,176][0m Trial 14 finished with value: 0.8329563222472344 and parameters: {'regressor': 'RandomForest', 'max_depth': 4, 'n_estimators': 2329}. Best is trial 8 with value: 0.5874724703430075.[0m


RMSLE 값 : 0.5874297313511178
r2 Score 값: 0.8217142830641939


[32m[I 2022-07-27 00:17:23,452][0m Trial 15 finished with value: 0.5874297313511178 and parameters: {'regressor': 'RandomForest', 'max_depth': 10, 'n_estimators': 1337}. Best is trial 15 with value: 0.5874297313511178.[0m


RMSLE 값 : 0.7534747424195544
r2 Score 값: 0.7084252138868223


[32m[I 2022-07-27 00:17:26,150][0m Trial 16 finished with value: 0.7534747424195544 and parameters: {'regressor': 'DecisionTree', 'max_depth': 8}. Best is trial 15 with value: 0.5874297313511178.[0m


RMSLE 값 : 0.6151452785580223
r2 Score 값: 0.8048290805492865


[32m[I 2022-07-27 00:17:47,386][0m Trial 17 finished with value: 0.6151452785580223 and parameters: {'regressor': 'RandomForest', 'max_depth': 9, 'n_estimators': 1154}. Best is trial 15 with value: 0.5874297313511178.[0m


RMSLE 값 : 0.8335048524294792
r2 Score 값: 0.6448865673932174


[32m[I 2022-07-27 00:18:01,151][0m Trial 18 finished with value: 0.8335048524294792 and parameters: {'regressor': 'RandomForest', 'max_depth': 4, 'n_estimators': 1380}. Best is trial 15 with value: 0.5874297313511178.[0m


RMSLE 값 : 0.7534747424195544
r2 Score 값: 0.7084252138868223


[32m[I 2022-07-27 00:18:03,724][0m Trial 19 finished with value: 0.7534747424195544 and parameters: {'regressor': 'DecisionTree', 'max_depth': 8}. Best is trial 15 with value: 0.5874297313511178.[0m


In [None]:
# 