## 머신러닝 Pipeline + Optuna
- 이전 강의들에서 결측치 처리, 모델 학습 등을 진행하였다. 
- 이때 scikit-learn의 Pipeline은 결측치 처리, 스케일링, 하이퍼 파라미터 튜닝 등의 단계를 최소한의 코드로 쉽게 연결할 수 있도록 도와준다. 
- 본 예제에서는 scikit-learn pipeline에 대해 학습할 예정이다. 

## 필수 라이브러리 불러오기
- 본 튜토리얼에 적합한 주요 라이브러리들을 불러온다. 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import set_config
import optuna
import mlflow

import xgboost
from xgboost import XGBRegressor

# Report the version of each key library used in this notebook.
for label, module in (
    ("numpy", np),
    ("pandas", pd),
    ("seaborn", sns),
    ("sklearn", sklearn),
    ("optuna", optuna),
    ("xgboost", xgboost),
):
    print(f"{label} version {module.__version__}")

numpy version 1.23.1
pandas version 1.4.3
seaborn version 0.11.2
sklearn version 1.1.1
optuna version 3.0.0b1
xgboost version 1.6.1


## 데이터 불러오기
- 본 실습을 위해 간단한 데이터를 불러온다. 

In [10]:
import os
# Display the current working directory of the notebook kernel.
os.getcwd()

'C:\\Users\\j2hoo\\OneDrive\\Desktop\\ml_optuna_mlflow'

In [11]:
# DATA_PATH = "C:\\Users\\human\\Desktop\\mlops_tutorial\\data\\bike-sharing-demand\\"
DATA_PATH = "C:\\Users\\j2hoo\\OneDrive\\Desktop\\ml_optuna_mlflow\\data\\bike-sharing-demand\\"

# Read the training set, the test set and the sample submission, in that order.
train_df, test, submission = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

# Show the shape of each loaded frame.
train_df.shape, test.shape, submission.shape

((10886, 12), (6493, 9), (6493, 2))

## 데이터 전처리
- 데이터 전처리는 기존 강의와 비슷하게 진행한다. 
- 단, 범주형 변수는 여기서 인코딩하지 않고 그대로 두며, 원-핫 인코딩은 이후 전처리 파이프라인에서 수행한다. 

In [4]:
# Drop the training rows whose weather code is 4.
train_df = train_df[train_df['weather'] != 4]

# Keep the target separately, then remove it from the feature frame.
y = train_df['count']
train_df = train_df.drop(['count'], axis=1)

# Stack train and test so both receive the same feature engineering.
all_df = pd.concat([train_df, test])

# Derive calendar features from the timestamp column.
all_df['date'] = pd.to_datetime(all_df['datetime'])
all_df['year'] = all_df['date'].dt.year
all_df['month'] = all_df['date'].dt.month
all_df['day'] = all_df['date'].dt.day
all_df['hour'] = all_df['date'].dt.hour
all_df['weekday'] = all_df['date'].dt.day_name()

# Replace numeric codes with readable labels ahead of one-hot encoding.
season_num = [1, 2, 3, 4]
season_str = ['Spring', 'Summer', 'Fall', 'Winter']
all_df['season'] = all_df['season'].replace(season_num, season_str)

# Weather codes are mapped with the weather labels (not the season labels).
weather_num = [1, 2, 3, 4]
weather_str = ['Clear', 'Few Clouds', 'Light Snow, Rain', 'Heavy Snow, Rain']
all_df['weather'] = all_df['weather'].replace(weather_num, weather_str)

# Remove the columns that are not used as model inputs.
del_features = ['casual', 'registered', 'datetime', 'date', 'windspeed', 'month', 'atemp']
all_df = all_df.drop(del_features, axis=1)

all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17378 entries, 0 to 6492
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      17378 non-null  object 
 1   holiday     17378 non-null  int64  
 2   workingday  17378 non-null  int64  
 3   weather     17378 non-null  object 
 4   temp        17378 non-null  float64
 5   humidity    17378 non-null  int64  
 6   year        17378 non-null  int64  
 7   day         17378 non-null  int64  
 8   hour        17378 non-null  int64  
 9   weekday     17378 non-null  object 
dtypes: float64(1), int64(6), object(3)
memory usage: 1.5+ MB


## 데이터셋 분리

In [5]:
# The first len(y) rows of all_df are the training rows; the remainder is the test set.
train, test = all_df.iloc[:len(y)], all_df.iloc[len(y):]
train.shape, test.shape

((10885, 10), (6493, 10))

In [6]:
# Hold out 30% of the training rows for validation (shuffled, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((7619, 10), (3266, 10), (7619,), (3266,))

## 데이터 전처리 파이프라인 만들기
- (1) 수치형 변수는 StandardScaler() 클래스로 표준화한 뒤 PCA()를 적용한다. 
- (2) 범주형 변수는 OneHotEncoder() 클래스로 원-핫 인코딩하고, 두 변환을 ColumnTransformer로 묶어 데이터 전처리 파이프라인을 만든다. 

In [7]:
# (1) Numeric columns: standardize with StandardScaler(), then apply PCA().
num_features = ["temp", "humidity", "hour"]
num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
])

# (2) Categorical columns: one-hot encode with OneHotEncoder().
cat_features = ["season", "weather", "weekday"]
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# (3) ColumnTransformer: send each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

## Optuna
- 그리드 서치 리뷰

In [7]:
# Render estimators as diagrams in the notebook output.
set_config(display='diagram')

# (4) Model: the preprocessing step followed by an XGBoost regressor.
pipe_reg_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("xgbrg", XGBRegressor()),
])

# (5) Grid search over the regressor's max_depth and n_estimators, with 2-fold CV.
param_grid = {
    'xgbrg__max_depth': [2, 7, 10],
    'xgbrg__n_estimators': [10, 500],
}
gridCV = GridSearchCV(pipe_reg_model, param_grid=param_grid, cv=2)

# Fit against the log-transformed target.
log_y = np.log(y_train)
gridCV.fit(X_train, log_y)

## 평가지표 구현

In [8]:
def rmsle(y, y_, convertExp=True):
    """Return the root mean squared logarithmic error (RMSLE).

    Args:
        y: Actual values (array-like).
        y_: Predicted values (array-like).
        convertExp: When True, both inputs are exponentiated first, i.e. they
            are treated as values that were log-transformed beforehand.

    Returns:
        The RMSLE as a floating point scalar.
    """
    # Work on plain arrays so the subtraction below is purely positional.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)

    # Undo the log transform when requested.
    if convertExp:
        actual = np.exp(actual)
        predicted = np.exp(predicted)

    # log(1 + v); NaN results are replaced with 0.
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))

    return np.sqrt(np.mean((log_actual - log_predicted) ** 2))

## Optuna
- Pipeline과 Optuna를 MLflow와 연동하는 코드를 작성한다. 
    + 참고 : https://hvy-optuna.readthedocs.io/en/latest/reference/generated/optuna.integration.MLflowCallback.html
    

In [9]:
from optuna.integration.mlflow import MLflowCallback

# (1) Numeric columns: standardize with StandardScaler(), then apply PCA().
num_features = ["temp", "humidity", "hour"]
num_transformer = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
])

# (2) Categorical columns: one-hot encode with OneHotEncoder().
cat_features = ["season", "weather", "weekday"]
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# (3) ColumnTransformer: send each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

# Optuna -> MLflow callback: tracking URI 'mlruns', metric name 'rmsle'.
mlflc = MLflowCallback(metric_name='rmsle', tracking_uri='mlruns')

# Optuna 모델 정의
def create_model(trial):
    """Build the preprocessing + XGBoost pipeline for one Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted Pipeline combining the module-level ``preprocessor`` with
        an ``XGBRegressor`` configured with the sampled hyperparameters.
    """
    # suggest_float is the replacement for the deprecated suggest_uniform.
    params = {
        'eta': trial.suggest_float('eta', 0.0001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
    }

    # (4) Model definition
    pipe_reg_model = Pipeline(
        steps=[("preprocessor", preprocessor), ("xgbrg", XGBRegressor(**params, random_state=42))]
    )

    # Record the sampled hyperparameters in MLflow.
    mlflow.log_params(params)

    return pipe_reg_model

@mlflc.track_in_mlflow()
def objective(trial):
    """Train one candidate pipeline and return its validation RMSLE.

    The model is fitted on the log-transformed training target and evaluated
    on the log-transformed validation target. The R2 score and the fitted
    model (artifact path 'xgboost') are logged to MLflow.

    Args:
        trial: Optuna trial passed on to ``create_model``.

    Returns:
        The RMSLE on the validation set; this is the value Optuna optimizes.
    """
    # Log-transform the targets.
    log_y_train = np.log(y_train)
    log_y_valid = np.log(y_valid)

    model = create_model(trial)
    model.fit(X_train, log_y_train)

    # Predictions are on the log scale.
    preds = model.predict(X_valid)

    # Compute each metric once and reuse it for printing, logging and the return value.
    rmsle_score = rmsle(log_y_valid, preds, True)
    r2 = r2_score(log_y_valid, preds)

    print("RMSLE 값:", rmsle_score)
    print("r2 Score 값:", r2)
    mlflow.log_metric("r2_score", r2)

    mlflow.sklearn.log_model(model, 'xgboost')

    # NOTE(review): track_in_mlflow appears to manage the run lifecycle itself,
    # so this explicit end_run() is likely redundant - confirm before removing.
    mlflow.end_run()

    return rmsle_score


  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()


In [10]:
# Minimize the objective (RMSLE) over 10 trials, reporting each trial through the MLflow callback.
study = optuna.create_study(
    direction='minimize',
    study_name='bike_sharing_regression',
)
study.optimize(objective, callbacks=[mlflc], n_trials=10)

[32m[I 2022-08-23 22:53:05,724][0m A new study created in memory with name: bike_sharing_regression[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5433891244813719
r2 Score 값: 0.8478656564809588


[32m[I 2022-08-23 22:53:09,856][0m Trial 0 finished with value: 0.5433891244813719 and parameters: {'eta': 0.07748432161862982, 'max_depth': 7, 'subsample': 0.5292005174932818}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 2.134492927168491
r2 Score 값: -1.18670784262574


[32m[I 2022-08-23 22:53:12,866][0m Trial 1 finished with value: 2.134492927168491 and parameters: {'eta': 0.007355173628468108, 'max_depth': 7, 'subsample': 0.503597007914984}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5994336744683513
r2 Score 값: 0.816974562212244


[32m[I 2022-08-23 22:53:16,002][0m Trial 2 finished with value: 0.5994336744683513 and parameters: {'eta': 0.0346824181402225, 'max_depth': 7, 'subsample': 0.5448133297782245}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.7702008511684635
r2 Score 값: 0.6987694933334118


[32m[I 2022-08-23 22:53:18,896][0m Trial 3 finished with value: 0.7702008511684635 and parameters: {'eta': 0.03510539433752137, 'max_depth': 4, 'subsample': 0.8985494608568947}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 1.257330550058946
r2 Score 값: 0.2464957975391281


[32m[I 2022-08-23 22:53:21,951][0m Trial 4 finished with value: 1.257330550058946 and parameters: {'eta': 0.013816130990303929, 'max_depth': 7, 'subsample': 0.7938744875376784}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5777442501306552
r2 Score 값: 0.8282040705359704


[32m[I 2022-08-23 22:53:24,919][0m Trial 5 finished with value: 0.5777442501306552 and parameters: {'eta': 0.08750969678544768, 'max_depth': 6, 'subsample': 0.6119245240601322}. Best is trial 0 with value: 0.5433891244813719.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5229016247049864
r2 Score 값: 0.8582304450720135


[32m[I 2022-08-23 22:53:28,121][0m Trial 6 finished with value: 0.5229016247049864 and parameters: {'eta': 0.06104359032799647, 'max_depth': 8, 'subsample': 0.6515345414413763}. Best is trial 6 with value: 0.5229016247049864.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.49830139271368373
r2 Score 값: 0.8703941453469299


[32m[I 2022-08-23 22:53:31,492][0m Trial 7 finished with value: 0.49830139271368373 and parameters: {'eta': 0.06942789741660357, 'max_depth': 9, 'subsample': 0.6976633960243437}. Best is trial 7 with value: 0.49830139271368373.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.5114619620426137
r2 Score 값: 0.8644294301499188


[32m[I 2022-08-23 22:53:34,721][0m Trial 8 finished with value: 0.5114619620426137 and parameters: {'eta': 0.048241383502509595, 'max_depth': 9, 'subsample': 0.7243472485484673}. Best is trial 7 with value: 0.49830139271368373.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.6528040340124577
r2 Score 값: 0.7819884963807099


[32m[I 2022-08-23 22:53:37,645][0m Trial 9 finished with value: 0.6528040340124577 and parameters: {'eta': 0.0758896628357744, 'max_depth': 5, 'subsample': 0.6904557284544302}. Best is trial 7 with value: 0.49830139271368373.[0m


- 이번에는 여러 모델(XGBoost, RandomForest, DecisionTree)을 하나의 Optuna 탐색에 포함하고, 그 결과를 MLflow에 기록하는 코드를 작성해본다. 

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Optuna -> MLflow callback: tracking URI 'mlruns', metric name 'rmsle'.
mlflc = MLflowCallback(metric_name='rmsle', tracking_uri='mlruns')

# Optuna 모델 정의
def create_model(trial):
    """Build a pipeline whose regressor type is itself chosen by the trial.

    The trial first picks one of 'XGBoost', 'RandomForest' or 'DecisionTree',
    then samples the hyperparameters that belong to that regressor.

    Args:
        trial: Optuna trial used to sample the regressor type and its
            hyperparameters.

    Returns:
        An unfitted Pipeline combining the module-level ``preprocessor`` with
        the selected regressor.
    """
    regressor_name = trial.suggest_categorical('regressor', ['XGBoost', 'RandomForest', 'DecisionTree'])

    # Each regressor type has its own search space and its own pipeline step name.
    if regressor_name == 'XGBoost':
        # suggest_float is the replacement for the deprecated suggest_uniform.
        params = {
            'eta': trial.suggest_float('eta', 0.0001, 0.1),
            'max_depth': trial.suggest_int('max_depth', 2, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        }
        regressor_step = ("xgbrg", XGBRegressor(**params, random_state=42))
    elif regressor_name == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
            'max_depth': trial.suggest_int('max_depth', 2, 10),
        }
        regressor_step = ("rf_reg", RandomForestRegressor(**params, random_state=42))
    else:
        params = {
            'max_depth': trial.suggest_int('max_depth', 2, 10),
        }
        regressor_step = ("dt_reg", DecisionTreeRegressor(**params, random_state=42))

    # (4) Model definition
    pipe_reg_model = Pipeline(steps=[("preprocessor", preprocessor), regressor_step])

    # Record the sampled hyperparameters in MLflow.
    mlflow.log_params(params)

    return pipe_reg_model

@mlflc.track_in_mlflow()
def objective(trial):
    """Train one candidate pipeline and return its validation RMSLE.

    The model is fitted on the log-transformed training target and evaluated
    on the log-transformed validation target. The R2 score and the fitted
    model are logged to MLflow.

    Args:
        trial: Optuna trial passed on to ``create_model``.

    Returns:
        The RMSLE on the validation set; this is the value Optuna optimizes.
    """
    # Log-transform the targets.
    log_y_train = np.log(y_train)
    log_y_valid = np.log(y_valid)

    model = create_model(trial)
    model.fit(X_train, log_y_train)

    # Predictions are on the log scale.
    preds = model.predict(X_valid)

    # Compute each metric once and reuse it for printing, logging and the return value.
    rmsle_score = rmsle(log_y_valid, preds, True)
    r2 = r2_score(log_y_valid, preds)

    print("RMSLE 값:", rmsle_score)
    print("r2 Score 값:", r2)
    mlflow.log_metric("r2_score", r2)

    # NOTE: the artifact path stays 'xgboost' whatever regressor was selected;
    # the model-loading cell of this notebook reads from that path.
    mlflow.sklearn.log_model(model, 'xgboost')

    # NOTE(review): track_in_mlflow appears to manage the run lifecycle itself,
    # so this explicit end_run() is likely redundant - confirm before removing.
    mlflow.end_run()

    return rmsle_score

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()


In [17]:
# Minimize the objective (RMSLE) over 30 trials, reporting each trial through the MLflow callback.
study = optuna.create_study(
    direction='minimize',
    study_name='bike_sharing_multi_model_regression',
)
study.optimize(objective, callbacks=[mlflc], n_trials=30)

[32m[I 2022-08-23 23:04:56,043][0m A new study created in memory with name: bike_sharing_multi_model_regression[0m
2022/08/23 23:04:56 INFO mlflow.tracking.fluent: Experiment with name 'bike_sharing_multi_model_regression' does not exist. Creating a new experiment.


RMSLE 값: 0.8273215509184015
r2 Score 값: 0.6524071987885736


[32m[I 2022-08-23 23:04:59,960][0m Trial 0 finished with value: 0.8273215509184015 and parameters: {'regressor': 'DecisionTree', 'max_depth': 6}. Best is trial 0 with value: 0.8273215509184015.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 1.0017586531040157
r2 Score 값: 0.498044645905105


[32m[I 2022-08-23 23:05:03,771][0m Trial 1 finished with value: 1.0017586531040157 and parameters: {'regressor': 'XGBoost', 'eta': 0.025464301055013547, 'max_depth': 2, 'subsample': 0.8894785083600223}. Best is trial 0 with value: 0.8273215509184015.[0m


RMSLE 값: 0.7816759799306753
r2 Score 값: 0.6894872490579149


[32m[I 2022-08-23 23:05:07,282][0m Trial 2 finished with value: 0.7816759799306753 and parameters: {'regressor': 'DecisionTree', 'max_depth': 7}. Best is trial 2 with value: 0.7816759799306753.[0m


RMSLE 값: 0.9092903381686565
r2 Score 값: 0.582104194475249


[32m[I 2022-08-23 23:05:10,846][0m Trial 3 finished with value: 0.9092903381686565 and parameters: {'regressor': 'DecisionTree', 'max_depth': 4}. Best is trial 2 with value: 0.7816759799306753.[0m


RMSLE 값: 0.7588159028200248
r2 Score 값: 0.7068425197186899


[32m[I 2022-08-23 23:05:14,399][0m Trial 4 finished with value: 0.7588159028200248 and parameters: {'regressor': 'DecisionTree', 'max_depth': 8}. Best is trial 4 with value: 0.7588159028200248.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.6357929145810279
r2 Score 값: 0.7939672497628094


[32m[I 2022-08-23 23:05:18,886][0m Trial 5 finished with value: 0.6357929145810279 and parameters: {'regressor': 'XGBoost', 'eta': 0.03672473010264438, 'max_depth': 6, 'subsample': 0.7304132822711364}. Best is trial 5 with value: 0.6357929145810279.[0m


RMSLE 값: 0.9785979600329554
r2 Score 값: 0.516748781370505


[32m[I 2022-08-23 23:05:22,658][0m Trial 6 finished with value: 0.9785979600329554 and parameters: {'regressor': 'DecisionTree', 'max_depth': 3}. Best is trial 5 with value: 0.6357929145810279.[0m


RMSLE 값: 0.5769715563616967
r2 Score 값: 0.8283325763643311


[32m[I 2022-08-23 23:06:07,323][0m Trial 7 finished with value: 0.5769715563616967 and parameters: {'regressor': 'RandomForest', 'n_estimators': 1777, 'max_depth': 10}. Best is trial 7 with value: 0.5769715563616967.[0m
  params = {'eta' : trial.suggest_uniform('eta', 0.0001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 0.9)


RMSLE 값: 0.6364118754739613
r2 Score 값: 0.7936901484446484


[32m[I 2022-08-23 23:06:22,104][0m Trial 8 finished with value: 0.6364118754739613 and parameters: {'regressor': 'XGBoost', 'eta': 0.03587832252190005, 'max_depth': 6, 'subsample': 0.5597404619997418}. Best is trial 7 with value: 0.5769715563616967.[0m


RMSLE 값: 0.7588159028200248
r2 Score 값: 0.7068425197186899


[32m[I 2022-08-23 23:06:28,680][0m Trial 9 finished with value: 0.7588159028200248 and parameters: {'regressor': 'DecisionTree', 'max_depth': 8}. Best is trial 7 with value: 0.5769715563616967.[0m


RMSLE 값: 0.5768926942016853
r2 Score 값: 0.8283889310047176


[32m[I 2022-08-23 23:07:37,461][0m Trial 10 finished with value: 0.5768926942016853 and parameters: {'regressor': 'RandomForest', 'n_estimators': 1946, 'max_depth': 10}. Best is trial 10 with value: 0.5768926942016853.[0m


RMSLE 값: 0.5770503647525148
r2 Score 값: 0.828289625690025


[32m[I 2022-08-23 23:08:19,735][0m Trial 11 finished with value: 0.5770503647525148 and parameters: {'regressor': 'RandomForest', 'n_estimators': 1804, 'max_depth': 10}. Best is trial 10 with value: 0.5768926942016853.[0m


RMSLE 값: 0.5768867842348688
r2 Score 값: 0.8283951435185044


[32m[I 2022-08-23 23:09:25,265][0m Trial 12 finished with value: 0.5768867842348688 and parameters: {'regressor': 'RandomForest', 'n_estimators': 1968, 'max_depth': 10}. Best is trial 12 with value: 0.5768867842348688.[0m


RMSLE 값: 0.6043766003370089
r2 Score 값: 0.8119578508466199


[32m[I 2022-08-23 23:10:51,482][0m Trial 13 finished with value: 0.6043766003370089 and parameters: {'regressor': 'RandomForest', 'n_estimators': 2898, 'max_depth': 9}. Best is trial 12 with value: 0.5768867842348688.[0m


RMSLE 값: 0.5768634610865644
r2 Score 값: 0.8284045690016143


[32m[I 2022-08-23 23:11:44,165][0m Trial 14 finished with value: 0.5768634610865644 and parameters: {'regressor': 'RandomForest', 'n_estimators': 2069, 'max_depth': 10}. Best is trial 14 with value: 0.5768634610865644.[0m


RMSLE 값: 0.6380916890740257
r2 Score 값: 0.7907382788557517


[32m[I 2022-08-23 23:11:55,178][0m Trial 15 finished with value: 0.6380916890740257 and parameters: {'regressor': 'RandomForest', 'n_estimators': 437, 'max_depth': 8}. Best is trial 14 with value: 0.5768634610865644.[0m


RMSLE 값: 0.6043901226184247
r2 Score 값: 0.8119492790600271


[32m[I 2022-08-23 23:12:43,987][0m Trial 16 finished with value: 0.6043901226184247 and parameters: {'regressor': 'RandomForest', 'n_estimators': 2634, 'max_depth': 9}. Best is trial 14 with value: 0.5768634610865644.[0m
[33m[W 2022-08-23 23:12:45,755][0m Trial 17 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "C:\Users\j2hoo\OneDrive\Desktop\ml_optuna_mlflow\venv\lib\site-packages\optuna\study\_optimize.py", line 207, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\j2hoo\OneDrive\Desktop\ml_optuna_mlflow\venv\lib\site-packages\optuna\integration\mlflow.py", line 214, in wrapper
    return func(trial)
  File "C:\Users\j2hoo\AppData\Local\Temp\ipykernel_2516\1909800325.py", line 68, in objective
    model.fit(X_train, log_y_train)
  File "C:\Users\j2hoo\OneDrive\Desktop\ml_optuna_mlflow\venv\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)


KeyboardInterrupt: 

In [14]:
# Load the model that was logged under the 'xgboost' artifact path of this run.
ml_run_id = 'runs:/eaa03bc57f224a6d95c7c85c7f29c7ef/xgboost'
loaded_model = mlflow.sklearn.load_model(ml_run_id)

# The logged model is the whole pipeline (preprocessing + regressor), so it is
# applied to the test features directly; the preprocessor is not re-fitted here.
final_preds = loaded_model.predict(test)

# The models were fitted on log(count), so convert the predictions back.
submission['count'] = np.exp(final_preds)
submission.to_csv(DATA_PATH + 'submission.csv', index=False)

In [15]:
# Read the saved submission file back and preview its first rows.
submission2 = pd.read_csv(f"{DATA_PATH}submission.csv")
submission2.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,12.069974
1,2011-01-20 01:00:00,7.462033
2,2011-01-20 02:00:00,6.319709
3,2011-01-20 03:00:00,4.516503
4,2011-01-20 04:00:00,3.660737
