<font color="#CC3D3D"><p>
# [Competition] Building a `XGBoost` Model with `Pipeline+Optuna`

<font color="blue"><p>
#### XGB모형 구축절차
1. 수치형 피처
 - 결측값처리: SimpleImputer(strategy=`???`)
 - 이상값처리: FunctionTransformer(remove_outlier, kw_args={'q':`???`})
 - 스케일링: PowerTransformer()
2. 범주형 피처
 - 결측값처리: SimpleImputer(strategy="most_frequent")
 - 인코딩: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)
3. 공통
 - Feature Selection: SelectPercentile(percentile=`???`)
 - Modeling: XGBoost(`???`)
 - Hyperparameter Optimization: `OptunaSearchCV`
 - OOF Prediction   

In [None]:
# Version tag for this notebook's run; it is embedded in the submission filename.
XGB_VERSION = 1.0

In [None]:
!pip install xgboost

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from xgboost import XGBRegressor
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV, ShapleyImportanceEvaluator
from sklearn.neural_network import MLPRegressor


#### Load data

In [None]:
# Training features (the ID column is dropped) and the Salary target.
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949')['Salary']

# Test features; the ID column is split off and kept for the submission file.
X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.pop('ID')

In [None]:
# Summary of the training features: column names, dtypes and non-null counts.
X_train.info()

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Distribution of the target (histogram + KDE on a density scale).
# sns.distplot is deprecated; histplot with kde=True is its replacement.
sns.histplot(y_train, kde=True, stat="density"); plt.show()

#### 수치형/범주형 피처 분리 & 학습/평가 데이터 분할

In [None]:
# Columns handled by the numeric and the categorical preprocessing branches.
numeric_features = ['대학성적']
categorical_features = ['직종','세부직종','직무태그','근무경력','근무형태','근무지역','출신대학','대학전공','어학시험','자격증']

# Mind the column order: numeric features first, then categorical ones.
feature_columns = [*numeric_features, *categorical_features]
X_train = X_train[feature_columns]
X_test = X_test[feature_columns]

####  파이프라인 구축

In [None]:
def remove_outlier(X, q=0.05):
    """Clip every column of X to its own [q, 1-q] quantile range.

    Despite the name, no rows are removed: values below the q-quantile or
    above the (1-q)-quantile of a column are replaced by that quantile.
    The quantiles are computed from the data passed in on each call.

    Parameters
    ----------
    X : array-like accepted by ``pd.DataFrame``
        Input data; each column is processed independently.
    q : float, default 0.05
        Lower quantile; the upper quantile is ``1 - q``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as X holding the clipped values.
    """
    frame = pd.DataFrame(X)

    def _clip_column(column):
        lower_bound = column.quantile(q)
        upper_bound = column.quantile(1 - q)
        return column.clip(lower_bound, upper_bound)

    clipped = frame.apply(_clip_column, axis=0)
    return clipped.values

# Numeric branch: impute missing values, clip extreme values, then power-transform.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # FunctionTransformer wraps a plain function as a transformer, giving us
        # a preprocessing step that scikit-learn does not ship.
        ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.05})),
        ("scaler", PowerTransformer()),
    ]
)

# Categorical branch: impute with the most frequent value, then encode categories
# as integers (categories unseen during fit are encoded as -1).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)),
    ]
)

# Route each group of columns to its own branch.
column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer), 
        # The target is continuous, so features are scored with f_regression;
        # SelectPercentile's default score function (f_classif) is intended for
        # classification targets only.
        ("selector", SelectPercentile(score_func=f_regression, percentile=100)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        # XGBoost regressor: the hyperparameter search space in this notebook
        # (reg_alpha, reg_lambda, subsample, colsample_bytree, gamma, max_depth,
        # min_child_weight, ...) consists of XGBoost parameters.
        # The step keeps the name "classifier" because the search space
        # addresses it as "classifier__<param>".
        ("classifier", XGBRegressor(random_state=42)),
    ]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

#### XGB Baseline 성능 확인

In [None]:
# 5-fold cross-validation of the pipeline; scores are negated MSE values.
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

# Convert each fold's negated MSE to RMSE so the spread is reported in the same
# unit as the scores (taking sqrt of the std of the MSE values is not an RMSE spread).
rmse_scores = np.sqrt(-1*scores)

print("Default XGB CV scores: ", rmse_scores)
print("Default XGB CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % rmse_scores.std())

#### `파이프라인+Optuna`를 통한 XGB 하이퍼파라미터 최적화

In [None]:
%%time

param_distributions = {
    "preprocessor__column__num__imputer__strategy": CategoricalDistribution(["mean","median"]),
    "preprocessor__column__num__outlier__kw_args": CategoricalDistribution([{'q':0.01},{'q':0.05},{'q':0.1}]),
    "preprocessor__selector__percentile": IntDistribution(50,100,step=10),
    "classifier__reg_alpha": FloatDistribution(1e-8,1.0,log=True),
    "classifier__reg_lambda": FloatDistribution(1e-8,1.0,log=True),
    "classifier__subsample": FloatDistribution(0.2,1.0),
    "classifier__colsample_bytree": FloatDistribution(0.2,1.0),
    "classifier__learning_rate": FloatDistribution(1e-8,1.0,log=True),  
    "classifier__gamma": FloatDistribution(1e-8,1.0,log=True),      
    "classifier__max_depth": IntDistribution(3,9,step=2),
    "classifier__min_child_weight": IntDistribution(2,10),    
}

optuna.logging.set_verbosity(optuna.logging.WARNING)
optuna_search = OptunaSearchCV(model, param_distributions, cv=5, 
                               scoring='neg_mean_squared_error', n_trials=20,
                               study=optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction='maximize'))
optuna_search.fit(X_train, y_train)

In [None]:
# Report the best configuration and its CV score converted from negated MSE to RMSE.
best_params = optuna_search.best_params_
best_rmse = np.sqrt(-1*optuna_search.best_score_)

print(f"\nBest params: {best_params}")
print(f"\nBest score: {best_rmse:.2f}")

#### Submission 생성

In [None]:
# Re-configure the pipeline with the best hyperparameters found by the search.
model.set_params(**optuna_search.best_params_)

# Fit one pipeline per CV fold (return_estimator=True keeps the fitted fold
# models), then average the fold models' predictions on the test set.
models = cross_validate(model, 
                        X_train, y_train, 
                        cv=5, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

# Fold scores are negated MSE values; convert each fold to RMSE so the spread is
# reported in RMSE units (sqrt of the std of the MSE values is not an RMSE spread).
scores = models['test_score']
rmse_scores = np.sqrt(-1*scores)
print("\nTuned XGB CV scores: ", rmse_scores)
print("Tuned XGB CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % rmse_scores.std())

In [None]:
# Create the submission file.
# The filename carries the model family, the notebook version and the CV RMSE;
# the prefix is "xgb" because this notebook builds an XGBoost model.
filename = f'xgb_{XGB_VERSION}_{np.sqrt(-1*scores.mean()):.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)

<font color="#CC3D3D"><p>
# End