<font color="#CC3D3D"><p>
# [Competition] Building a `LightGBM` Model with `Pipeline+Optuna`

<font color="blue"><p>
#### LGBM 모형 구축절차
1. 수치형 피처
 - 결측값처리: SimpleImputer(strategy=`???`)
 - 이상값처리: FunctionTransformer((remove_outlier, kw_args={'q':`???`})))
 - 스케일링: PowerTransformer()
2. 범주형 피처
 - 결측값처리: SimpleImputer(strategy="most_frequent")
 - 인코딩: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)
3. 공통
 - Feature Selection: SelectPercentile(percentile=`???`)
 - Modeling: LGBMRegressor(`???`)
 - Hyperparametor Optimization: `LightGBMTunerCV`
 - OOF Prediction   

In [1]:
LGBM_VERSION = 1.0   # submission 화일명에 사용

In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, log_loss, get_scorer_names
from sklearn import set_config
from lightgbm import LGBMRegressor
import lightgbm as lgb
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV
from optuna.integration.lightgbm import LightGBMTunerCV
import re
import statsmodels.api as sm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD


In [3]:
CATBOOST_VERSION = 3.0
NFOLDS = 5
SEED = 0
NCOMP = 50
P = 0.05

In [4]:
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.ID
X_test = X_test.drop(columns='ID')

In [5]:
## 어학시험 결측치 처리
X_train['어학시험'] = X_train['어학시험'].fillna('없음')
X_train = X_train.replace({'어학시험' :' '},'없음') 

X_test['어학시험'] = X_test['어학시험'].fillna('없음')
X_test = X_test.replace({'어학시험' :' '},'없음') 

In [6]:
for i , j in  zip(X_train['대학전공'].str.contains('호텔'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'호텔',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('관광'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'관광',inplace=True)   
        
for i , j in  zip(X_train['대학전공'].str.contains('체육'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('무역'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'무역',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('국제통상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'통계',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('조리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'조리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경영'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경영',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('디자인'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'디자인',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('컴퓨터'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'컴퓨터',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('교육'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'교육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('스포츠'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중국어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중국'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중국학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('광고'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'광고',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('미술'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'미술',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('정치'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정치',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전기'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전기',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('국문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국문',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('의류'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('패션'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('화학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'화학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('생명'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'생명',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('물리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'물리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('사학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'사학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('행정'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'행정',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('산업'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'산업공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('심리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'심리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('법'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'법학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('조형'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'조형',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('수학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'수학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('문헌'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'문헌정보',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('회계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'회계',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('정보'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전자'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전자과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('독어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'독어독문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('일어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('불어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'불어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('토목'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'토목',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('건축'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'건축',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전산'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전산학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('기계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'기계공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('복지'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'복지',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('사회'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'사회학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('보험'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'보험',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('금융'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'금융',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('환경'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'환경',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('금속'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'금속공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('도시'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'도시',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('미디어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'멀티미디어',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('제어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'제어계측',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신소재'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신소재',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신문방송',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통신'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('식품'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('생물'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'생물학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('유전'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'유전공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('가정'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'가정',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('소프트'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'컴퓨터',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('지리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'지리학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('지리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'지리학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('섬유'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'섬유공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('의상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의상학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('보건'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'보건관리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('자원'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'자원공학',inplace=True)
                                
for i , j in  zip(X_train['대학전공'].str.contains('국제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('공예'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'공예',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('일본'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('메카'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'메카트로닉스',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('간호'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'간호학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영양'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('분자'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'고분자',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('아동'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'아동학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('무용'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'무용',inplace=True)



In [7]:
for i , j in  zip(X_test['대학전공'].str.contains('호텔'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'호텔',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('관광'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'관광',inplace=True)   
        
for i , j in  zip(X_test['대학전공'].str.contains('체육'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('무역'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'무역',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('국제통상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'통계',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('조리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'조리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경영'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경영',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('디자인'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'디자인',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경제',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('컴퓨터'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'컴퓨터',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('교육'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'교육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('스포츠'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중국어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중국'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중국학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('광고'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'광고',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('미술'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'미술',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('정치'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정치',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전기'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전기',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('국문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국문',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('의류'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('패션'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('화학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'화학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('생명'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'생명',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('물리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'물리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('사학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'사학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('행정'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'행정',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('산업'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'산업공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('심리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'심리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('법'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'법학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('조형'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'조형',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('수학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'수학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('문헌'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'문헌정보',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('회계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'회계',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('정보'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전자'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전자과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('독어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'독어독문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('일어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('불어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'불어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('토목'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'토목',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('건축'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'건축',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전산'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전산학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('기계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'기계공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('복지'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'복지',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('사회'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'사회학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('보험'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'보험',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('금융'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'금융',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('환경'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'환경',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('금속'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'금속공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('도시'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'도시',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('미디어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'멀티미디어',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('제어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'제어계측',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신소재'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신소재',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신문방송',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통신'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('식품'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('생물'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'생물학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('유전'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'유전공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('가정'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'가정',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('소프트'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'컴퓨터',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('지리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'지리학',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('섬유'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'섬유공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('의상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의상학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('보건'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'보건관리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('자원'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'자원공학',inplace=True)
                                
for i , j in  zip(X_test['대학전공'].str.contains('국제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('공예'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'공예',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('일본'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('메카'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'메카트로닉스',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('간호'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'간호학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영양'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('분자'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'고분자',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('아동'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'아동학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('무용'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'무용',inplace=True)


In [8]:
df=pd.concat([X_train, X_test])

In [9]:
df['근무형태'] = df['근무형태'].fillna('신입')

df['근무형태']=df['근무형태'].replace(
{
'정규직, 계약직, ':'정규직, 계약직',
'정규직, ':'정규직',
'정규직, 계약직, 해외취업, ':'정규직, 계약직, 해외취업',
'정규직, 계약직, 파견직, ':'정규직, 계약직, 파견직',
'정규직, 계약직, 해외취업, 파견직, ':'정규직, 계약직, 해외취업, 파견직',
'계약직, ':'계약직',
'정규직, 계약직, 해외취업, 인턴, ':'정규직, 계약직, 해외취업, 인턴',
'정규직, 해외취업, ':'정규직, 해외취업',
'정규직, 계약직, 인턴, 파견직, ':'정규직, 계약직, 인턴, 파견직',
'정규직, 계약직, 인턴, ':'정규직, 계약직, 인턴',
'정규직, 파견직, ':'정규직, 파견직',
'인턴, ':'인턴',
'정규직, 계약직, 해외취업, 인턴, 파견직, ':'정규직, 계약직, 해외취업, 인턴, 파견직',
'정규직, 해외취업, 파견직, ':'정규직, 해외취업, 파견직',
'계약직, 파견직, ':'계약직, 파견직',
'계약직, 인턴, ':'계약직, 인턴',
'정규직, 인턴, ':'정규직, 인턴',
'계약직, 해외취업, 파견직, ':'계약직, 해외취업, 파견직',
'정규직, 해외취업, 인턴, ':'정규직, 해외취업, 인턴',
'정규직, 계약직, 해외취업, 병역특례, 인턴, 파견직, ':'정규직, 계약직, 해외취업, 병역특례, 인턴, 파견직',
})

In [10]:
df['근무경력'].str.split(" ").str[0]
df['근무경력'].str.split(" ").str[-1]

df['연']=df['근무경력'].str.split(" ").str[0]
df['개월']=df['근무경력'].str.split(" ").str[-1]
df1=pd.concat([df.연,df.개월],axis=1)

df1['연'].loc[df1['연'] == '0개월'] = '0년'
df1['연'].loc[df1['연'] == '1개월'] = '0년'
df1['연'].loc[df1['연'] == '2개월'] = '0년'
df1['연'].loc[df1['연'] == '3개월'] = '0년'
df1['연'].loc[df1['연'] == '4개월'] = '0년'
df1['연'].loc[df1['연'] == '5개월'] = '0년'
df1['연'].loc[df1['연'] == '6개월'] = '0년'
df1['연'].loc[df1['연'] == '7개월'] = '0년'
df1['연'].loc[df1['연'] == '8개월'] = '0년'
df1['연'].loc[df1['연'] == '9개월'] = '0년'
df1['연'].loc[df1['연'] == '10개월'] = '0년'
df1['연'].loc[df1['연'] == '11개월'] = '0년'

df1['A'] = df1['연'].str.rstrip("년")
df1['B'] = df1['개월'].str.rstrip("개월")
df1['A'].unique()

df1['연차를 개월로 환산'] = df1['A'].astype(dtype='int64')*12
df1['B'] = df1['B'].astype(dtype='int64')*1
df1['개월수'] = df1['연차를 개월로 환산']+df1['B']
df['근무경력'] = df1['개월수']
df['근무경력'].value_counts()

df=df.drop(['연'],axis=1)
df=df.drop(['개월'],axis=1)

In [11]:

df.loc[(df['근무경력'] != 0) & (df['근무형태'] == '신입'), '근무형태'] = '정규직'
df.loc[(df['근무경력'] == 0) & (df['근무형태'] != '신입'), '근무형태'] = '신입'

In [12]:
X_train = df[:16570]
X_test = df[16570:]

In [13]:
X_train

Unnamed: 0,직종,세부직종,직무태그,근무경력,근무형태,근무지역,출신대학,대학전공,어학시험,자격증,대학성적
0,문화·예술·신문·방송,영상·음향·사진·카메라,"취재기자, 편집 기사, 유아 사이트 기획, 시나리오 작성",0,신입,"서울,경기,서울",성균관대학교,국문,없음,無,70.0
1,경영·기획·회계·사무,사무·총무·법무,,35,정규직,"부산,서울,일본",신라대학교,관광,JLPT,無,
2,IT·게임,하드웨어설계·개발·관리,"하드웨어 , 무선통신 , MICOM , ASM , RF , CDMA , Firm W...",0,신입,"서울,경기,",수원대학교,정보통신,없음,有,60.0
3,영업·판매·TM,기술영업,반도체,12,정규직,"경기,서울,충북",수원대학교,정보통신,없음,有,70.0
4,기술·과학·산업,기타 기술·과학·산업,기술직,0,신입,"전국,전국,전국",한밭대학교,화학,없음,無,70.0
...,...,...,...,...,...,...,...,...,...,...,...
16565,전문·교육·자격,금융·증권·투자,"고객지원,증권,생명,금융",0,신입,"전북,서울,경기",전주대학교,컴퓨터,없음,有,80.0
16566,경영·기획·회계·사무,경리·회계,"회계,세무,자금, 결산,급여자산관리,",0,신입,"서울,경기,",성균관대학교,회계,없음,有,70.0
16567,경영·기획·회계·사무,사무·총무·법무,"사무,회계,인사,관리,무역",0,신입,"서울,경기,인천",수원대학교,무역,TOEIC,有,70.0
16568,통신·모바일,기타 통신·모바일,"리서치 , 통계 , 조사분석 , EXCEL , SAS , SPSS",0,신입,"서울,서울,서울",성신여자대학교,통계,없음,無,70.0


In [14]:
numeric_features = ['근무경력','대학성적']
categorical_features = ['직종','세부직종','출신대학','대학전공','어학시험','자격증']
binary_features = ['직무태그','근무지역','근무형태']

X_train = X_train[numeric_features+categorical_features+binary_features]  # 순서 주의!!!
X_test = X_test[numeric_features+categorical_features+binary_features]

# CatBoost의 cat_features 파라미터에 지정할 범주형 피처 위치
cat_index = [list(X_train.columns).index(c) for c in categorical_features]

In [15]:
# 상하한값 제한을 통한 결측값 처리 함수: FunctionTransformer를 통해 호출
def remove_outlier(X, q=0.05):  
    df = pd.DataFrame(X)
    return df.apply(lambda x: x.clip(x.quantile(q), x.quantile(1-q)), axis=0).values

# 회귀분석의 계수검정을 이용한 피처선택 전처리기 클래스
class MyFeatureSelector(TransformerMixin, BaseEstimator):
    # 전처리기 생성 즉, MyFeatureSelector() 호출시 실행
    def __init__(self, p=0.01):
        self.p = p

    # 전처리기의 fit() 호출시 실행
    def fit(self, X, y=None):
        X = sm.add_constant(X)
        results = sm.OLS(y, X).fit()
        self.cols = list(results.pvalues[1:] <= self.p)
        return self
    
    # 전처리기의 transform() 호출시 실행
    def transform(self, X):
        return X[:,self.cols].astype(np.int64)        
    
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.02})),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999, dtype=np.object)),
    ]
)

binary_transformer = Pipeline(
    steps=[
        ("impuer", FunctionTransformer(lambda x: x.fillna('없음'))),      
        ("corpus", FunctionTransformer(lambda x: x.str.replace('·',',').str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
        ("dense", FunctionTransformer(lambda x: x.toarray().astype(int), accept_sparse=True)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin1", make_pipeline(binary_transformer, TruncatedSVD(n_components=NCOMP,random_state=SEED)), binary_features[0]),
        ("bin2", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[1]),
        ("bin3", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[2]),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer), 
    ]
)

set_config(display="diagram")
preprocessor

In [18]:
# 전처리 파이프라인만 수행
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [19]:
tuner = LightGBMTunerCV(
    params={
        "objective": "regression",   # 지도학습 유형(regression/binary/multiclass)
        "metric": "rmse",            # rmse 제공해준다
        "verbosity": -1,             # 진행과정 출력안함
        "boosting_type": "gbdt",     # 실행하고자 하는 알고리즘 유형(gdbt/rf/dart/goss)
        "seed": 0,
    },
    train_set=lgb.Dataset(X_train, y_train), # LGBM 데이터셋으로 변환
    nfold=5,
    num_boost_round=200,                     # boosting 반복횟수
    callbacks=[lgb.early_stopping(100)],     # 모든 정지 라운드 동안 validation 성능이 개선되지 않으면 조기종료
    time_budget=60,                          # 튜닝 허용 최대시간(초단위)
    optuna_seed=0,
)

tuner.run()

[32m[I 2022-11-23 12:33:16,444][0m A new study created in memory with name: no-name-f3731360-de70-4ebc-b7b4-d9e9ec37d5cf[0m
feature_fraction, val_score: inf:   0%|                                                          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 848.441725:  14%|######1                                    | 1/7 [00:01<00:06,  1.14s/it][32m[I 2022-11-23 12:33:17,583][0m Trial 0 finished with value: 848.4417252117815 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 848.4417252117815.[0m
feature_fraction, val_score: 848.441725:  14%|######1                                    | 1/7 [00:01<00:06,  1.14s/it]

Did not meet early stopping. Best iteration is:
[191]	cv_agg's rmse: 848.442 + 4.90194


feature_fraction, val_score: 847.823486:  29%|############2                              | 2/7 [00:02<00:05,  1.13s/it][32m[I 2022-11-23 12:33:18,701][0m Trial 1 finished with value: 847.8234858535336 and parameters: {'feature_fraction': 0.5}. Best is trial 1 with value: 847.8234858535336.[0m
feature_fraction, val_score: 847.823486:  29%|############2                              | 2/7 [00:02<00:05,  1.13s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.823 + 2.41695


feature_fraction, val_score: 847.609614:  43%|##################4                        | 3/7 [00:03<00:04,  1.16s/it][32m[I 2022-11-23 12:33:19,903][0m Trial 2 finished with value: 847.6096140085647 and parameters: {'feature_fraction': 0.7}. Best is trial 2 with value: 847.6096140085647.[0m
feature_fraction, val_score: 847.609614:  43%|##################4                        | 3/7 [00:03<00:04,  1.16s/it]

Did not meet early stopping. Best iteration is:
[171]	cv_agg's rmse: 847.61 + 3.42153


feature_fraction, val_score: 847.176145:  57%|########################5                  | 4/7 [00:04<00:03,  1.11s/it][32m[I 2022-11-23 12:33:20,934][0m Trial 3 finished with value: 847.1761447521519 and parameters: {'feature_fraction': 0.4}. Best is trial 3 with value: 847.1761447521519.[0m
feature_fraction, val_score: 847.176145:  57%|########################5                  | 4/7 [00:04<00:03,  1.11s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


feature_fraction, val_score: 847.176145:  71%|##############################7            | 5/7 [00:05<00:02,  1.18s/it][32m[I 2022-11-23 12:33:22,240][0m Trial 4 finished with value: 849.5925878279622 and parameters: {'feature_fraction': 0.8}. Best is trial 3 with value: 847.1761447521519.[0m
feature_fraction, val_score: 847.176145:  71%|##############################7            | 5/7 [00:05<00:02,  1.18s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


feature_fraction, val_score: 847.176145:  86%|####################################8      | 6/7 [00:07<00:01,  1.24s/it][32m[I 2022-11-23 12:33:23,588][0m Trial 5 finished with value: 851.7195136641305 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 3 with value: 847.1761447521519.[0m
feature_fraction, val_score: 847.176145:  86%|####################################8      | 6/7 [00:07<00:01,  1.24s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


feature_fraction, val_score: 847.176145: 100%|###########################################| 7/7 [00:08<00:00,  1.28s/it][32m[I 2022-11-23 12:33:24,964][0m Trial 6 finished with value: 851.9804286708537 and parameters: {'feature_fraction': 1.0}. Best is trial 3 with value: 847.1761447521519.[0m
feature_fraction, val_score: 847.176145: 100%|###########################################| 7/7 [00:08<00:00,  1.22s/it]


Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:   5%|##4                                             | 1/20 [00:03<01:00,  3.16s/it][32m[I 2022-11-23 12:33:28,132][0m Trial 7 finished with value: 856.6928762596519 and parameters: {'num_leaves': 141}. Best is trial 7 with value: 856.6928762596519.[0m
num_leaves, val_score: 847.176145:   5%|##4                                             | 1/20 [00:03<01:00,  3.16s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  10%|####8                                           | 2/20 [00:06<01:04,  3.56s/it][32m[I 2022-11-23 12:33:31,967][0m Trial 8 finished with value: 860.5853589688926 and parameters: {'num_leaves': 184}. Best is trial 7 with value: 856.6928762596519.[0m
num_leaves, val_score: 847.176145:  10%|####8                                           | 2/20 [00:07<01:04,  3.56s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  15%|#######2                                        | 3/20 [00:10<00:58,  3.44s/it][32m[I 2022-11-23 12:33:35,270][0m Trial 9 finished with value: 858.5725251269607 and parameters: {'num_leaves': 155}. Best is trial 7 with value: 856.6928762596519.[0m
num_leaves, val_score: 847.176145:  15%|#######2                                        | 3/20 [00:10<00:58,  3.44s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  20%|#########6                                      | 4/20 [00:13<00:55,  3.48s/it][32m[I 2022-11-23 12:33:38,814][0m Trial 10 finished with value: 859.4253125087296 and parameters: {'num_leaves': 140}. Best is trial 7 with value: 856.6928762596519.[0m
num_leaves, val_score: 847.176145:  20%|#########6                                      | 4/20 [00:13<00:55,  3.48s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  25%|############                                    | 5/20 [00:16<00:47,  3.18s/it][32m[I 2022-11-23 12:33:41,456][0m Trial 11 finished with value: 853.0985404912706 and parameters: {'num_leaves': 110}. Best is trial 11 with value: 853.0985404912706.[0m
num_leaves, val_score: 847.176145:  25%|############                                    | 5/20 [00:16<00:47,  3.18s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  30%|##############4                                 | 6/20 [00:20<00:47,  3.40s/it][32m[I 2022-11-23 12:33:45,288][0m Trial 12 finished with value: 859.9941084248394 and parameters: {'num_leaves': 166}. Best is trial 11 with value: 853.0985404912706.[0m
num_leaves, val_score: 847.176145:  30%|##############4                                 | 6/20 [00:20<00:47,  3.40s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  35%|################7                               | 7/20 [00:22<00:40,  3.15s/it][32m[I 2022-11-23 12:33:47,911][0m Trial 13 finished with value: 853.5789407337122 and parameters: {'num_leaves': 113}. Best is trial 11 with value: 853.0985404912706.[0m
num_leaves, val_score: 847.176145:  35%|################7                               | 7/20 [00:22<00:40,  3.15s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  40%|###################2                            | 8/20 [00:27<00:43,  3.61s/it][32m[I 2022-11-23 12:33:52,507][0m Trial 14 finished with value: 865.6910895459262 and parameters: {'num_leaves': 229}. Best is trial 11 with value: 853.0985404912706.[0m
num_leaves, val_score: 847.176145:  40%|###################2                            | 8/20 [00:27<00:43,  3.61s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  45%|#####################6                          | 9/20 [00:32<00:44,  4.02s/it][32m[I 2022-11-23 12:33:57,417][0m Trial 15 finished with value: 864.8322490317939 and parameters: {'num_leaves': 247}. Best is trial 11 with value: 853.0985404912706.[0m
num_leaves, val_score: 847.176145:  45%|#####################6                          | 9/20 [00:32<00:44,  4.02s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  50%|#######################5                       | 10/20 [00:34<00:34,  3.48s/it][32m[I 2022-11-23 12:33:59,686][0m Trial 16 finished with value: 851.9917753163882 and parameters: {'num_leaves': 99}. Best is trial 16 with value: 851.9917753163882.[0m
num_leaves, val_score: 847.176145:  50%|#######################5                       | 10/20 [00:34<00:34,  3.48s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  55%|#########################8                     | 11/20 [00:35<00:24,  2.72s/it][32m[I 2022-11-23 12:34:00,697][0m Trial 17 finished with value: 848.39735016061 and parameters: {'num_leaves': 33}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  55%|#########################8                     | 11/20 [00:35<00:24,  2.72s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  60%|############################2                  | 12/20 [00:36<00:17,  2.20s/it][32m[I 2022-11-23 12:34:01,687][0m Trial 18 finished with value: 849.2339747831144 and parameters: {'num_leaves': 32}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  60%|############################2                  | 12/20 [00:36<00:17,  2.20s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  65%|##############################5                | 13/20 [00:37<00:12,  1.77s/it][32m[I 2022-11-23 12:34:02,485][0m Trial 19 finished with value: 849.9252074906066 and parameters: {'num_leaves': 22}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  65%|##############################5                | 13/20 [00:37<00:12,  1.77s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  70%|################################9              | 14/20 [00:37<00:08,  1.37s/it][32m[I 2022-11-23 12:34:02,925][0m Trial 20 finished with value: 861.304019800297 and parameters: {'num_leaves': 7}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  70%|################################9              | 14/20 [00:37<00:08,  1.37s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  75%|###################################2           | 15/20 [00:39<00:07,  1.43s/it][32m[I 2022-11-23 12:34:04,509][0m Trial 21 finished with value: 849.252707434076 and parameters: {'num_leaves': 56}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  75%|###################################2           | 15/20 [00:39<00:07,  1.43s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  80%|#####################################6         | 16/20 [00:41<00:05,  1.47s/it][32m[I 2022-11-23 12:34:06,047][0m Trial 22 finished with value: 849.252707434076 and parameters: {'num_leaves': 56}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  80%|#####################################6         | 16/20 [00:41<00:05,  1.47s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.176145:  85%|#######################################9       | 17/20 [00:42<00:04,  1.50s/it][32m[I 2022-11-23 12:34:07,632][0m Trial 23 finished with value: 849.8667505173711 and parameters: {'num_leaves': 59}. Best is trial 17 with value: 848.39735016061.[0m
num_leaves, val_score: 847.176145:  85%|#######################################9       | 17/20 [00:42<00:04,  1.50s/it]

Did not meet early stopping. Best iteration is:
[200]	cv_agg's rmse: 847.176 + 4.27266


num_leaves, val_score: 847.053860:  90%|##########################################3    | 18/20 [00:43<00:02,  1.35s/it][32m[I 2022-11-23 12:34:08,635][0m Trial 24 finished with value: 847.0538598618143 and parameters: {'num_leaves': 30}. Best is trial 24 with value: 847.0538598618143.[0m
num_leaves, val_score: 847.053860:  90%|##########################################3    | 18/20 [00:43<00:02,  1.35s/it]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


num_leaves, val_score: 847.053860:  95%|############################################6  | 19/20 [00:45<00:01,  1.58s/it][32m[I 2022-11-23 12:34:10,739][0m Trial 25 finished with value: 848.6629712155334 and parameters: {'num_leaves': 74}. Best is trial 24 with value: 847.0538598618143.[0m
num_leaves, val_score: 847.053860:  95%|############################################6  | 19/20 [00:45<00:01,  1.58s/it]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


num_leaves, val_score: 847.053860: 100%|###############################################| 20/20 [00:47<00:00,  1.77s/it][32m[I 2022-11-23 12:34:12,966][0m Trial 26 finished with value: 850.6838959839666 and parameters: {'num_leaves': 88}. Best is trial 24 with value: 847.0538598618143.[0m
num_leaves, val_score: 847.053860: 100%|###############################################| 20/20 [00:48<00:00,  2.40s/it]


Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


bagging, val_score: 847.053860:  10%|#####1                                             | 1/10 [00:01<00:09,  1.07s/it][32m[I 2022-11-23 12:34:14,037][0m Trial 27 finished with value: 852.3559602842195 and parameters: {'bagging_fraction': 0.7292881023569437, 'bagging_freq': 6}. Best is trial 27 with value: 852.3559602842195.[0m
bagging, val_score: 847.053860:  10%|#####1                                             | 1/10 [00:01<00:09,  1.07s/it]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


bagging, val_score: 847.053860:  20%|##########2                                        | 2/10 [00:02<00:09,  1.14s/it][32m[I 2022-11-23 12:34:15,218][0m Trial 28 finished with value: 851.1333831663399 and parameters: {'bagging_fraction': 0.7616580256435892, 'bagging_freq': 4}. Best is trial 28 with value: 851.1333831663399.[0m
bagging, val_score: 847.053860:  20%|##########2                                        | 2/10 [00:02<00:09,  1.14s/it]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


bagging, val_score: 847.053860:  30%|###############3                                   | 3/10 [00:03<00:07,  1.13s/it][32m[I 2022-11-23 12:34:16,348][0m Trial 29 finished with value: 855.7451814196971 and parameters: {'bagging_fraction': 0.6541928796037666, 'bagging_freq': 5}. Best is trial 28 with value: 851.1333831663399.[0m
bagging, val_score: 847.053860:  30%|###############3                                   | 3/10 [00:03<00:07,  1.13s/it]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


bagging, val_score: 847.053860:  40%|####################4                              | 4/10 [00:04<00:06,  1.16s/it][32m[I 2022-11-23 12:34:17,560][0m Trial 30 finished with value: 855.855188957493 and parameters: {'bagging_fraction': 0.6625523267580531, 'bagging_freq': 7}. Best is trial 28 with value: 851.1333831663399.[0m
bagging, val_score: 847.053860:  40%|####################4                              | 4/10 [00:04<00:06,  1.15s/it]


Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 847.054 + 3.51374


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]


In [20]:
print(f"\nBest params: {tuner.best_params}")
print(f"\nBest score: {tuner.best_score:.2f}")


Best params: {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'boosting_type': 'gbdt', 'seed': 0, 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 30, 'feature_fraction': 0.4, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20}

Best score: 847.05


In [21]:
# 최적화된 하이퍼파라미터로 OOF를 수행하여 최종 LGBM 모형 생성
models = cross_validate(LGBMRegressor(**tuner.best_params), # 최적화된 hyperparameter 사용
                        X_train, y_train, 
                        cv=5, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nTuned LGBM CV scores: ", np.sqrt(-1*scores))
print("Tuned LGBM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


Tuned LGBM CV scores:  [787.74532699 788.18545732 885.14880363 914.04106146 981.91148023]
Tuned LGBM CV mean = 874.63 with std = 362.67


In [22]:
# submission 화일 생성
filename = f'lgbm_{LGBM_VERSION}_{np.sqrt(-1*scores.mean()):.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)