In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.model_selection import train_test_split, KFold, cross_validate, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from catboost import CatBoostRegressor
import re
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import category_encoders as ce
from lightgbm import LGBMRegressor
import pickle

- Set Global Hyperparameters

In [2]:
# Version tag embedded in the submission file name.
CATBOOST_VERSION = 3.0
# Intended number of CV folds.
# NOTE(review): not referenced in the visible cells; cross_validate is called with cv=5 — confirm which is intended.
NFOLDS = 10
# Random seed passed as random_state to TruncatedSVD and the LGBMRegressor.
SEED = 42
# n_components used for the TruncatedSVD steps of the bag-of-words pipelines.
NCOMP = 550
# p-value threshold passed to MyFeatureSelector (columns with p-value <= P are kept).
P = 0.05
# Quantile used by remove_outlier: values are clipped to the [Q, 1-Q] quantile range.
Q = 0.05

- Load the feature-engineered data

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# a pickle file that was produced by a trusted source.
with open('feature_engineered_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Target values for the training rows.
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

# The pickled table stacks the training rows first and the test rows after them,
# so the split point is the number of training labels rather than a hard-coded
# row count.
n_train = len(y_train)
assert len(loaded_data) > n_train, "feature table must contain test rows after the training rows"

X_train = loaded_data[:n_train]
X_test = loaded_data[n_train:]

- Classify features according to their form [numeric, categorical(One-Hot Encoding & Target Encoding), binary]

# Variant of the feature grouping in which '대학전공' is one-hot encoded.
numeric_features = ['근무경력','대학성적']
categorical_features_ohe = ['직종','세부직종','출신대학','어학시험','자격증', '대학전공']
# categorical_features_target = ['대학전공']
binary_features = ['직무태그','근무지역','근무형태']

# The target-encoded group is disabled above, so it must not be referenced here
# (doing so raises NameError).
X_train = X_train[numeric_features + categorical_features_ohe + binary_features]
X_test = X_test[numeric_features + categorical_features_ohe + binary_features]

In [4]:
# Column groups, one per preprocessing route in the ColumnTransformer.
numeric_features = ['근무경력', '대학성적']
categorical_features_ohe = ['직종', '세부직종', '출신대학', '어학시험', '자격증']
# Target-encoding route is currently disabled:
# categorical_features_target = ['대학전공']
binary_features = ['직무태그', '근무지역', '근무형태', '대학전공']

# Keep only the columns that are routed into the preprocessing pipeline.
selected_columns = numeric_features + categorical_features_ohe + binary_features
X_train = X_train[selected_columns]
X_test = X_test[selected_columns]



- Define the outlier clipper and the feature selector

In [5]:
# Caps each column at its lower/upper quantile; invoked through FunctionTransformer
def remove_outlier(X, q=Q):
    """Clip every column of X to its own [q, 1-q] quantile range.

    Parameters
    ----------
    X : array-like
        Data that can be wrapped in a ``pd.DataFrame``.
    q : float
        Lower quantile; the upper bound is the ``1 - q`` quantile.

    Returns
    -------
    numpy.ndarray
        The clipped values.
    """
    def _clip_column(column):
        # Bounds are computed from the column that is being clipped.
        lower, upper = column.quantile(q), column.quantile(1-q)
        return column.clip(lower, upper)

    frame = pd.DataFrame(X)
    clipped = frame.apply(_clip_column, axis=0)
    return clipped.values

# Feature-selection transformer based on the coefficient tests of an OLS regression
class MyFeatureSelector(TransformerMixin, BaseEstimator):
    """Keep the columns whose OLS coefficient p-value is at most ``p``.

    ``fit`` regresses ``y`` on ``X`` (plus an intercept) with statsmodels OLS and
    stores a boolean mask in ``self.cols``; ``transform`` applies that mask and
    casts the result to ``np.int64``.

    Parameters
    ----------
    p : float
        Significance threshold; a column is kept when its p-value is <= ``p``.
    """

    # Runs when the transformer is created, i.e. MyFeatureSelector(...)
    def __init__(self, p=P):
        self.p = p

    # Runs when the transformer's fit() is called
    def fit(self, X, y=None):
        """Fit the OLS model and record which columns are significant.

        Raises
        ------
        ValueError
            If ``y`` is not provided (the selection is supervised).
        """
        if y is None:
            raise ValueError("MyFeatureSelector.fit requires a target y for the OLS regression.")
        # has_constant='add' always prepends the intercept column. With the default
        # ('skip'), an input that already contains a constant column gets no
        # intercept, so dropping the first p-value below would discard a real
        # feature and leave the mask one element short.
        design = sm.add_constant(X, has_constant='add')
        results = sm.OLS(y, design).fit()
        # Drop the intercept's p-value; the rest line up with the columns of X.
        # A NaN p-value compares False and is therefore not selected.
        pvalues = np.asarray(results.pvalues)[1:]
        self.cols = list(pvalues <= self.p)
        return self

    # Runs when the transformer's transform() is called
    def transform(self, X):
        """Return the selected columns of the 2-D array ``X`` as ``np.int64``."""
        return X[:, self.cols].astype(np.int64)

- Build a pipeline for (1) any preprocessing that was missed in the previous task, (2) encoding, and (3) transforming

In [6]:
# Pipeline for numeric features: mean imputation, then quantile clipping
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("outlier", FunctionTransformer(remove_outlier, kw_args={'q': Q})),
    ]
)

# Pipeline for categorical features which need One-Hot encoding
categorical_transformer_ohe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Sparse output is the encoder's default, so the `sparse=True` keyword
        # (renamed to `sparse_output` in newer scikit-learn) is simply omitted;
        # this works on both old and new versions.
        ("encoder", OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Pipeline for categorical features which need Target encoding
# (currently not wired into the ColumnTransformer below)
categorical_transformer_target = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # NOTE(review): confirm that 'ignore' is an accepted handle_unknown value
        # for category_encoders.TargetEncoder before enabling this route.
        ("encoder", ce.TargetEncoder(smoothing = 1, handle_unknown='ignore')),
    ]
)


# Named functions (instead of lambdas) so the fitted pipeline can be pickled.
def fill_missing_text(x):
    """Replace missing entries of a text column with the placeholder '없음'."""
    return x.fillna('없음')


# Literal (old, new) substitutions, applied in this order.
_DELIMITER_REPLACEMENTS = [('·', ','), (' ,', ','), ('/', ','), (' , ', ','), (', ', ',')]


def normalize_delimiters(x):
    """Turn a delimiter-separated text column into space-separated tokens."""
    for old, new in _DELIMITER_REPLACEMENTS:
        # regex=False: the patterns are plain text, independent of the pandas default.
        x = x.str.replace(old, new, regex=False)
    return x.str.split(',').str.join(" ")


def to_dense_int(x):
    """Convert the sparse count matrix to a dense integer array."""
    return x.toarray().astype(int)


# Bag-of-words pipeline for the multi-valued text columns
binary_transformer = Pipeline(
    steps=[
        ("imputer", FunctionTransformer(fill_missing_text)),
        ("corpus", FunctionTransformer(normalize_delimiters)),
        ("BoW", CountVectorizer()),
        ("dense", FunctionTransformer(to_dense_int, accept_sparse=True)),
    ]
)

# Each text column is passed as a single column name (not a list), so the
# bag-of-words pipeline receives a 1-D column, followed by either an SVD
# reduction or the p-value based selector.
column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat1", categorical_transformer_ohe, categorical_features_ohe),
        # ("cat2", categorical_transformer_target, categorical_features_target),
        ("bin1", make_pipeline(binary_transformer, TruncatedSVD(n_components=NCOMP, random_state=SEED)), binary_features[0]),
        ("bin2", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[1]),
        ("bin3", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[2]),
        ("bin4", make_pipeline(binary_transformer, TruncatedSVD(n_components=NCOMP, random_state=SEED)), binary_features[3])
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer),
    ]
)

# Render estimators as an HTML diagram and display the assembled preprocessor.
set_config(display="diagram")
preprocessor

In [None]:
# TODO: work out how to add a StandardScaler step to the preprocessing pipeline.


In [7]:
# Fit the preprocessing pipeline on the training data and transform it; y_train
# is passed because MyFeatureSelector.fit regresses the target on the features.
# NOTE: X_train / X_test are rebound to the transformed output, so this cell
# must not be re-run without first re-running the cells that load the raw data.
X_train = preprocessor.fit_transform(X_train, y_train)
# Apply the already-fitted preprocessing to the test data.
X_test = preprocessor.transform(X_test)

In [8]:
# Wrap the transformed training matrix in a DataFrame and display it for inspection.
dfdf = pd.DataFrame(X_train)
dfdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1280,1281,1282,1283,1284,1285,1286,1287,1288,1289
0,0.0,70.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.942456e-05,-8.543333e-04,8.809073e-04,-6.595717e-04,-4.041801e-04,-3.750841e-04,-3.095286e-04,2.867381e-04,5.085425e-05,1.513794e-04
1,35.0,71.704069,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-4.748806e-16,-4.926615e-16,2.482823e-16,-3.616031e-15,-1.222546e-15,6.656622e-15,-6.461845e-16,6.647460e-15,-6.238499e-16,3.906597e-15
2,0.0,60.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7.320533e-16,-4.978656e-16,1.457168e-16,1.205633e-15,1.118897e-15,3.122502e-16,-3.354522e-16,6.062859e-16,-3.989864e-16,3.679348e-15
3,12.0,70.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-7.320533e-16,-4.978656e-16,1.457168e-16,1.205633e-15,1.118897e-15,3.122502e-16,-3.354522e-16,6.062859e-16,-3.989864e-16,3.679348e-15
4,0.0,70.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,5.502641e-06,-3.669785e-06,-5.342868e-06,3.859363e-06,3.846892e-06,4.784385e-06,-8.740311e-06,8.731335e-06,-1.028541e-05,-1.252458e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16565,0.0,80.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.246831e-06,-4.220478e-06,-9.006405e-07,2.029779e-06,6.464555e-06,5.995959e-06,-2.300393e-06,1.351326e-05,5.368373e-06,1.104960e-06
16566,0.0,70.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,9.072960e-06,9.960135e-07,-1.213717e-06,-1.346494e-06,-7.136991e-06,-4.803606e-06,-2.521519e-06,-1.835351e-07,-1.855522e-07,3.990689e-06
16567,0.0,70.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,6.591949e-17,2.775558e-17,2.775558e-17,2.775558e-17,-2.081668e-17,2.428613e-17,2.255141e-17,2.775558e-17,-2.428613e-17,-1.040834e-17
16568,0.0,70.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.733245e-12,1.046274e-12,-5.729948e-13,-5.889933e-13,-1.136392e-12,-2.992576e-12,3.387148e-12,1.759046e-12,-1.417596e-12,-6.588801e-12


In [9]:
%%time

model = LGBMRegressor(
    boosting_type='gbdt',  
    num_leaves=31,         
    learning_rate=0.05,    
    n_estimators=100,      
    objective='regression',
    metric='neg_mean_squared_error',
    random_state=SEED
)

models = cross_validate(model,
                        X_train, y_train, 
                        cv=5, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nCatBoost CV scores: ", np.sqrt(-1*scores))
print("CatBoost CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 280760
[LightGBM] [Info] Number of data points in the train set: 13256, number of used features: 1236
[LightGBM] [Info] Start training from score 2874.460622
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 280774
[LightGBM] [Info] Number of data points in the train set: 13256, number of used features: 1238
[LightGBM] [Info] Start training from score 2872.748189
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 280748
[LightGBM] [Info] Number of data points in the train set: 13256, number of used features: 1236
[LightGBM] [Info] Start training from score 2860.648008
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 280750
[LightGBM] [Info] Number of data points in the train set: 13256, number of used features: 1237
[LightGBM] [Info] Start training from score 2787.375528
You can set `force_c

# Create the submission file
# NOTE(review): `test_id` is not defined in the visible cells — confirm where it is loaded.
# NOTE(review): the file name says 'catboost' although the model above is an LGBMRegressor.
cv_rmse = np.sqrt(-1*scores.mean())
filename = f'catboost_{CATBOOST_VERSION}_{cv_rmse:.2f}.csv'
submission = pd.DataFrame({'ID':test_id, 'Salary':oof_pred})
submission.to_csv(filename, index=False)