In [119]:
!pip install pandas==1.4.2





In [46]:
LGBM_VERSION = 2.0   # submission 화일명에 사용
NFOLDS = 15
SEED = 0
TIMEOUT = 180

In [133]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, ShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import TransformedTargetRegressor
from sklearn.cluster import KMeans
from lightgbm import LGBMRegressor
import lightgbm as lgb
from category_encoders import CatBoostEncoder
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV
from optuna.integration.lightgbm import LightGBMTunerCV
from scipy.sparse import csr_matrix
import re

In [134]:
X_train = pd.read_csv('X_train_피처추가1124지협지0357.csv', encoding='cp949')#.drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949').Salary

X_test = pd.read_csv('X_test_피처추가1124지협지0357.csv', encoding='cp949')

X_test_1 = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test_1.ID

In [135]:
for df in [X_train, X_test]:
    numeric_features = df.dtypes[df.dtypes != "object"].index.tolist()

    categorical_features = df.dtypes[df.dtypes == "object"].index.tolist()


binary_features = ['직종','세부직종','근무지역']
pca_features = ['직무태그']
categorical_features = [x for x in categorical_features if x not in binary_features]
categorical_features = [x for x in categorical_features if x not in pca_features]
    

X_train = X_train[numeric_features+categorical_features+binary_features+pca_features]  # 순서 주의!!!
X_test = X_test[numeric_features+categorical_features+binary_features+pca_features]

In [136]:
def remove_outlier(X, q=0.05):  
    df = pd.DataFrame(X)
    return df.apply(lambda x: x.clip(x.quantile(q), x.quantile(1-q)), axis=0).values

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.05})), 
        ("scaler", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=int)),
    ]
)
    
binary_transformer = Pipeline(
    steps=[
        ("corpus", FunctionTransformer(lambda x: x.str.replace('·',',').str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
    ]
)

pca_transformer = Pipeline(
    steps=[
        ("impuer", FunctionTransformer(lambda x: x.fillna('없음'))),      
        ("corpus", FunctionTransformer(lambda x: x.str.replace('·',',').str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
        ("dense", FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
        ("pca", PCA(n_components=200)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin1", binary_transformer, binary_features[0]), 
        ("bin2", binary_transformer, binary_features[1]), 
        ("bin3", binary_transformer, binary_features[2]), 
        ("pca1", pca_transformer, pca_features[0]), 
   ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer), 
        ("selector", SelectPercentile(percentile=80)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", LGBMRegressor(random_state=0)),
    ]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

In [137]:
sscv = ShuffleSplit(test_size=.3334, n_splits=NFOLDS, random_state=SEED) # Public LB 사이즈와 동일하게 평가데이터 사이즈 설정 
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=sscv)

print("Default LGBM CV scores: ", np.sqrt(-1*scores))
print("Default LGBM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

Default LGBM CV scores:  [828.43910839 808.47241461 839.1477948  832.85869597 811.27329088
 812.32549201 832.4977696  848.63100021 807.509188   831.39438638
 794.51300938 817.83880383 834.96372962 811.6948762  834.38369015]
Default LGBM CV mean = 823.19 with std = 154.29


In [138]:
%%time

param_distributions = {
    "preprocessor__column__num__imputer__strategy": CategoricalDistribution(["mean","median"]),
    "preprocessor__column__num__outlier__kw_args": CategoricalDistribution([{'q':0.01},{'q':0.05},{'q':0.1}]),
    "preprocessor__column__pca1__pca__n_components": IntDistribution(100,500,step=100),  
    "preprocessor__selector__percentile": IntDistribution(50,100,step=10),  
}

optuna_search = OptunaSearchCV(model, 
                               param_distributions, 
                               cv=sscv, scoring='neg_mean_squared_error', 
                               n_trials=20,
                               timeout=TIMEOUT,   # 튜닝 허용 최대시간(초단위)
                               study=optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize"))

optuna.logging.set_verbosity(optuna.logging.WARNING)
optuna_search.fit(X_train, y_train)

print(f"Best params: {optuna_search.best_params_}")
print("Best score: %.2f" % (-1*optuna_search.best_score_)**0.5)

Best params: {'preprocessor__column__num__imputer__strategy': 'mean', 'preprocessor__column__num__outlier__kw_args': {'q': 0.05}, 'preprocessor__column__pca1__pca__n_components': 100, 'preprocessor__selector__percentile': 90}
Best score: 822.37
Wall time: 3min 58s


In [139]:
# 최적값으로 파이프라인 재설정
model.set_params(**optuna_search.best_params_)

# 전처리 파이프라인만 수행
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [140]:
# X = pd.DataFrame(X_train)
# Y = pd.DataFrame(y_train)
# Xt = pd.DataFrame(X_test)

In [141]:
# kmeans = KMeans(n_clusters=5, init='k-means++')
# kmeans.fit_transform(X,Y)
# kmeans.transform(Xt)

# pre_X5 = kmeans.predict(X)
# k_means_5 = pd.DataFrame(pre_X5)
# X['cluster5'] = k_means_5 

# pre_Xt5 = kmeans.predict(Xt)
# k_means_t5 = pd.DataFrame(pre_Xt5)
# Xt['cluster5'] = k_means_t5

In [142]:
# kmeans10 = KMeans(n_clusters=10, init='k-means++')
# kmeans10.fit_transform(X,Y)
# kmeans10.transform(Xt)

# pre_X10 = kmeans10.predict(X)
# k_means_10 = pd.DataFrame(pre_X10)
# X['cluster10'] = k_means_10 

# pre_Xt10 = kmeans10.predict(Xt)
# k_means_t10 = pd.DataFrame(pre_Xt10)
# Xt['cluster10'] = k_means_t10 

In [143]:
# kmeans15 = KMeans(n_clusters=15, init='k-means++')
# kmeans15.fit_transform(X,Y)
# kmeans15.transform(Xt)

# pre_X15 = kmeans15.predict(X)
# k_means_15 = pd.DataFrame(pre_X15)
# X['cluster15'] = k_means_15 

# pre_Xt15 = kmeans15.predict(Xt)
# k_means_t15 = pd.DataFrame(pre_Xt15)
# Xt['cluster15'] = k_means_t15

In [144]:
# X_train = np.array(X)
# X_test = np.array(Xt)
# y_train = np.array(Y)

In [145]:
tuner = LightGBMTunerCV(
    params={
        "objective": "regression",   # 지도학습 유형(regression/binary/multiclass)
        "metric": "rmse",
        "verbosity": -1,             # 진행과정 출력안함
        "boosting_type": "gbdt",     # 실행하고자 하는 알고리즘 유형(gbdt/rf/dart/goss)
        "seed": 0,
    },
    train_set=lgb.Dataset(X_train, y_train), # LGBM 데이터셋으로 변환
    nfold=15,
    num_boost_round=200,                     # boosting 반복횟수
    callbacks=[lgb.early_stopping(100)],     # 모든 정지 라운드 동안 validation 성능이 개선되지 않으면 조기종료
    time_budget=TIMEOUT,                     # 튜닝 허용 최대시간(초단위)
    optuna_seed=0,
)

tuner.run()

feature_fraction, val_score: inf:   0%|                                                          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 811.288815:  14%|######1                                    | 1/7 [00:39<03:54, 39.00s/it]

Did not meet early stopping. Best iteration is:
[174]	cv_agg's rmse: 811.289 + 21.6553


feature_fraction, val_score: 810.453648:  29%|############2                              | 2/7 [01:11<03:05, 37.11s/it]

Did not meet early stopping. Best iteration is:
[198]	cv_agg's rmse: 810.454 + 23.7698


feature_fraction, val_score: 810.453648:  43%|##################4                        | 3/7 [01:48<02:27, 36.99s/it]

Did not meet early stopping. Best iteration is:
[198]	cv_agg's rmse: 810.454 + 23.7698


feature_fraction, val_score: 809.930998:  57%|########################5                  | 4/7 [02:15<01:42, 34.03s/it]

Did not meet early stopping. Best iteration is:
[174]	cv_agg's rmse: 809.931 + 19.8721


feature_fraction, val_score: 808.031556:  71%|##############################7            | 5/7 [03:07<01:15, 37.56s/it]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
  0%|                                                                                            | 0/6 [00:00<?, ?it/s]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Did not meet early stopping. Best iteration is:
[157]	cv_agg's rmse: 808.032 + 22.9114





In [146]:
print(f"\nBest params: {tuner.best_params}")
print(f"\nBest score: {tuner.best_score:.2f}")


Best params: {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1, 'boosting_type': 'gbdt', 'seed': 0, 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 31, 'feature_fraction': 0.8999999999999999, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20}

Best score: 808.03


In [147]:
# 최적화된 하이퍼파라미터로 OOF를 수행하여 최종 LGBM 모형 생성
models = cross_validate(LGBMRegressor(**tuner.best_params,learning_rate = 0.01, num_iterations = 10000), # 최적화된 hyperparameter 사용
                        X_train, y_train, 
                        cv=sscv, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nTuned LGBM CV scores: ", np.sqrt(-1*scores))
print("Tuned LGBM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


Tuned LGBM CV scores:  [818.31540138 806.63462018 829.13671504 826.10925065 806.94084598
 805.2513291  824.9867977  843.99037956 805.46347541 826.38294889
 787.5009531  812.40137985 820.36741361 812.22687069 823.91644406]
Tuned LGBM CV mean = 816.75 with std = 146.05


In [148]:
filename = f'lgbm_지협지11241705_{np.sqrt(-1*scores.mean()):.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)