### Import Libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [4]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [5]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [6]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [7]:
def get_binary_target(df):
    """
    @Description: transform numeric target to binary
    @Param1 df, pandas dataframe
    @Param2 y_range, list of lists with min-max
    @return labels, binary labels
    """
    
    ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
          'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
          'Y_11', 'Y_12', 'Y_13', 'Y_14']
    ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], [7, 19], [22, 36.5], [-19.2, 19], 
                 [2.4, 4], [-29.2, -24], [-29.2, -24],[-30.6, -20], [19.6, 26.6], 
                 [-29.2, -24], [-29.2, -24], [-29.2, -24]]
    labels = pd.DataFrame()
    for idx in range(len(ys)):
        y_series = ~df[ys[idx]].between(ys_bounds[idx][0], ys_bounds[idx][1], inclusive='both')
        labels = pd.concat([labels, y_series.astype(int)], axis = 1)
    return labels


In [8]:

def get_splitted_data(binary_target, col, train_x_df, train_y_df, test_size = 0.2):
    
    train = pd.concat([train_x_df, train_y_df[col]], axis = 1) # 학습데이터에 수치형 타겟 칼럼 추가 
    target = binary_target[col] # 칼럼 이진 데이터 (불량 vs. 정상)
    X_train, X_test, y_train, y_test = train_test_split(train, target, random_state=1, test_size=test_size, stratify=target)
    
    # 여기서 X_test, y_test 는 이진 데이터이므로 사용하지 않음
    # 나눠진 데이터에서 불량/정상 데이터 비율 확인 
    print("학습 데이터에서의 불량/정상 Ratio : ", sum(y_train ==0) / sum(y_train))
    print("테스트 데이터에서의 불량/정상 Ratio: ", sum(y_test ==0) / sum(y_test))
    
    train_numerical_target = X_train[col] # 나눠진 *학습* 데이터에서 수치형 데이터 다시 추출
    train_feature = X_train.drop([col], axis = 1) # 나눠진 *학습* 데이터에서 수치형 데이터 제거

    test_numerical_target = X_test[col] # 나눠진 *테스트* 데이터에서 수치형 데이터 다시 추출
    test_feature = X_test.drop([col], axis = 1) # 나눠진 *테스트* 데이터에서 수치형 데이터 제거
    
    return train_feature, train_numerical_target, test_feature, test_numerical_target


In [9]:
train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
train_x, train_y = dataset_split_X_y(train_df)

cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)
y_binary_label = get_binary_target(train_y)


In [10]:

train_feature, train_target, test_feature, test_target = get_splitted_data(y_binary_label, 'Y_01', train_x, train_y, test_size=0.2)


학습 데이터에서의 불량/정상 Ratio :  25.82895850973751
테스트 데이터에서의 불량/정상 Ratio:  25.854237288135593


In [11]:
train_feature = pd.DataFrame(train_feature)
train_target = pd.DataFrame(train_target)
test_feature = pd.DataFrame(test_feature)
test_target = pd.DataFrame(test_target)

In [12]:
train_feature.reset_index(inplace=True)
train_target.reset_index(inplace=True)
test_feature.reset_index(inplace=True)
test_target.reset_index(inplace=True)



In [13]:
train_feature = train_feature.iloc[:, 1:]
train_target = train_target.iloc[:, 1:]
test_feature = test_feature.iloc[:, 1:]
test_target = test_target.iloc[:, 1:]

In [14]:
train_target

Unnamed: 0,Y_01
0,1.226
1,1.821
2,1.532
3,1.032
4,1.793
...,...
31680,1.355
31681,1.775
31682,1.392
31683,2.013


In [12]:
lst = []
for i in df_indicator.columns:  # 불량 데이터 (행) 인덱스 추출
    lst.append(df_indicator[df_indicator[i] == 1].index)
    
ans=set() # 유니크한 인덱스
for i in lst:
    for k in i:
        ans.add(k)

ans = list(ans)
ans.sort()
train_data_spec = train_df.loc[ans, :]  # 불량 데이터

NameError: name 'df_indicator' is not defined

In [None]:
train_data_norm = train_df.drop(train_data_spec.index) # 정상 데이터

In [None]:
print(len(train_data_norm))

train_x_norm, train_y_norm = dataset_split_X_y(train_data_norm)

In [None]:
train_x_spec, train_y_spec = dataset_split_X_y(train_data_spec)
print(len(train_x_spec))


In [None]:
new_train_x_spec_plus_norm = pd.concat([train_x_norm, train_x_spec], axis = 0)
new_train_y_spec_plus_norm = pd.concat([train_y_norm, train_y_spec], axis = 0)

In [None]:
new_train_x_spec_plus_norm

In [None]:
# msk1 = np.random.rand(len(train_x_norm)) < 0.8
# msk2 = np.random.rand(len(train_x_spec)) < 0.8

# tv_train_x_norm = train_x_norm[msk1]
# tv_valid_x_norm = train_x_norm[~msk1]
# tv_train_y_norm = train_y_norm[msk1]
# tv_valid_y_norm = train_y_norm[~msk1]

# tv_train_x_spec = train_x_spec[msk2]
# tv_valid_x_spec = train_x_spec[~msk2]
# tv_train_y_spec = train_y_spec[msk2]
# tv_valid_y_spec = train_y_spec[~msk2]

# tv_train_x = pd.concat([tv_train_x_norm, tv_train_x_spec], axis=0)
# tv_valid_x = pd.concat([tv_valid_x_norm, tv_valid_x_spec], axis=0)
# tv_train_y = pd.concat([tv_train_y_norm, tv_train_y_spec], axis=0)
# tv_valid_y = pd.concat([tv_valid_y_norm, tv_valid_y_spec], axis=0)

# tv_train_x.reset_index(inplace = True)
# tv_valid_x.reset_index(inplace = True)
# tv_train_y.reset_index(inplace = True)
# tv_valid_y.reset_index(inplace = True)

In [None]:
tv_train_x_norm

In [None]:
print(len(tv_train_x))
print(len(tv_train_y))
print('-------------------------------------------')
print(len(tv_valid_x))
print(len(tv_valid_y))

In [None]:
tv_train_x = tv_train_x.iloc[:, 1:]
tv_train_y = tv_train_y.iloc[:, 1:]
tv_valid_x = tv_valid_x.iloc[:, 1:]
tv_valid_y = tv_valid_y.iloc[:, 1:]

In [None]:
tv_train_x.columns

In [None]:
#cols_with_zero_variance = zero_variance(tv_train_x) # 분산이 0 (통과 여부)
#tv_train_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)
#tv_valid_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)

tv_train_x = tv_train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
tv_valid_x = tv_valid_x.drop(['X_10', 'X_11'], axis = 1)

#tv_valid_x = tv_valid_x.drop('ID', axis=1)

In [17]:
train_feature = pd.DataFrame(train_feature)
train_target = pd.DataFrame(train_target)

In [21]:
train_feature

Unnamed: 0,Y_01
15549,1.226
26431,1.821
6075,1.532
25127,1.032
8710,1.793
...,...
19701,1.355
6206,1.775
5378,1.392
28898,2.013


In [15]:
def lgbm_objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.5f}'.format(params['colsample_bytree']),
        'subsample': '{:.5f}'.format(params['subsample']),
        'min_split_gain': '{:.5f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.5f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.5f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.5f}'.format(params['reg_lambda']),
        'learning_rate': '{:.5f}'.format(params['learning_rate']),   
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 1,
        **params
    )

    losses = np.sqrt(-cross_val_score(model, train_feature, train_target, cv=10,scoring='neg_mean_squared_error'))
    losses = losses / np.mean(np.abs(train_y['Y_01']))
    
    print("NRMSE Loss {:.5f} params {}".format(losses.mean(), params))
    return losses.mean()

In [16]:
space_lgbm = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 1),
    'max_depth': hp.quniform('max_depth', 5, 250, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 200, 5),
    'min_child_samples': hp.quniform('min_child_samples', 10, 150, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 500),
    'reg_lambda': hp.uniform('reg_lambda', 0, 500),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = lgbm_objective,
            space = space_lgbm,
            algo = tpe.suggest,
            max_evals = 200)

print(best)
best['n_estimators'] = int(best['n_estimators'])
best['num_leaves'] = int(best['num_leaves'])
best['max_depth'] = int(best['max_depth'])
best['min_child_samples'] = int(best['min_child_samples'])

NRMSE Loss 0.25674 params {'n_estimators': 276, 'max_depth': 89, 'num_leaves': 180, 'min_child_samples': 15, 'colsample_bytree': '0.35485', 'subsample': '0.32233', 'min_split_gain': '0.45316', 'scale_pos_weight': '8.17551', 'reg_alpha': '25.15811', 'reg_lambda': '316.91357', 'learning_rate': '0.07763'}
NRMSE Loss 0.25786 params {'n_estimators': 951, 'max_depth': 132, 'num_leaves': 80, 'min_child_samples': 35, 'colsample_bytree': '0.51974', 'subsample': '0.81896', 'min_split_gain': '0.41096', 'scale_pos_weight': '9.85671', 'reg_alpha': '70.33169', 'reg_lambda': '265.46612', 'learning_rate': '0.42966'}
NRMSE Loss 0.26290 params {'n_estimators': 1431, 'max_depth': 122, 'num_leaves': 190, 'min_child_samples': 120, 'colsample_bytree': '0.50923', 'subsample': '0.52416', 'min_split_gain': '0.37497', 'scale_pos_weight': '4.43873', 'reg_alpha': '445.48074', 'reg_lambda': '209.76655', 'learning_rate': '0.09856'}
NRMSE Loss 0.25784 params {'n_estimators': 907, 'max_depth': 160, 'num_leaves': 185,

NRMSE Loss 0.25921 params {'n_estimators': 424, 'max_depth': 12, 'num_leaves': 160, 'min_child_samples': 55, 'colsample_bytree': '0.45721', 'subsample': '0.46986', 'min_split_gain': '0.32645', 'scale_pos_weight': '9.11140', 'reg_alpha': '140.40273', 'reg_lambda': '45.14901', 'learning_rate': '0.33089'}
NRMSE Loss 0.25593 params {'n_estimators': 572, 'max_depth': 231, 'num_leaves': 175, 'min_child_samples': 20, 'colsample_bytree': '0.33106', 'subsample': '0.58967', 'min_split_gain': '0.46315', 'scale_pos_weight': '6.18212', 'reg_alpha': '0.40427', 'reg_lambda': '116.06658', 'learning_rate': '0.06731'}
NRMSE Loss 0.25729 params {'n_estimators': 845, 'max_depth': 100, 'num_leaves': 175, 'min_child_samples': 25, 'colsample_bytree': '0.36953', 'subsample': '0.58702', 'min_split_gain': '0.47697', 'scale_pos_weight': '5.93219', 'reg_alpha': '46.60323', 'reg_lambda': '118.62368', 'learning_rate': '0.06547'}
NRMSE Loss 0.25872 params {'n_estimators': 599, 'max_depth': 179, 'num_leaves': 200, 'm

NRMSE Loss 0.25645 params {'n_estimators': 206, 'max_depth': 111, 'num_leaves': 100, 'min_child_samples': 45, 'colsample_bytree': '0.74923', 'subsample': '0.90297', 'min_split_gain': '0.07020', 'scale_pos_weight': '5.23307', 'reg_alpha': '23.60080', 'reg_lambda': '415.52243', 'learning_rate': '0.01526'}
NRMSE Loss 0.25999 params {'n_estimators': 288, 'max_depth': 86, 'num_leaves': 135, 'min_child_samples': 70, 'colsample_bytree': '0.84913', 'subsample': '0.97070', 'min_split_gain': '0.02697', 'scale_pos_weight': '3.99594', 'reg_alpha': '209.11159', 'reg_lambda': '317.12871', 'learning_rate': '0.01001'}
NRMSE Loss 0.26134 params {'n_estimators': 502, 'max_depth': 132, 'num_leaves': 125, 'min_child_samples': 55, 'colsample_bytree': '0.89778', 'subsample': '0.84131', 'min_split_gain': '0.18718', 'scale_pos_weight': '2.96998', 'reg_alpha': '271.11133', 'reg_lambda': '452.60973', 'learning_rate': '0.01998'}
NRMSE Loss 0.25812 params {'n_estimators': 391, 'max_depth': 8, 'num_leaves': 60, 'm

NRMSE Loss 0.25893 params {'n_estimators': 719, 'max_depth': 179, 'num_leaves': 140, 'min_child_samples': 10, 'colsample_bytree': '0.42722', 'subsample': '0.40719', 'min_split_gain': '0.59831', 'scale_pos_weight': '3.80663', 'reg_alpha': '108.14912', 'reg_lambda': '168.63745', 'learning_rate': '0.16927'}
NRMSE Loss 0.25660 params {'n_estimators': 638, 'max_depth': 201, 'num_leaves': 180, 'min_child_samples': 30, 'colsample_bytree': '0.49842', 'subsample': '0.45034', 'min_split_gain': '0.01645', 'scale_pos_weight': '6.81136', 'reg_alpha': '61.27422', 'reg_lambda': '153.79570', 'learning_rate': '0.22171'}
NRMSE Loss 0.25570 params {'n_estimators': 1088, 'max_depth': 168, 'num_leaves': 170, 'min_child_samples': 15, 'colsample_bytree': '0.60066', 'subsample': '0.38337', 'min_split_gain': '0.44531', 'scale_pos_weight': '6.10485', 'reg_alpha': '0.29239', 'reg_lambda': '198.70901', 'learning_rate': '0.07231'}
NRMSE Loss 0.26290 params {'n_estimators': 1097, 'max_depth': 184, 'num_leaves': 165

NRMSE Loss 0.25963 params {'n_estimators': 970, 'max_depth': 118, 'num_leaves': 200, 'min_child_samples': 10, 'colsample_bytree': '0.59618', 'subsample': '0.77283', 'min_split_gain': '0.22468', 'scale_pos_weight': '3.23472', 'reg_alpha': '164.76529', 'reg_lambda': '277.67292', 'learning_rate': '0.03040'}
NRMSE Loss 0.25716 params {'n_estimators': 654, 'max_depth': 150, 'num_leaves': 190, 'min_child_samples': 30, 'colsample_bytree': '0.52941', 'subsample': '0.52403', 'min_split_gain': '0.49997', 'scale_pos_weight': '4.50244', 'reg_alpha': '41.50257', 'reg_lambda': '107.19247', 'learning_rate': '0.04815'}
NRMSE Loss 0.25984 params {'n_estimators': 780, 'max_depth': 136, 'num_leaves': 145, 'min_child_samples': 20, 'colsample_bytree': '0.74091', 'subsample': '0.96542', 'min_split_gain': '0.69087', 'scale_pos_weight': '6.52776', 'reg_alpha': '152.27036', 'reg_lambda': '62.92666', 'learning_rate': '0.23212'}
 56%|████████████████████████▉                    | 111/200 [10:28<08:23,  5.66s/tri

KeyboardInterrupt: 

In [79]:

def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
    }
    
    model = LGBMRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))
    
    losses = np.sqrt(-cross_val_score(model, train_x, train_y['Y_01'], cv=10, scoring='neg_mean_squared_error'))
    losses = losses / np.mean(np.abs(train_y['Y_01']))
    
    print("NRMSE Loss {:.5f} params {}".format(losses, params))
    return loss


In [82]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 500,
            rstate=np.random.default_rng(1))


print(best)
best['n_estimators'] = int(best['n_estimators'])
best['num_leaves'] = int(best['num_leaves'])
best['max_depth'] = int(best['max_depth'])
best['min_child_samples'] = int(best['min_child_samples'])

  0%|                                                                          | 0/500 [00:00<?, ?trial/s, best loss=?]

job exception: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)


  0%|                                                                          | 0/500 [00:00<?, ?trial/s, best loss=?]


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\basic.py", line 2605, in __init__
    train_set.construct()
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\basic.py", line 1815, in construct
    self._lazy_init(self.data, label=self.label,
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\basic.py", line 1517, in _lazy_init
    params_str = param_dict_to_str(params)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\lightgbm\basic.py", line 294, in param_dict_to_str
    raise TypeError(f'Unknown type of parameter:{key}, got:{type(val).__name__}')
TypeError: Unknown type of parameter:boosting_type, got:LGBMRegressor


In [62]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, train_feature, train_target, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

Traceback (most recent call last):
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\dlwl9\AppData\Local\Temp\ipykernel_9628\1799931290.py", line 13, in lg_nrmse
    rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\pandas\core\indexing.py", line 961, in __getitem__
    return self._getitem_tuple(key)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\pandas\core\indexing.py", line 1458, in _getit

Traceback (most recent call last):
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\dlwl9\AppData\Local\Temp\ipykernel_9628\1799931290.py", line 13, in lg_nrmse
    rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\pandas\core\indexing.py", line 961, in __getitem__
    return self._getitem_tuple(key)
  File "C:\Users\dlwl9\anaconda3\envs\dacon\lib\site-packages\pandas\core\indexing.py", line 1458, in _getit

KeyboardInterrupt: 

In [None]:
print(len(tv_train_x.columns))
print(len(tv_valid_x.columns))

In [70]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(train_feature, train_target)
preds = model.predict(test_feature)

In [71]:
preds

array([[1.30549529],
       [1.29075044],
       [1.36952193],
       ...,
       [1.39878205],
       [1.33216841],
       [1.37291204]])

In [72]:
a = pd.DataFrame(preds)

In [75]:
a

Unnamed: 0,0
0,1.305495
1,1.290750
2,1.369522
3,1.229990
4,1.353248
...,...
7917,1.394789
7918,1.459305
7919,1.398782
7920,1.332168


In [76]:
print(max(preds), min(preds))

[1.611301] [1.03060427]


In [73]:
for i in a.columns:
    print('{}  /  Min : {}    ,   Min : {}'.format(i, min(a[i]), max(a[i])) )

0  /  Min : 1.030604271077466    ,   Min : 1.6113010015745468


In [None]:
submit = pd.read_csv('data/validation_test_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
#submit.to_csv('data/param_test.csv', index = False)

In [None]:
submit.head()

df_indicator = pd.DataFrame()

for i, k in enumerate(submit.columns):
    if k == 'ID':
        continue
    y_series = ~submit[k].between(y_feature_spec_info['최소'][i-1], y_feature_spec_info['최대'][i-1])
    if i == 1:
        df_indicator = y_series
    else:
        df_indicator = df_indicator + y_series


In [None]:
df_indicator

In [None]:
print(df_indicator.value_counts())
df_indicator[df_indicator==True] = 1
df_indicator[df_indicator==False] = 0

In [None]:
df_indicator

In [None]:
tv_valid_x['X_57'] = df_indicator



In [None]:
tv_valid_x

In [None]:
tv_valid_x['X_57'] = tv_valid_x['X_57'].astype('int')

In [None]:
tv_valid_x.dtypes

In [None]:
print(len(tv_train_x))
print(len(tv_valid_x))

In [None]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, tv_valid_x, tv_valid_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

In [None]:
submit

In [None]:
submit = submit.drop('ID', axis=1)

In [None]:
lg_nrmse(submit, tv_valid_y)