In [1]:
# Notebook-wide imports, interleaved with pandas / matplotlib display configuration.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
from tqdm import tqdm
import requests
from urllib.parse import urlparse
# Let pandas show up to 500 rows and 500 columns before truncating displayed frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import datetime
from sklearn.preprocessing import quantile_transform
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
import gc
import matplotlib
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph
# (presumably because the Korean font configured below lacks that glyph — confirm).
matplotlib.rcParams['axes.unicode_minus'] = False
# Path to a local NanumBarunGothic TTF file; wrapped below in a FontProperties
# object (size 18) that plotting calls can pass explicitly.
font_path = './NanumFontSetup_TTF_BARUNGOTHIC/NanumBarunGothic.ttf'
import matplotlib.font_manager as fm
fontprop = fm.FontProperties(fname=font_path, size=18)
import seaborn as sns
import shap

In [2]:
%%time
# Load the version-16 training table.
# Steps: read the CSV, drop the '주택가격지수' column, discard every row that still
# contains a missing value, then normalise the column names so that spaces and
# the middle dot ('·') both become underscores.
# (A 200–1500 filter on '가격면적' used to be applied here but was commented out;
# the range filter is applied in a later cell instead.)
basedir = './Training/training_data_ver_16/'
training_data_df = (
    pd.read_csv(basedir + 'training_data_ver_16.csv')
    .drop(columns=['주택가격지수'])
    .dropna()
)
_to_underscore = str.maketrans({' ': '_', '·': '_'})
training_data_df.columns = [name.translate(_to_underscore) for name in training_data_df.columns]

Wall time: 1min 26s


In [3]:
# Keep only rows whose '가격면적' lies in the closed interval [200, 3000].
training_data_df = training_data_df[training_data_df['가격면적'].between(200, 3000)]

In [4]:
# Derived feature: '부속건축물면적' divided element-wise by '총호수'.
training_data_df['호수대비부속건축물면적'] = training_data_df['부속건축물면적'].div(training_data_df['총호수'])

In [5]:
# Keep rows whose annex-area-per-unit ratio is strictly below 100.
# Rows where the ratio is NaN or +inf (e.g. from a zero '총호수', if any exist)
# fail the comparison and are therefore dropped as well.
training_data_df = training_data_df.loc[training_data_df['호수대비부속건축물면적'].lt(100)]

In [6]:
# After the row filters above, relabel the rows 0..n-1 and discard the old labels.
training_data_df.index = pd.RangeIndex(len(training_data_df))

In [7]:
# Replace each listed area column by log1p of its values (element-wise, in place on the frame).
cols_to_log = ['총건축면적', '총연면적', '총용적률연면적', '토지면적', '부속건축물면적']
training_data_df[cols_to_log] = training_data_df[cols_to_log].apply(np.log1p)

In [8]:
# Table of categorical column names; its 'colname' column is read in the next cell.
cat_cols_path = f'{basedir}cat_cols.csv'
cat_cols_df = pd.read_csv(cat_cols_path)

In [9]:
# Apply the same name normalisation used for the training frame's columns
# (space and '·' -> '_') so the two sets of names can be matched.
cat_cols_list = []
for raw_name in cat_cols_df['colname'].tolist():
    cat_cols_list.append(raw_name.replace('·', '_').replace(' ', '_'))
len(cat_cols_list)

296

In [10]:
# Categorical columns actually present in the training frame, kept in the frame's column order.
_frame_columns = training_data_df.columns
cat_cols = _frame_columns[_frame_columns.isin(cat_cols_list)].tolist()
len(cat_cols)

296

In [11]:
# Quick look at the first ten categorical column names.
cat_cols[0:10]

['건물나이',
 'trade_type',
 '표준지여부',
 '지목명',
 '용도지역명1',
 '용도지역명2',
 '토지이동상황',
 '지형높이',
 '지형형상',
 '도로접면']

In [12]:
# Engineered feature: ('토지면적' / '총연면적') * '공시지가(만원)'.
# NOTE(review): '토지면적' and '총연면적' are both listed in cols_to_log in an earlier
# cell, so when cells run in order this ratio is taken between log1p-transformed
# values — confirm this matches how the feature was built when shap_top_36.csv
# was produced.
_land_to_floor_ratio = training_data_df['토지면적'].div(training_data_df['총연면적'])
training_data_df['토지면적over총연면적times공시지가'] = _land_to_floor_ratio.mul(training_data_df['공시지가(만원)'])

In [13]:
# Per-feature SHAP summary (first CSV column is the feature name, used as the index),
# ordered from the highest to the lowest 'shap_summary_mean'.
shap_summary_df = pd.read_csv(f'{basedir}shap_top_36.csv', index_col=[0])
shap_summary_df = shap_summary_df.sort_values(by='shap_summary_mean', ascending=False)
print(shap_summary_df.shape)
shap_summary_df.head()

(36, 6)


Unnamed: 0,model_0_shap_summary,model_1_shap_summary,model_2_shap_summary,model_3_shap_summary,model_4_shap_summary,shap_summary_mean
trade_type,93.883975,96.78944,95.676615,91.307413,95.812986,94.694086
토지면적over총연면적times공시지가,71.798361,75.450018,74.559728,75.656161,74.839162,74.460686
year_linear,66.046958,67.115133,64.616693,62.829061,66.577451,65.437059
건축년도,50.745131,50.318326,49.530523,51.125563,49.264659,50.19684
전용면적,37.663049,40.305701,35.925046,37.538836,37.998428,37.886212


In [14]:
# Show the last 50 rows of the ranking (the whole table when it has fewer than 50 rows).
shap_summary_df.iloc[-50:]

Unnamed: 0,model_0_shap_summary,model_1_shap_summary,model_2_shap_summary,model_3_shap_summary,model_4_shap_summary,shap_summary_mean
trade_type,93.883975,96.78944,95.676615,91.307413,95.812986,94.694086
토지면적over총연면적times공시지가,71.798361,75.450018,74.559728,75.656161,74.839162,74.460686
year_linear,66.046958,67.115133,64.616693,62.829061,66.577451,65.437059
건축년도,50.745131,50.318326,49.530523,51.125563,49.264659,50.19684
전용면적,37.663049,40.305701,35.925046,37.538836,37.998428,37.886212
ynorm,32.979992,34.373101,33.867562,34.391293,32.689793,33.660348
xy,23.269583,22.479291,20.672874,22.702364,24.383246,22.701472
x_2nd,20.439473,20.52579,17.587809,19.614992,19.215703,19.476753
층mean,13.221159,12.170351,11.587628,13.440068,13.378779,12.759597
주건축물수,12.342852,11.590904,12.635774,10.983418,11.691039,11.848797


In [15]:
# Non-feature columns carried alongside the selected features. The CV cell below
# drops all of them from the model inputs and uses '가격면적' as the label.
target_cols = [
    '금액(만원)',
    '가격면적',
    'target_log_transformed',
    'targetarea_log_transformed',
    '지번주소',
]

In [16]:
from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [17]:
# Independent (deep) copy of the SHAP ranking for the feature-count sweep below.
shap_summary1_df = shap_summary_df.copy(deep=True)

In [18]:
# Feature-count sweep.
#
# For n = 35, 34, ..., 16 take the n highest-ranked features from shap_summary1_df,
# train a LightGBM regressor on '가격면적' with 5-fold cross-validation, and record
# the out-of-fold RMSE. Results are collected in `rmse_by_n_features`
# ({number of features: OOF RMSE}) so they can be compared without scrolling
# through the training logs.
#
# Unlike the previous version of this cell, `training_data_df` and `cat_cols`
# are NOT overwritten: each iteration works on its own column subset, so the
# cell can be re-run (or the sweep range changed) without reloading the data.

# Loop-invariant settings, built once.
params = {
    'objective': 'huber',
    # NOTE(review): 10 is far above typical LightGBM learning rates. The logs
    # show slow, steady improvement and most folds reach the 10000-round cap
    # without early stopping, so the sweep compares models that have not fully
    # converged — confirm this value (and the round cap) is intentional.
    'learning_rate': 10,
    'seed': 42,
    'max_depth': 12,
    'num_leaves': 37,
    'lambda_l2': 0,
    'metric': 'huber',
    'num_threads': 6,
}
num_rounds = 10000
kf = KFold(n_splits=5, shuffle=True, random_state=42)

start_n_features = 35  # NOTE(review): the ranking file holds 36 rows; the 36th is never evaluated — confirm
n_sweeps = 20

rmse_by_n_features = {}

for n_features in range(start_n_features, start_n_features - n_sweeps, -1):
    gc.collect()
    shortened_df = shap_summary1_df.iloc[:n_features, :]
    feature_cols = shortened_df.index.tolist()

    # Per-iteration column subset; the full frame stays intact for later iterations/re-runs.
    subset_df = training_data_df[feature_cols + target_cols]
    feature_set = set(feature_cols)
    subset_cat_cols = [col for col in cat_cols if col in feature_set]

    print(shortened_df.shape)

    # Out-of-fold predictions, filled fold by fold at the validation positions.
    oof = np.zeros(subset_df.shape[0])

    for train_idx, val_idx in kf.split(subset_df):
        # KFold.split yields positional indices, so select rows by position
        # (.iloc) rather than by label (.loc); this stays correct even if the
        # frame's index is not 0..n-1.
        train_data = subset_df.iloc[train_idx]
        val_data = subset_df.iloc[val_idx]

        x_train = train_data.drop(columns=target_cols)
        y_train = train_data['가격면적']

        x_val = val_data.drop(columns=target_cols)
        y_val = val_data['가격면적']

        train_dataset = lgb.Dataset(x_train, label=y_train, categorical_feature=subset_cat_cols)
        val_dataset = lgb.Dataset(x_val, label=y_val, categorical_feature=subset_cat_cols)

        # NOTE(review): early_stopping_rounds / verbose_eval are accepted by the
        # LightGBM version that produced the logs in this notebook; newer
        # releases expect lgb.early_stopping / lgb.log_evaluation callbacks.
        bst = lgb.train(params, train_dataset, num_rounds, valid_sets=[val_dataset], early_stopping_rounds=500, verbose_eval=1000)

        preds = bst.predict(x_val, num_iteration=bst.best_iteration)
        oof[val_idx] = preds

    rmse = np.sqrt(mean_squared_error(subset_df['가격면적'], oof))
    rmse_by_n_features[n_features] = rmse
    print(f'n_features={n_features}  OOF RMSE={rmse}')

(35, 6)


Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4882
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 35


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.7745
[2000]	valid_0's huber: 38.1895
[3000]	valid_0's huber: 36.6206
[4000]	valid_0's huber: 35.4938
[5000]	valid_0's huber: 35.0119
[6000]	valid_0's huber: 34.7615
[7000]	valid_0's huber: 34.6447
[8000]	valid_0's huber: 34.5933
[9000]	valid_0's huber: 34.5085
[10000]	valid_0's huber: 34.4539
Did not meet early stopping. Best iteration is:
[9959]	valid_0's huber: 34.4536
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4883
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 35
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8157
[2000]	valid_0's huber: 38.0955
[3000]	valid_0's huber: 36.3264
[4000]	valid_0's h

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4878
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 34


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8993
[2000]	valid_0's huber: 38.2936
[3000]	valid_0's huber: 36.4905
[4000]	valid_0's huber: 35.7789
[5000]	valid_0's huber: 35.6062
[6000]	valid_0's huber: 35.2832
[7000]	valid_0's huber: 35.0087
[8000]	valid_0's huber: 34.933
[9000]	valid_0's huber: 34.8624
[10000]	valid_0's huber: 34.8039
Did not meet early stopping. Best iteration is:
[9998]	valid_0's huber: 34.8037
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4879
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 34
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0592
[2000]	valid_0's huber: 38.3805
[3000]	valid_0's huber: 36.6168
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4869
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 33


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8495
[2000]	valid_0's huber: 38.2434
[3000]	valid_0's huber: 36.4104
[4000]	valid_0's huber: 35.6131
[5000]	valid_0's huber: 35.1247
[6000]	valid_0's huber: 35.0158
[7000]	valid_0's huber: 34.8871
[8000]	valid_0's huber: 34.8183
[9000]	valid_0's huber: 34.7206
[10000]	valid_0's huber: 34.672
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.672
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4870
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 33
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.2034
[2000]	valid_0's huber: 38.3992
[3000]	valid_0's huber: 36.6122
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4616
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 32


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9426
[2000]	valid_0's huber: 38.2572
[3000]	valid_0's huber: 36.5632
[4000]	valid_0's huber: 35.8546
[5000]	valid_0's huber: 35.3235
[6000]	valid_0's huber: 35.1774
[7000]	valid_0's huber: 35.1231
[8000]	valid_0's huber: 35.055
[9000]	valid_0's huber: 35.0001
[10000]	valid_0's huber: 34.9621
Did not meet early stopping. Best iteration is:
[9970]	valid_0's huber: 34.9618
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4617
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 32
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9521
[2000]	valid_0's huber: 38.2393
[3000]	valid_0's huber: 36.5116
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4609
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 31


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.024
[2000]	valid_0's huber: 38.2493
[3000]	valid_0's huber: 36.52
[4000]	valid_0's huber: 35.6934
[5000]	valid_0's huber: 35.1112
[6000]	valid_0's huber: 34.8533
[7000]	valid_0's huber: 34.7975
[8000]	valid_0's huber: 34.7093
[9000]	valid_0's huber: 34.6584
[10000]	valid_0's huber: 34.6316
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.6316
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4610
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 31
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9068
[2000]	valid_0's huber: 38.2951
[3000]	valid_0's huber: 36.4633
[4000]	valid_0's hub

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4564
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 30


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8971
[2000]	valid_0's huber: 38.2899
[3000]	valid_0's huber: 36.521
[4000]	valid_0's huber: 35.3729
[5000]	valid_0's huber: 34.9454
[6000]	valid_0's huber: 34.8714
[7000]	valid_0's huber: 34.8001
[8000]	valid_0's huber: 34.7562
[9000]	valid_0's huber: 34.6785
[10000]	valid_0's huber: 34.6209
Did not meet early stopping. Best iteration is:
[9949]	valid_0's huber: 34.6205
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4565
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 30
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9173
[2000]	valid_0's huber: 38.2626
[3000]	valid_0's huber: 36.4371
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4456
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 29


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9357
[2000]	valid_0's huber: 38.2041
[3000]	valid_0's huber: 36.5389
[4000]	valid_0's huber: 35.4325
[5000]	valid_0's huber: 35.0631
[6000]	valid_0's huber: 34.9341
[7000]	valid_0's huber: 34.7742
[8000]	valid_0's huber: 34.6402
[9000]	valid_0's huber: 34.5628
[10000]	valid_0's huber: 34.5289
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.5289
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4456
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 29
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8431
[2000]	valid_0's huber: 38.247
[3000]	valid_0's huber: 36.4621
[4000]	valid_0's h

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4234
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 28


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.949
[2000]	valid_0's huber: 38.3334
[3000]	valid_0's huber: 36.5318
[4000]	valid_0's huber: 35.4316
[5000]	valid_0's huber: 35.0454
[6000]	valid_0's huber: 34.9455
[7000]	valid_0's huber: 34.8022
[8000]	valid_0's huber: 34.7874
[9000]	valid_0's huber: 34.7631
[10000]	valid_0's huber: 34.694
Did not meet early stopping. Best iteration is:
[9990]	valid_0's huber: 34.6939
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4232
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 28
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8792
[2000]	valid_0's huber: 38.2088
[3000]	valid_0's huber: 36.5331
[4000]	valid_0's hub

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4230
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 27


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.8352
[2000]	valid_0's huber: 38.2454
[3000]	valid_0's huber: 36.5946
[4000]	valid_0's huber: 35.5045
[5000]	valid_0's huber: 34.883
[6000]	valid_0's huber: 34.6387
[7000]	valid_0's huber: 34.4929
[8000]	valid_0's huber: 34.3402
[9000]	valid_0's huber: 34.2741
[10000]	valid_0's huber: 34.216
Did not meet early stopping. Best iteration is:
[9977]	valid_0's huber: 34.2155
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4228
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 27
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0281
[2000]	valid_0's huber: 38.3197
[3000]	valid_0's huber: 36.6145
[4000]	valid_0's hub

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4224
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 26


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.029
[2000]	valid_0's huber: 38.3858
[3000]	valid_0's huber: 36.5871
[4000]	valid_0's huber: 35.4474
[5000]	valid_0's huber: 34.715
[6000]	valid_0's huber: 34.479
[7000]	valid_0's huber: 34.3122
[8000]	valid_0's huber: 34.1912
[9000]	valid_0's huber: 34.0494
[10000]	valid_0's huber: 34.0201
Did not meet early stopping. Best iteration is:
[9993]	valid_0's huber: 34.02
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4222
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 26
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 41.9792
[2000]	valid_0's huber: 38.4407
[3000]	valid_0's huber: 36.6715
[4000]	valid_0's huber:

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4211
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 25


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0236
[2000]	valid_0's huber: 38.3331
[3000]	valid_0's huber: 36.5228
[4000]	valid_0's huber: 35.4551
[5000]	valid_0's huber: 34.9357
[6000]	valid_0's huber: 34.7368
[7000]	valid_0's huber: 34.5646
[8000]	valid_0's huber: 34.429
[9000]	valid_0's huber: 34.316
[10000]	valid_0's huber: 34.2284
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.2284
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4209
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 25
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.1805
[2000]	valid_0's huber: 38.4256
[3000]	valid_0's huber: 36.6556
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4169
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 24


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0581
[2000]	valid_0's huber: 38.4964
[3000]	valid_0's huber: 36.7613
[4000]	valid_0's huber: 35.6539
[5000]	valid_0's huber: 35.1957
[6000]	valid_0's huber: 34.8816
[7000]	valid_0's huber: 34.6739
[8000]	valid_0's huber: 34.5886
[9000]	valid_0's huber: 34.4709
[10000]	valid_0's huber: 34.4346
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.4346
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4167
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 24
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0587
[2000]	valid_0's huber: 38.4244
[3000]	valid_0's huber: 36.6653
[4000]	valid_0's 

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4165
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 23


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.101
[2000]	valid_0's huber: 38.3104
[3000]	valid_0's huber: 36.5629
[4000]	valid_0's huber: 35.4411
[5000]	valid_0's huber: 34.9346
[6000]	valid_0's huber: 34.6486
[7000]	valid_0's huber: 34.5031
[8000]	valid_0's huber: 34.3894
[9000]	valid_0's huber: 34.3286
[10000]	valid_0's huber: 34.2799
Did not meet early stopping. Best iteration is:
[9999]	valid_0's huber: 34.2799
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4163
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 23
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.1818
[2000]	valid_0's huber: 38.4803
[3000]	valid_0's huber: 36.8055
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3943
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 22


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0109
[2000]	valid_0's huber: 38.2962
[3000]	valid_0's huber: 36.5805
[4000]	valid_0's huber: 35.4547
[5000]	valid_0's huber: 34.8347
[6000]	valid_0's huber: 34.6272
[7000]	valid_0's huber: 34.462
[8000]	valid_0's huber: 34.2996
Early stopping, best iteration is:
[8315]	valid_0's huber: 34.2811
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3942
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 22
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.0522
[2000]	valid_0's huber: 38.3599
[3000]	valid_0's huber: 36.5844
[4000]	valid_0's huber: 35.4148
[5000]	valid_0's huber: 34.6542
[6000]	valid_0's huber: 34.3513
[

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3930
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 21


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.2356
[2000]	valid_0's huber: 38.5044
[3000]	valid_0's huber: 36.7231
[4000]	valid_0's huber: 35.8682
[5000]	valid_0's huber: 35.3727
[6000]	valid_0's huber: 35.0783
[7000]	valid_0's huber: 34.8845
[8000]	valid_0's huber: 34.7819
[9000]	valid_0's huber: 34.6496
[10000]	valid_0's huber: 34.5449
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.5449
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3929
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 21
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.2924
[2000]	valid_0's huber: 38.6362
[3000]	valid_0's huber: 36.7893
[4000]	valid_0's 

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3675
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 20


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.2534
[2000]	valid_0's huber: 38.5326
[3000]	valid_0's huber: 36.7409
[4000]	valid_0's huber: 35.6301
[5000]	valid_0's huber: 35.1374
[6000]	valid_0's huber: 34.8788
[7000]	valid_0's huber: 34.6738
[8000]	valid_0's huber: 34.5203
[9000]	valid_0's huber: 34.4016
[10000]	valid_0's huber: 34.3287
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.3287
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3674
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 20
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.3056
[2000]	valid_0's huber: 38.683
[3000]	valid_0's huber: 36.8555
[4000]	valid_0's h

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3426
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 19


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.2558
[2000]	valid_0's huber: 38.651
[3000]	valid_0's huber: 36.8678
[4000]	valid_0's huber: 35.6921
[5000]	valid_0's huber: 35.174
[6000]	valid_0's huber: 34.8732
[7000]	valid_0's huber: 34.6546
[8000]	valid_0's huber: 34.5207
[9000]	valid_0's huber: 34.4155
[10000]	valid_0's huber: 34.3601
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 34.3601
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3425
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 19
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.3329
[2000]	valid_0's huber: 38.7026
[3000]	valid_0's huber: 36.8941
[4000]	valid_0's hu

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 18


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.4363
[2000]	valid_0's huber: 38.7992
[3000]	valid_0's huber: 36.9674
[4000]	valid_0's huber: 35.808
[5000]	valid_0's huber: 35.2074
[6000]	valid_0's huber: 34.8574
[7000]	valid_0's huber: 34.7104
[8000]	valid_0's huber: 34.6
[9000]	valid_0's huber: 34.5143
[10000]	valid_0's huber: 34.4819
Did not meet early stopping. Best iteration is:
[9984]	valid_0's huber: 34.4819
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3170
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 18
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 42.4003
[2000]	valid_0's huber: 38.7855
[3000]	valid_0's huber: 36.9894
[4000]	valid_0's huber

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3118
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 17


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 43.3784
[2000]	valid_0's huber: 39.6952
[3000]	valid_0's huber: 38.054
[4000]	valid_0's huber: 37.1183
[5000]	valid_0's huber: 36.5924
[6000]	valid_0's huber: 36.2613
[7000]	valid_0's huber: 36.0693
[8000]	valid_0's huber: 35.9411
[9000]	valid_0's huber: 35.8725
[10000]	valid_0's huber: 35.803
Did not meet early stopping. Best iteration is:
[9993]	valid_0's huber: 35.8026
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3116
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 17
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 43.1955
[2000]	valid_0's huber: 39.6272
[3000]	valid_0's huber: 37.9821
[4000]	valid_0's hub

Using categorical_feature in Dataset.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2863
[LightGBM] [Info] Number of data points in the train set: 2926580, number of used features: 16


Overriding the parameters from Reference Dataset.
categorical_column in param dict is overridden.


[LightGBM] [Info] Start training from score 524.640869
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 43.3284
[2000]	valid_0's huber: 39.7978
[3000]	valid_0's huber: 38.0565
[4000]	valid_0's huber: 37.1097
[5000]	valid_0's huber: 36.5397
[6000]	valid_0's huber: 36.1968
[7000]	valid_0's huber: 36.0439
[8000]	valid_0's huber: 35.9132
[9000]	valid_0's huber: 35.8627
[10000]	valid_0's huber: 35.7938
Did not meet early stopping. Best iteration is:
[9994]	valid_0's huber: 35.7934
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2861
[LightGBM] [Info] Number of data points in the train set: 2926581, number of used features: 16
[LightGBM] [Info] Start training from score 524.755320
Training until validation scores don't improve for 500 rounds
[1000]	valid_0's huber: 43.3568
[2000]	valid_0's huber: 39.8694
[3000]	valid_0's huber: 38.1862
[4000]	valid_0's h