In [1]:
import os
import sys
sys.path.append('../../')

import pandas as pd
import xgboost as xgb
import numpy as np
import swifter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from datetime import datetime, timedelta
from tqdm import tqdm, tqdm_notebook
from category_encoders import TargetEncoder
import core.config as conf

In [2]:
tqdm_notebook().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


0it [00:00, ?it/s]

## 01. Train Data Load

In [3]:
## load preprocessing data (past_d)
train_df = pd.read_csv('./result/train_past_d.csv')
test_df = pd.read_csv('./result/test_past_d.csv')

In [4]:
path = conf.data_path

In [15]:
train_df = pd.read_csv(f'{path}'+'stk_hld_train.csv')

In [6]:
train_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A003520,20201112,22
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A002310,20180905,324


In [7]:
cus_df = pd.read_csv(f'{path}'+'cus_info.csv')

In [8]:
cus_df.head()

Unnamed: 0,act_id,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd
0,64aae8dd71e5c0761000db9f9a6779e504e7a4aa9dc097...,1,4,99,1,2,3,3,6,16
1,5f7c3a8f37d9c268d06130ff0be5d32a1b9ef68c13049f...,1,6,4,4,2,5,2,6,1
2,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,2,7,4,3,2,5,5,6,9
3,6d497facfa1ea5901b827335553331f8555fec02a8184f...,2,6,4,4,8,5,3,6,16
4,b727c78d2cfc246c97b677f29a034399a0c7e7873fff44...,1,5,2,2,2,5,5,5,16


In [9]:
iem_df = pd.read_csv(f'{path}'+'iem_info_20210902.csv')

In [10]:
iem_df.head()

Unnamed: 0,iem_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd
0,A000020,동화약품,8,2,99
1,A000080,하이트진로,14,2,1
2,A000180,성창기업지주,5,3,99
3,A000227,유유제약2우B,8,99,99
4,A000325,노루홀딩스우,2,99,99


In [11]:
L_encoder = LabelEncoder()
L_encoder.fit(iem_df["iem_krl_nm"])

LabelEncoder()

In [12]:
iem_df['iem_krl_nm'] = L_encoder.transform(iem_df["iem_krl_nm"])

In [13]:
train_df = pd.merge(left = train_df, right = cus_df, how='left', on='act_id')

In [14]:
train_df = pd.merge(train_df, iem_df, how='left', on='iem_cd')

## 02. Test Data Load

In [15]:
test_df = pd.read_csv(f'{path}'+'stk_hld_test.csv')

In [16]:
test_df = pd.merge(left = test_df, right = cus_df, how='left', on='act_id')

In [17]:
test_df = pd.merge(test_df, iem_df, how='left', on='iem_cd')

In [18]:
test_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hist_d,submit_id,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A032640,20200522,153,IDX00001,0,1,9,3,2,2,9,5,5,8,418,4,1,1
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A160600,20190823,335,IDX00002,0,1,9,3,2,2,9,5,5,8,2230,10,3,99
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A234340,20200611,139,IDX00003,0,1,9,3,2,2,9,5,5,8,1515,13,2,99
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A131760,20200120,236,IDX00004,0,1,9,3,2,2,9,5,5,8,2681,13,3,99
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A293490,20201217,9,IDX00005,0,1,9,3,2,2,9,5,5,8,2450,13,1,2


## 03. Stock Hist

In [6]:
stk_df = pd.read_csv(f'{path}'+'stk_bnc_hist.csv')

In [6]:
stk_df.head()

Unnamed: 0,act_id,bse_dt,iem_cd,bnc_qty,tot_aet_amt,stk_par_pr
0,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,20200820,A008770,40.0,2828000.0,5000.0
1,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,20200623,A008770,20.0,1390000.0,5000.0
2,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,20160104,A005940,311.0,2982490.0,5000.0
3,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,20200814,A005930,40.0,2320000.0,100.0
4,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,20200623,A005930,20.0,1028000.0,100.0


In [11]:
# past_d

In [7]:
def working_day(s_dt, e_dt) : # 공휴일 제외
    s_dt = str(s_dt).split('.')[0]
    e_dt = str(e_dt).split('.')[0]
    s_dt = s_dt[:4] + '-' + s_dt[4:6] + '-' + s_dt[6:]
    e_dt = e_dt[:4] + '-' + e_dt[4:6] + '-' + e_dt[6:]
    return np.busday_count(s_dt, e_dt, weekmask='1111100')

## Data Preprocessing

#### Past_d

In [26]:
stk_df = stk_df.sort_values(by=['bse_dt'])

In [31]:
train_df['past_d'] = train_df.swifter.progress_bar().apply(lambda x : stk_hist(x['act_id'], x['iem_cd']), axis = 1)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Dask Apply:   0%|          | 0/40 [00:00<?, ?it/s]

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [40]:
test_df['past_d'] = test_df.apply(lambda x : stk_hist(x['act_id'], x['iem_cd']), axis = 1)

  after removing the cwd from sys.path.


#### user avg past_d : 사용자 별 past_d
#### iem avg past_d : 종목 별 평균 past_d
#### ivs  icn avg past_d : 투자성향 별 평균 past_d
#### mkt_pr_tal_scl_tp_cd : 시가총액 규모 별 평균 past_d
-> 제대로 하려면 stk_hist 파일 돌려서 다시 구해야 함

In [7]:
user_avg_pastd = train_df['past_d'].groupby(train_df['act_id']).mean()

In [8]:
iem_avg_pastd = train_df['past_d'].groupby(train_df['iem_cd']).mean()

In [9]:
ivs_icn_avg_pastd = train_df['past_d'].groupby(train_df['ivs_icn_cd']).mean()

In [10]:
user_avg_pastd_test = test_df['past_d'].groupby(test_df['act_id']).mean()

In [11]:
iem_avg_pastd_test = test_df['past_d'].groupby(test_df['iem_cd']).mean()

In [12]:
mkt_pr_avg_pastd = train_df['past_d'].groupby(train_df['mkt_pr_tal_scl_tp_cd']).mean()

In [13]:
train_df['user_avg_past_d'] = train_df.apply(lambda x : user_avg_pastd[x['act_id']], axis = 1)

In [14]:
train_df['iem_avg_past_d'] = train_df.apply(lambda x : iem_avg_pastd[x['iem_cd']], axis = 1)

In [15]:
train_df['ivs_icn_past_d'] = train_df.apply(lambda x : ivs_icn_avg_pastd[x['ivs_icn_cd']], axis = 1)

In [16]:
train_df['mkt_pr_past_d'] = train_df.apply(lambda x : mkt_pr_avg_pastd[x['mkt_pr_tal_scl_tp_cd']], axis = 1)

In [17]:
test_df['user_avg_past_d'] = test_df.apply(lambda x : user_avg_pastd[x['act_id']] if x['act_id'] in user_avg_pastd else user_avg_pastd_test[x['act_id']], axis = 1)

In [18]:
test_df['iem_avg_past_d'] = test_df.apply(lambda x : iem_avg_pastd[x['iem_cd']] if x['iem_cd'] in iem_avg_pastd else iem_avg_pastd_test[x['iem_cd']], axis = 1)

In [19]:
test_df['ivs_icn_past_d'] = test_df.apply(lambda x : ivs_icn_avg_pastd[x['ivs_icn_cd']], axis = 1)

In [20]:
test_df['mkt_pr_past_d'] = test_df.apply(lambda x : mkt_pr_avg_pastd[x['mkt_pr_tal_scl_tp_cd']], axis = 1)

In [21]:
train_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,...,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,past_d,hist_d,user_avg_past_d,iem_avg_past_d,ivs_icn_past_d,mkt_pr_past_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11,1,9,3,2,2,9,...,101,1,1,1,195.0,6.6,41.074468,66.392544,39.918768,58.505146
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80,1,9,3,2,2,9,...,1361,9,1,1,86.0,48.0,41.074468,101.057144,39.918768,58.505146
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5,1,9,3,2,2,9,...,2530,12,2,99,6.0,3.0,41.074468,44.85715,39.918768,40.24891
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A003520,20201112,22,1,9,3,2,2,9,...,1969,8,2,1,22.0,13.2,41.074468,44.580025,39.918768,40.24891
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A002310,20180905,324,1,9,3,2,2,9,...,1696,10,3,99,347.0,194.4,41.074468,81.323587,39.918768,36.059944


In [22]:
test_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hist_d,submit_id,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,...,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,past_d,user_avg_past_d,iem_avg_past_d,ivs_icn_past_d,mkt_pr_past_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A032640,20200522,153,IDX00001,0,1,9,3,2,...,8,418,4,1,1,159.0,41.074468,87.701799,39.918768,58.505146
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A160600,20190823,335,IDX00002,0,1,9,3,2,...,8,2230,10,3,99,354.0,41.074468,33.203922,39.918768,36.059944
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A234340,20200611,139,IDX00003,0,1,9,3,2,...,8,1515,13,2,99,145.0,41.074468,31.919831,39.918768,40.24891
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A131760,20200120,236,IDX00004,0,1,9,3,2,...,8,2681,13,3,99,248.0,41.074468,37.86849,39.918768,36.059944
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A293490,20201217,9,IDX00005,0,1,9,3,2,...,8,2450,13,1,2,10.0,41.074468,15.034166,39.918768,58.505146


#### hist d : 

In [5]:
train_df["hist_d"] = train_df["hold_d"]*0.6

In [6]:
train_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,past_d,hist_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11,1,9,3,2,2,9,5,5,8,101,1,1,1,195.0,6.6
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80,1,9,3,2,2,9,5,5,8,1361,9,1,1,86.0,48.0
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5,1,9,3,2,2,9,5,5,8,2530,12,2,99,6.0,3.0
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A003520,20201112,22,1,9,3,2,2,9,5,5,8,1969,8,2,1,22.0,13.2
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A002310,20180905,324,1,9,3,2,2,9,5,5,8,1696,10,3,99,347.0,194.4


In [7]:
test_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hist_d,submit_id,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd,past_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A032640,20200522,153,IDX00001,0,1,9,3,2,2,9,5,5,8,418,4,1,1,159.0
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A160600,20190823,335,IDX00002,0,1,9,3,2,2,9,5,5,8,2230,10,3,99,354.0
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A234340,20200611,139,IDX00003,0,1,9,3,2,2,9,5,5,8,1515,13,2,99,145.0
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A131760,20200120,236,IDX00004,0,1,9,3,2,2,9,5,5,8,2681,13,3,99,248.0
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A293490,20201217,9,IDX00005,0,1,9,3,2,2,9,5,5,8,2450,13,1,2,10.0


## External Data Featuring

In [7]:
external_path = conf.external_path

In [8]:
stk_mean_df = pd.read_csv(external_path+'mean_stockdata.csv')

In [9]:
stk_mean_df['code'] = stk_mean_df.apply(lambda x : 'A' + x['code'], axis = 1)

In [10]:
stk_mean_df

Unnamed: 0,code,oepn,high,low,close,volume,chage
0,A000020,10628.321108,10875.158924,10395.338223,10611.340668,4.514417e+05,0.001178
1,A000080,13718.842105,13873.710526,13566.907895,14156.565789,1.592198e+06,0.000800
2,A000180,3205.013040,3268.528117,3137.779136,3229.211084,1.243295e+06,-0.000945
3,A000227,13186.233089,13408.596577,12970.257539,13402.982885,3.864996e+04,-0.000098
4,A000325,18285.819071,18510.423798,18045.867971,18283.928280,1.757751e+05,0.000082
...,...,...,...,...,...,...,...
2807,A156080,10585.937500,10747.812500,10474.375000,10595.937500,2.414732e+05,0.007861
2808,A160980,10113.500000,10179.500000,9927.500000,10037.000000,1.963900e+04,0.000063
2809,A161580,10127.000000,10232.000000,9951.000000,10080.000000,8.949426e+05,0.001339
2810,A163730,10156.500000,10307.250000,10123.000000,10264.500000,9.705650e+04,0.016875


In [11]:
train_df = pd.merge(left = train_df, right = stk_mean_df[['code', 'volume']], how='left', left_on='iem_cd', right_on = 'code')

In [12]:
test_df = pd.merge(left = test_df, right = stk_mean_df[['code', 'volume']], how='left', left_on='iem_cd', right_on = 'code')

## Target Encoding

In [13]:
target_encoder = TargetEncoder()

In [14]:
target_encoder.fit(train_df['iem_cd'], train_df['hold_d'])

TargetEncoder(cols=['iem_cd'])

In [15]:
train_df['iem_cd_te'] = target_encoder.transform(train_df['iem_cd'], train_df['hold_d'])

In [16]:
test_df['iem_cd_te'] = target_encoder.transform(test_df['iem_cd'], test_df['hold_d'])

In [17]:
test_df = test_df.fillna(0)

In [18]:
train_df = train_df.fillna(0)

In [19]:
#target_encoder.transform(train_df['ivs_icn_cd'], train_df['hold_d'])

## past_d 계산

In [23]:
id_stk_dur_df = pd.read_csv(conf.data_path + 'id_stk_dur_list.csv')

In [29]:
train_df = pd.merge(train_df, id_stk_dur_df[['act_id', 'iem_cd', 'mean']], how = 'left', on = ['act_id', 'iem_cd'])

In [30]:
test_df = pd.merge(test_df, id_stk_dur_df[['act_id', 'iem_cd', 'mean']], how = 'left', on = ['act_id', 'iem_cd'])

## Drop Features

In [32]:
train_df = train_df.drop(['act_id', 'iem_cd', 'code', 'byn_dt', 'past_d'], axis = 1)

In [33]:
used_features = train_df.columns.drop('hold_d')

In [34]:
test_df = test_df[used_features]

In [35]:
train_df.columns

Index(['hold_d', 'sex_dit_cd', 'cus_age_stn_cd', 'ivs_icn_cd',
       'cus_aet_stn_cd', 'mrz_pdt_tp_sgm_cd', 'lsg_sgm_cd', 'tco_cus_grd_cd',
       'tot_ivs_te_sgm_cd', 'mrz_btp_dit_cd', 'iem_krl_nm', 'btp_cfc_cd',
       'mkt_pr_tal_scl_tp_cd', 'stk_dit_cd', 'hist_d', 'volume', 'iem_cd_te',
       'mean'],
      dtype='object')

In [36]:
test_df.columns

Index(['sex_dit_cd', 'cus_age_stn_cd', 'ivs_icn_cd', 'cus_aet_stn_cd',
       'mrz_pdt_tp_sgm_cd', 'lsg_sgm_cd', 'tco_cus_grd_cd',
       'tot_ivs_te_sgm_cd', 'mrz_btp_dit_cd', 'iem_krl_nm', 'btp_cfc_cd',
       'mkt_pr_tal_scl_tp_cd', 'stk_dit_cd', 'hist_d', 'volume', 'iem_cd_te',
       'mean'],
      dtype='object')

## 03. XGBoost

In [37]:
col = train_df.columns

In [38]:
col = col.drop('hold_d')

In [39]:
TARGET = 'hold_d'

In [40]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df[col], train_df[TARGET], test_size = 0.2)

In [41]:
model = xgb.XGBRegressor(n_estimators=1000, max_depth=8, min_child_weighteta=0.1, subsample=0.7, colsample_bytree=0.8)

#### GridSearch

In [42]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'eta' :[0.05, 0.1],
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7, 8],
              'min_child_weight': [1, 2, 4, 6],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500, 1000]}

xgb_grid = GridSearchCV(model,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

In [27]:
model.fit(X_train, y_train, verbose=False)

Parameters: { "min_child_weighteta" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=8,
             min_child_weight=1, min_child_weighteta=0.1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=20,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [43]:
xgb_grid.fit(X_train, y_train, verbose=False)

Fitting 2 folds for each of 192 candidates, totalling 384 fits
Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "m



Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may 


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may

Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=8, min_child_weight=None,
                                    min_child_weighteta=0.1, missing=nan,
                                    monotone_constraints=None,
                                    n_es...
                                    scale_pos_weight=None, subsample=0.7,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=5,
             param_grid={'

In [51]:
import joblib

In [52]:
joblib.dump(xgb_grid, 'xgb_gridcv.pkl')

['xgb_gridcv.pkl']


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [44]:
xgb.plot_importance(model)

NotFittedError: need to call fit or load_model beforehand

In [45]:
xgb_grid.score(X_train, y_train)   

0.9999891255029099

In [31]:
model.score(X_train, y_train)

0.9999867422859888

## Predict Score

In [None]:
pred = model.predict(X_valid)

In [46]:
pred = xgb_grid.predict(X_valid)

In [47]:
y_valid.values

array([ 1,  1, 28, ...,  4, 11, 19])

In [48]:
np.round(pred)

array([ 1.,  1., 28., ...,  4., 11., 19.], dtype=float32)

In [49]:
np.sqrt(mean_squared_error(y_valid.values, np.round(pred)))

0.2309231629177619

## 04. Submission

In [50]:
submission = pd.read_csv(f"{path}"+"sample_submission.csv")

In [51]:
y_pred = xgb_grid.predict(test_df) 

In [40]:
y_pred = model.predict(test_df)

In [52]:
y_pred

array([ 255.3505   ,  555.616    ,  232.19318  , ..., 1041.981    ,
         18.004068 ,    7.0241246], dtype=float32)

In [53]:
result = []
for i in y_pred:
    result.append(i)

In [54]:
result

[255.3505,
 555.616,
 232.19318,
 393.02863,
 14.885926,
 154.54553,
 167.63611,
 1042.3467,
 135.35214,
 841.32184,
 201.5839,
 1164.6857,
 749.91064,
 10.079292,
 18.046349,
 5.06686,
 187.89557,
 117.43706,
 469.3259,
 13.039008,
 10.044616,
 1155.4307,
 693.1723,
 27.056496,
 3.1005492,
 111.89685,
 111.93448,
 393.0884,
 752.306,
 468.92624,
 114.99458,
 115.11899,
 1036.2914,
 218.34941,
 20.051285,
 128.28542,
 218.40517,
 1149.0049,
 218.4102,
 1114.8121,
 1172.3646,
 716.7076,
 1154.1947,
 1169.2665,
 1119.3865,
 1012.66705,
 1121.8529,
 1178.7136,
 1169.2109,
 1171.6953,
 1165.0485,
 1119.4199,
 233.41718,
 1006.9923,
 1011.9559,
 242.47157,
 217.60568,
 993.34106,
 1001.89404,
 1031.3984,
 122.81199,
 1014.83636,
 964.8978,
 1032.8284,
 932.85126,
 980.8714,
 1012.96075,
 122.65766,
 201.60352,
 963.7537,
 1001.68427,
 997.8182,
 1005.80096,
 1151.5768,
 20.03546,
 250.40309,
 43.041367,
 39.96264,
 245.55884,
 18.031649,
 1.9789472,
 1098.1925,
 1163.6685,
 1060.4639,
 11.9

In [55]:
submission["hold_d"] = np.round(result)

In [56]:
submission

Unnamed: 0,submit_id,hold_d
0,IDX00001,255.0
1,IDX00002,556.0
2,IDX00003,232.0
3,IDX00004,393.0
4,IDX00005,15.0
...,...,...
70591,IDX70592,38.0
70592,IDX70593,173.0
70593,IDX70594,1042.0
70594,IDX70595,18.0


In [57]:
submission.to_csv("./result/gridcv_xgboost(iem_cd_target_encoding lib with hist_d volume past_d(mean)).csv", index = False)


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "min_child_weighteta", "silent" } might not be used.

  This may