In [1]:
import os
import sys
sys.path.append('../../')

import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

import core.config as conf

## 01. Train Data Load

In [2]:
path = conf.data_path

In [3]:
train_df = pd.read_csv(f'{path}'+'stk_hld_train.csv')

In [4]:
train_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hold_d
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A006360,20180726,11
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005930,20180131,80
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A005070,20180517,5
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A003520,20201112,22
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A002310,20180905,324


In [5]:
cus_df = pd.read_csv(f'{path}'+'cus_info.csv')

In [6]:
cus_df.head()

Unnamed: 0,act_id,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd
0,64aae8dd71e5c0761000db9f9a6779e504e7a4aa9dc097...,1,4,99,1,2,3,3,6,16
1,5f7c3a8f37d9c268d06130ff0be5d32a1b9ef68c13049f...,1,6,4,4,2,5,2,6,1
2,1119c23c3a504ca7b75060277410c0f6fb9018ec7638c2...,2,7,4,3,2,5,5,6,9
3,6d497facfa1ea5901b827335553331f8555fec02a8184f...,2,6,4,4,8,5,3,6,16
4,b727c78d2cfc246c97b677f29a034399a0c7e7873fff44...,1,5,2,2,2,5,5,5,16


In [7]:
iem_df = pd.read_csv(f'{path}'+'iem_info_20210902.csv')

In [8]:
iem_df.head()

Unnamed: 0,iem_cd,iem_krl_nm,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd
0,A000020,동화약품,8,2,99
1,A000080,하이트진로,14,2,1
2,A000180,성창기업지주,5,3,99
3,A000227,유유제약2우B,8,99,99
4,A000325,노루홀딩스우,2,99,99


In [9]:
iem_df = iem_df.drop(['iem_krl_nm'], axis = 1)

In [10]:
train_df = pd.merge(left = train_df, right = cus_df, how='left', on='act_id')

In [11]:
train_df = pd.merge(train_df, iem_df, how='left', on='iem_cd')

In [12]:
train_df["hist_d"] = train_df["hold_d"]*0.6

## 02. Test Data Load

In [34]:
test_df = pd.read_csv(f'{path}'+'stk_hld_test.csv')

In [35]:
test_df = pd.merge(left = test_df, right = cus_df, how='left', on='act_id')

In [36]:
test_df = pd.merge(test_df, iem_df, how='left', on='iem_cd')

In [37]:
test_df.head()

Unnamed: 0,act_id,iem_cd,byn_dt,hist_d,submit_id,hold_d,sex_dit_cd,cus_age_stn_cd,ivs_icn_cd,cus_aet_stn_cd,mrz_pdt_tp_sgm_cd,lsg_sgm_cd,tco_cus_grd_cd,tot_ivs_te_sgm_cd,mrz_btp_dit_cd,btp_cfc_cd,mkt_pr_tal_scl_tp_cd,stk_dit_cd
0,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A032640,20200522,153,IDX00001,0,1,9,3,2,2,9,5,5,8,4,1,1
1,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A160600,20190823,335,IDX00002,0,1,9,3,2,2,9,5,5,8,10,3,99
2,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A234340,20200611,139,IDX00003,0,1,9,3,2,2,9,5,5,8,13,2,99
3,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A131760,20200120,236,IDX00004,0,1,9,3,2,2,9,5,5,8,13,3,99
4,0ad104dbed99be0cd858aa772765ddedade554601a981b...,A293490,20201217,9,IDX00005,0,1,9,3,2,2,9,5,5,8,13,1,2


## Data Preprocessing

In [17]:
train_df = train_df.drop(['act_id', 'iem_cd'], axis = 1)

In [38]:
test_df = test_df.drop(['act_id', 'iem_cd', 'hold_d', 'submit_id'], axis = 1)

In [39]:
test_df = test_df[['byn_dt', 'sex_dit_cd', 'cus_age_stn_cd', 'ivs_icn_cd',
       'cus_aet_stn_cd', 'mrz_pdt_tp_sgm_cd', 'lsg_sgm_cd', 'tco_cus_grd_cd',
       'tot_ivs_te_sgm_cd', 'mrz_btp_dit_cd', 'btp_cfc_cd',
       'mkt_pr_tal_scl_tp_cd', 'stk_dit_cd', 'hist_d']]

## 03. XGBoost

In [20]:
col = train_df.columns

In [21]:
col = col.drop('hold_d')

In [22]:
TARGET = 'hold_d'

In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df[col], train_df[TARGET], test_size = 0.2)

In [24]:
model = xgb.XGBRegressor(n_estimators=1000, max_depth=8, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [25]:
model.fit(X_train, y_train, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=20, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
pred = model.predict(X_valid)

In [27]:
y_valid.values

array([ 5,  7,  1, ..., 15,  9, 34])

In [28]:
np.round(pred)

array([ 5.,  7.,  1., ..., 15.,  9., 34.], dtype=float32)

In [30]:
np.sqrt(mean_squared_error(y_valid.values, np.round(pred)))

0.40208379051122906

## 04. Submission

In [40]:
submission = pd.read_csv(f"{path}"+"sample_submission.csv")

In [42]:
y_pred = model.predict(test_df) 

In [43]:
y_pred

array([ 253.65376  ,  555.8629   ,  229.0863   , ..., 1123.4048   ,
         17.877684 ,    6.9197307], dtype=float32)

In [44]:
result = []
for i in y_pred:
    result.append(i)

In [45]:
result

[253.65376,
 555.8629,
 229.0863,
 386.91357,
 14.817522,
 151.34453,
 164.43169,
 1122.807,
 134.1382,
 845.0236,
 201.53052,
 1112.921,
 770.53424,
 9.837848,
 17.681734,
 5.140926,
 186.22305,
 116.843056,
 455.9402,
 12.731324,
 9.783273,
 1121.4222,
 699.7515,
 27.041754,
 3.095445,
 112.19086,
 112.090355,
 390.5968,
 768.7829,
 455.6685,
 114.96512,
 115.07804,
 1136.0293,
 214.02675,
 19.61697,
 127.800255,
 214.266,
 1141.8577,
 214.3108,
 1125.1442,
 1125.0527,
 716.32886,
 1126.3306,
 1132.0247,
 1125.2418,
 1016.9629,
 1125.2715,
 1126.1339,
 1122.9463,
 1124.9131,
 1129.8514,
 1125.8846,
 230.47643,
 1062.3998,
 1093.63,
 239.77728,
 217.13658,
 1022.1285,
 1020.95154,
 1109.2684,
 123.42528,
 1094.75,
 1091.3258,
 1108.2999,
 944.2114,
 1091.1276,
 1094.7117,
 122.41561,
 200.55888,
 1094.7117,
 1105.9716,
 1108.8269,
 1091.3657,
 1120.8569,
 19.827137,
 249.57256,
 42.871735,
 39.894653,
 244.37044,
 17.80071,
 1.9954343,
 1127.8661,
 1149.5524,
 1128.2268,
 11.790196,
 

In [46]:
submission["hold_d"] = np.round(result)

In [47]:
submission

Unnamed: 0,submit_id,hold_d
0,IDX00001,254.0
1,IDX00002,556.0
2,IDX00003,229.0
3,IDX00004,387.0
4,IDX00005,15.0
...,...,...
70591,IDX70592,38.0
70592,IDX70593,171.0
70593,IDX70594,1123.0
70594,IDX70595,18.0


In [48]:
submission.to_csv("xgboost_baseline.csv", index = False)