## 실행 환경
category-encoders==2.2.2  
fire==0.4.0  
joblib==1.0.1  
keras==2.6.0  
numpy==1.19.5  
pandas==1.3.3  
scikit-learn==0.24.2  
scipy==1.7.1  
sklearn==0.0  
swifter==1.0.9  
tqdm==4.62.3  
xgboost==1.4.2  

In [1]:
import os
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
import pickle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
from category_encoders import TargetEncoder
import swifter

## 데이터 로드
외부 데이터 및 시간이 오래 걸리는 코드 전처리의 경우 메일로 첨부하였습니다.  
해당 실행 파일의 위치에 data 폴더와 external 폴더를 함께 위치 시켜주어야 실행 가능합니다.

In [2]:
data_path = './data/'

In [3]:
external_path = './external/'

In [4]:
train_data_path = data_path + 'train_data(final).csv'
test_data_path =  data_path + 'test_data(final).csv'

In [5]:
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

## 외부 데이터 병합
외부 데이터 병합의 경우 실행 시간 문제로 인하여 미리 전처리 하여 메일로 첨부하였습니다.  
새로운 데이터를 돌릴 경우에만 아래 코드를 실행하면 됩니다.

In [10]:
def mergeAll(df) :
    marcap_df = pd.read_csv(external_path + 'result/mean_marcapdata.csv')
    marcap_df['code'] = marcap_df.apply(lambda x: 'A' + x['code'], axis=1)
    exchange_df = pd.read_csv(external_path + 'result/exchange_rate.csv')
    nsi_df = pd.read_csv(external_path + 'result/nsi_data.csv')
    stk_mean_df = pd.read_csv(external_path + 'result/abs_mean_stockdata.csv')
    
    dtypes={'code':np.str, 'name':np.str,'open':np.int64, 'high':np.int64, 'low':np.int64, 'close':np.int64, 'volume':np.int64, 'amount':np.int64, 'changes':np.int64, 'changecode':np.str, 'changesratio':np.float64, 'marcap':np.int64, 'stocks':np.int64,'marketid':np.str, 'market':np.str, 'dept':np.str, 'rank':np.int64, 'date':np.int64}

    allmarcap_df = pd.read_csv(external_path + 'result/all_marcap_data2016_2020.csv',dtype=dtypes)
    allmarcap_df = allmarcap_df.drop(['name','market','dept','marketid'],axis=1) # 1차 드롭
    allmarcap_df = allmarcap_df.drop(['close','changecode','changes','open','high','low','volume'],axis=1) # finance에서 들어간 것 drop

    allmarcap_df['code'] = allmarcap_df.apply(lambda x : 'A' + x['code'], axis = 1)
    allmarcap_df.columns = ['code','changesratio_d', 'amount_d', 'marcap_d', 'stocks_d', 'rank_d', 'date'] # 컬럼명 변경
    

    stk_mean_df['code'] = stk_mean_df.apply(lambda x : 'A' + x['code'], axis = 1)
    stk_mean_df = stk_mean_df.drop(['sq_abs_change','sq_abs_change100000'], axis=1)
    stk_mean_df.columns = ['code','open_mean', 'high_mean', 'low_mean', 'close_mean', 'volume_mean','abs_change_mean']

    df = pd.merge(left = df, right = stk_mean_df, how='left', left_on='iem_cd', right_on = 'code')
    df = df.drop('code', axis=1)
    
    df = MergeFinanceData(df)
    df = MergeMarcap(df, allmarcap_df)
    df = MergeMeanMarcap(df, marcap_df)
    df = MergeDollar(df, exchange_df)
    df = MergeGDP(df)
    df = MergeNSI(df, nsi_df)
    
    df = df.drop_duplicates()
    df = df.drop(['close_d', 'date', 'high_d', 'low_d'], axis = 1)

    return df

# ==============================================================================================================
# 1. finance stock data - byn_dt 기준으로 open high low close volume change 추가
def MergeFinanceData(process_df):  #
    cd = process_df['iem_cd']
    date = process_df['byn_dt']

    di = external_path + 'crawling/stockdata'
    file_list = os.listdir(di)

    f_list = []
    for i in cd:
        i = i[1:]
        f_name = list(filter(lambda x: i in x, file_list))
        if len(f_name) != 0:
            f_list.append(f_name)
        else:
            f_list.append(None)

    result = pd.DataFrame(columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Change'])
    for i in range(len(f_list)):
        if f_list[i] is not None:
            f = f_list[i][0]
            print(i, f)
            filepath = '{}/{}'.format(di, f)
            df = pd.read_csv(filepath)
            d = str(date[i])
            d = d[:4] + '-' + d[4:6] + '-' + d[6:]
            temp = df[df['Date'] == d]
            temp = temp.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume', 'Change']]
            result = result.append(temp, ignore_index=True)
        else:
            result = result.append({'Open': 0, 'High': 0, 'Low': 0, 'Close': 0, 'Volume': 0, 'Change': 0},
                                   ignore_index=True)

    result.columns = ['open_d', 'high_d', 'low_d', 'close_d', 'volume_d', 'change_d']  # 열이름 변경
    process_df = pd.concat([process_df, result], axis=1)

    return process_df


# ==============================================================================================================
# 2. 그외 marcap mean data 추가 (mean_changesratio,  mean_amount, mean_marcap, mean_stocks, mean_rank)
def MergeMeanMarcap(process_df, marcap_df):
    process_df = pd.merge(left=process_df, right=marcap_df[
        {'code', 'mean_changesratio', 'mean_amount', 'mean_marcap', 'mean_stocks', 'mean_rank'}], how='left',
                          left_on='iem_cd', right_on='code')
    process_df = process_df.drop(['code'], axis=1)  # 중복열 제거

    return process_df


# ==============================================================================================================
# 3. dollar 추가 - byn_dt 기준으로 환율 추가
def MergeDollar(process_df, exchange_df):
    process_df = pd.merge(process_df, exchange_df, how='left', left_on='byn_dt', right_on='date')
    process_df = process_df.drop('date', axis=1)  # 중복열 제거

    return process_df


# ==============================================================================================================
# 4. gdp 추가 - 분기별 데이터 활용, 데이터를 직업 사용
def MergeGDP(process_df):
    conditions = [(process_df['byn_dt'] >= 20201001), (process_df['byn_dt'] >= 20200701),
                  (process_df['byn_dt'] >= 20200401), (process_df['byn_dt'] >= 20200101),
                  (process_df['byn_dt'] >= 20191001), (process_df['byn_dt'] >= 20190701),
                  (process_df['byn_dt'] >= 20190401), (process_df['byn_dt'] >= 20190101),
                  (process_df['byn_dt'] >= 20181001), (process_df['byn_dt'] >= 20180701),
                  (process_df['byn_dt'] >= 20180401), (process_df['byn_dt'] >= 20180101),
                  (process_df['byn_dt'] >= 20171001), (process_df['byn_dt'] >= 20170701),
                  (process_df['byn_dt'] >= 20170401), (process_df['byn_dt'] >= 20170101),
                  (process_df['byn_dt'] >= 20161001), (process_df['byn_dt'] >= 20160701),
                  (process_df['byn_dt'] >= 20160401), (process_df['byn_dt'] >= 20160101), ]
    choices1 = [1.1, 2.2, -3.2, -1.3, 2.3, 2.0, 2.1, 1.8, 3.1, 2.4, 3.1, 3.0,
                2.9, 3.9, 2.7, 3.1, 2.6, 2.8, 3.6, 2.8]
    process_df['gdp_rate'] = np.select(conditions, choices1, default=0)

    choices2 = [492100.10, 491181.50, 472328.10, 458202.40, 497064.80, 487177.00, 479907.00,
                454891.00, 489800.50, 485535.70, 473018.30, 449838.10, 475092.90, 473436.20,
                454141.70, 433027.30, 453412.40, 439421.60, 434462.90, 413482.60]
    process_df['gdp'] = np.select(conditions, choices2, default=0)

    return process_df


# ==============================================================================================================
# 5. nsi 추가
def MergeNSI(process_df, nsi_df):
    process_df = pd.merge(left=process_df, right=nsi_df, how='left', left_on='byn_dt', right_on='date')
    process_df.drop(['date'], axis=1)

    return process_df


# ==============================================================================================================
# 6. marcap_d추가
def MergeMarcap(process_df, allmarcap_df):
    process_df = pd.merge(left=process_df, right=allmarcap_df, how='left', left_on=['iem_cd', 'byn_dt'],
                          right_on=['code', 'date'])
    process_df = process_df.fillna(0)
    process_df = process_df.drop(['code', 'date'], axis=1)
    process_df = process_df.drop_duplicates()

    return process_df




In [6]:
#train_df = mergeAll(train_df)
#test_df = mergeAll(test_df)

## 파일 전처리 코드

정확한 날짜 계산을 위하여 공휴일 정보 불러오기

In [8]:
def get_holiday():
    holiday = pd.read_csv(external_path +'result/holiday_date.csv')

    holidays = pd.to_datetime(holiday['date'], infer_datetime_format=True).dt.strftime("%Y-%m-%d")
    holidays = list(holidays)
    for i in range(len(holidays)):
        tmp = holidays[i].split('-')
        holidays[i] = ''.join(tmp)
    holidays = list(map(int, holidays))
    return holidays

In [9]:
holidays = get_holiday()

In [10]:
def working_day(s_dt, e_dt) : # 공휴일 제외 # 공휴일이 주말인 경우 고려 
    count = 0
    for day in holidays:
        if day >= s_dt and day <= e_dt:
                count += 1
        elif day > e_dt:
                break
        else:
                continue
    s_dt = str(s_dt).split('.')[0]
    e_dt = str(e_dt).split('.')[0]
    s_dt = s_dt[:4] + '-' + s_dt[4:6] + '-' + s_dt[6:]
    e_dt = e_dt[:4] + '-' + e_dt[4:6] + '-' + e_dt[6:]
    w_day = np.busday_count(s_dt, e_dt, weekmask='1111100') # 주말제외

    result = w_day - count
    return result

학습 데이터에서 hist_d를 계산하기 위해서 6개월 단위로 계산

In [11]:
def to_integer(dt_time):
    return 10000*dt_time.year + 100*dt_time.month + dt_time.day

In [12]:
def hist_d_half(buy, hold_d) :
    buy_d = str(buy)
    buy_d = datetime(int(buy_d[:4]), int(buy_d[4:6]), int(buy_d[6:]))
    sell = to_integer(buy_d+timedelta(days=hold_d))
    base = [20160101, 20160630, 20170102, 20170630, 20180102, 20180630, 
            20190102, 20190630, 20200102, 20200630, 20210101]

    for i in range(len(base)) :
        if sell < base[i] :
            if buy > base[i-1] :
                return hold_d*0.6
                #return working_day(buy, base[i])
            else :
                return working_day(buy, base[i-1])


종목에 따른 타겟 인코딩

In [13]:
def target_encoding(df, train) :
    if train : 
        target_encoder = TargetEncoder()
        target_encoder.fit(df['iem_cd'], df['hold_d'])
        df['iem_cd_te'] = target_encoder.transform(df['iem_cd'])
        with open('./data/preprocessing/' + 'iem_te.pkl', 'wb') as f:
            pickle.dump(target_encoder, f)
    else :
        with open('./data/preprocessing/' + 'iem_te.pkl', 'rb') as f:
            target_encoder = pickle.load(f)
        df['iem_cd_te'] = target_encoder.transform(df['iem_cd'])
    return df

학습 데이터의 주식 매매 날짜 기준 past_d 계산

In [14]:
def cal_past_d(df) : # train data
    df['past_d'] = 0
    df = df.sort_values(by = ['act_id', 'iem_cd', 'byn_dt'], axis = 0).reset_index(drop=True)
    iem = ''
    cnt = 1
    print('past_d calculating...')
    for idx, row in df.iterrows() :
        if row['iem_cd'] != iem :
            iem = row['iem_cd']
            cnt = 1
        else :
            df.at[idx, 'past_d'] = (df.iloc[idx-1]['past_d']*(cnt-1) + df.iloc[idx-1]['hold_d']) / cnt
            cnt += 1
    return df

In [15]:
def get_test_past_d(hist, cus, test_df):
    hist = hist
    cus = cus
    test_df = test_df[['act_id', 'iem_cd']]
    
    hist = pd.merge(hist, cus[['act_id', 'ivs_icn_cd']], how='left', on = 'act_id')
    
    df = hist[['act_id', 'iem_cd']]
    id_cd_count = df.groupby('act_id')['iem_cd'].count().reset_index(name='count')
    id_cd_list = df.groupby('act_id')['iem_cd'].apply(list).reset_index(name='iem_cd_list')

    for i in range(9992):
        cd_list = id_cd_list['iem_cd_list'][i]
        cd_set = set(cd_list)
        count = len(cd_set)
        id_cd_list['iem_cd_list'][i] = cd_set

    id_cd_list
    
    # 계좌번호 / 종목 / 날짜 별 보유기간 추가
    for i in range(len(id_cd_list['act_id'])):
        if i == 0:
            act_id = id_cd_list['act_id'][i]
            df1 = cal_duration(act_id, hist)
        else:
            act_id = id_cd_list['act_id'][i]
            df2 = cal_duration(act_id, hist)
            df1 = pd.concat([df1, df2], axis = 0)
    df2 = df1.reset_index(drop = True)
    df3 = df2[(df2['past_d'] != 0)]
    df4 = df3.groupby(['act_id', 'iem_cd'])['past_d'].apply(list).reset_index(name='past_d_list')
    df5 = df3.groupby(['act_id', 'iem_cd'])['past_d'].mean().reset_index(name='act_past_d_mean')
    past_d_mean = df5
    past_d_mean = past_d_mean.rename(columns = {'act_past_d_mean':'past_d'})
    test_df3 = pd.merge(test_df, past_d_mean, how = 'left', on = ['act_id', 'iem_cd'])
    
    test_df3.to_csv(data_path + 'preprocessing/test_past_d_mean.csv')
    return test_df3



테스트 데이터에서 past_d 구하기 (stk_hist 기준 모든 과거 데이터)

In [16]:
def stk_past_d(df) : # test data
    if os.path.exists(data_path + 'preprocessing/test_past_d_mean.csv') :
        print("test_past_d_mean file is already.")
        test_past_df = pd.read_csv(data_path + 'preprocessing/test_past_d_mean.csv')
    else :
        stk_df = pd.read_csv(data_path +'stk_bnc_hist.csv')
        cus_df = pd.read_csv(data_path + 'cus_info.csv')
        stk_df = stk_df.sort_values(by=['bse_dt'])
        test_past_df = get_test_past_d(stk_df, cus_df, df)
    df = pd.merge(df, test_past_df, how='left', on = ['act_id', 'iem_cd'])
    return df

In [17]:
def cal_duration(act_id, hist):
    df = hist[(hist['act_id'] == act_id)][['act_id', 'iem_cd', 'bse_dt', 'bnc_qty', 'tot_aet_amt', 'ivs_icn_cd']]
    sorted_df = df.sort_values(by=['iem_cd', 'bse_dt'], ascending = [True, True])
    sorted_df = sorted_df.reset_index(drop = True)
    sorted_df['past_d'] = 0
    cur_dur = {}
    #     dur_list = []
    for i in range(len(sorted_df)):    
        # 위에서부터 하나씩 읽어가는데 딕셔너리에 종목코드가 존재하지 않는다면 해당 날짜 추가
        if sorted_df['iem_cd'][i] not in cur_dur:
            if sorted_df['bnc_qty'][i] == 0 :
                sell = sorted_df['bse_dt'][i]
                buy = 20160102
                sorted_df['past_d'][i] = working_day(buy, sell)
            cur_dur[sorted_df['iem_cd'][i]] = sorted_df['bse_dt'][i]
            
        # 이미 존재하면 해당 잔고가 0인지 확인
        else :
            # 일단 보유기간은 사고/팔고 개념이 있으니깐 팔지 않은(0이 아닌) 주식은 계산 X
            # 해당 잔고가 0이면 그 날짜와 딕셔너리에 존재하는 날짜를 사용해 보유 기간 계산
            # 어떤 종목을 다 팔고 다시 사고/팔았을 경우도 생각
            if sorted_df['bnc_qty'][i] == 0:
                sell = sorted_df['bse_dt'][i]
                buy = cur_dur[sorted_df['iem_cd'][i]]
                dur = working_day(buy, sell)
                sorted_df['past_d'][i] = dur 
                # 다 팔았으니깐 딕셔너리에서 해당 종목코드 삭제(나중에 다시 살 경우 대비)
                del cur_dur[sorted_df['iem_cd'][i]]
            # 0이 없다: 다 팔지 않음 ==> 일단 보유기간 측정 안하는 걸로.
            else:
                continue
    last_day = 20201231
    for i in cur_dur.items() :
        past_d = working_day(i[1], last_day)
        sorted_df = sorted_df.append({'act_id' : act_id , 'iem_cd' : i[0], 'bse_dt' : last_day, 'bnc_qty' : 0, 'tot_aet_amt' : 0, 'past_d' : past_d} , ignore_index=True)
        
    return sorted_df



ivs 코드에 따른 종목 past_d 평균 데이터 

In [18]:
def cal_hist() : # for ivs_past_d
    hist = pd.read_csv(data_path + 'stk_bnc_hist.csv')
    cus = pd.read_csv(data_path + 'cus_info.csv')
    hist = pd.merge(hist, cus[['act_id', 'ivs_icn_cd']], how='left', on = 'act_id')
    ivs_lst = sorted(hist['ivs_icn_cd'].unique())
    df_lst = [0] * len(ivs_lst)
    for i in range(len(ivs_lst)) :
        ivs_code = ivs_lst[i]
        df = hist[hist['ivs_icn_cd'] == ivs_code]
        id_list = df['act_id'].unique()
        for j in range(len(id_list)):
            if j == 0:
                act_id = id_list[j]
                tmp_df1 = cal_duration(act_id, hist)
            else:
                act_id = id_list[j]
                tmp_df2 = cal_duration(act_id, hist)
                tmp_df1 = pd.concat([tmp_df1, tmp_df2], axis = 0)
        tmp_df1.reset_index(drop = True)
        df_lst[i] = tmp_df1
    return df_lst

In [19]:
def chk_ivs_past_d() :
    if os.path.exists('./data/preprocessing/ivs_past_d_mean.csv') :
        print("ivs_past_d_mean file is already.")
        return
    print("make ivs_past_d mean file")
    ivs_lst, df_lst = cal_hist()
    for i in range(len(ivs_lst)) :
        if i == 0 :
            df1 = df_lst[i].reset_index(drop=True)
            df1 = df1[(df1['past_d']) != 0]
            df1 = df1.groupby(['iem_cd'])['past_d'].mean().reset_index(name='ivs_past_d_mean')
            df1['ivs_icn_cd'] = ivs_lst[i]
        else :
            df2 = df_lst[i].reset_index(drop=True)
            df2 = df2[(df2['past_d']) != 0]
            df2 = df2.groupby(['iem_cd'])['past_d'].mean().reset_index(name='ivs_past_d_mean')
            df2['ivs_icn_cd'] = ivs_lst[i]
            df1 = pd.concat([df1, df2], axis = 0)
    df1.to_csv(data_path + 'preprocessing/' + 'ivs_past_d_mean.csv', index= False)



In [20]:
def apply_ivs_past_d(df) :
    chk_ivs_past_d()
    ivs_past_df = pd.read_csv(data_path + 'preprocessing/' + 'ivs_past_d_mean.csv')
    df = pd.merge(df, ivs_past_df, how = 'left', on = ['iem_cd', 'ivs_icn_cd'])
    return df

stk hist로부터 주식 관련 정보 로드

In [21]:
def bnc_count(hist):
    hist = hist
    tmp = hist.groupby(['act_id', 'iem_cd'])['bse_dt'].count().reset_index(name='bnc_count')
    tmp = pd.merge(hist, tmp, how = 'left', on = ['act_id', 'iem_cd'])
    return tmp

def qty_change(hist):
    hist = hist
    sorted_hist = hist.sort_values(by=['iem_cd', 'bse_dt'], ascending = [True, True])
    sorted_hist = sorted_hist.reset_index(drop=True)
    sorted_hist = sorted_hist.groupby(['act_id', 'iem_cd'])['bnc_qty'].apply(list).reset_index(name = 'qty_list')
    sorted_hist['qty_change'] = 0
    
    for i in range(len(sorted_hist)):
        qty_change = []
        tmp = sorted_hist['qty_list'][i]

        for j in range(len(tmp)-1):
            change = abs(tmp[j+1] - tmp[j])
            qty_change.append(change)

        qty_change = np.array(qty_change)
        qty_mean = np.mean(qty_change)
        sorted_hist['qty_change'][i] = qty_mean
        
    sorted_hist = sorted_hist.drop(['qty_list'], axis = 1)
    
    # stk_bnc_hist 데이터와 merge
    new_hist = pd.merge(hist, sorted_hist, how = 'left', on = ['act_id', 'iem_cd'])
    
    
    return new_hist

def aet_amt_change(hist):
    hist = hist
    sorted_hist = hist.sort_values(by=['iem_cd', 'bse_dt'], ascending = [True, True])
    sorted_hist = sorted_hist.reset_index(drop=True)
    sorted_hist = sorted_hist.groupby(['act_id', 'iem_cd'])['tot_aet_amt'].apply(list).reset_index(name = 'aet_amt_list')
    sorted_hist['aet_amt_change'] = 0
    
    for i in range(len(sorted_hist)):
        aet_change = []
        tmp = sorted_hist['aet_amt_list'][i]
        for j in range(len(tmp)-1):
            change = abs(tmp[j+1] - tmp[j])
            aet_change.append(change)
        aet_change = np.array(aet_change)
        aet_mean = np.mean(aet_change)

        sorted_hist['aet_amt_change'][i] = aet_mean
        
    sorted_hist = sorted_hist.drop(['aet_amt_list'], axis = 1)
    
    # stk_bnc_hist 데이터와 merge
    new_hist = pd.merge(hist, sorted_hist, how = 'left', on = ['act_id', 'iem_cd'])
    
    return new_hist

def end_price(hist):
    new_hist = hist
    new_hist['end_price'] = 0
    
    for i in range(len(new_hist)):
        tot = new_hist['tot_aet_amt'][i]
        qty = new_hist['bnc_qty'][i]
        end_price = tot // qty
        new_hist['end_price'][i] = end_price
    
    return new_hist

def get_from_stk(df) :
    hist = pd.read_csv(data_path + 'stk_bnc_hist.csv')
    new_hist = bnc_count(hist)
    new_hist = qty_change(new_hist)
    new_hist = aet_amt_change(new_hist)
    new_hist = end_price(new_hist)
    df = pd.merge(df, new_hist, how='left', left_on = ['act_id', 'iem_cd', 'byn_dt'], right_on = ['act_id', 'iem_cd', 'bse_dt'])
    return df

## 전처리

In [22]:
def preprocessing(df, train) :
    print('*********preprocessing...')

    #cus info merge
    print("*********cus info merge..")
    cus_df = pd.read_csv(data_path +'cus_info.csv')
    df = pd.merge(left = df, right = cus_df, how='left', on='act_id')
    iem_df = pd.read_csv(data_path +'iem_info_20210902.csv')
    df = pd.merge(df, iem_df, how='left', on='iem_cd')

    #iem target encoding
    print("*********iem target encoding..")
    df = target_encoding(df, train)

    # from stk hist
    print("*********from stk hist..(bnc_count, qty_change, aet_amt_change, end_price")
    df = get_from_stk(df)

    #ivs_past_d 계산
    print("*********calculating ivs past_d..")
    df = apply_ivs_past_d(df) 

    # 4. past_d 계산
    print("*********calculating total past_d..")
    if train :
        df = cal_past_d(df)
        df['past_d'] = df.apply(lambda x : x['ivs_past_d_mean'] if x['past_d'] == 0 else x['past_d'], axis = 1)
    else :
        df = stk_past_d(df)

    # 5. hist_d 계산
    if train :
        print("*********calculating train hist_d..")
        df['hist_d'] = df.apply(lambda x : hist_d_half(x['byn_dt'], x['hold_d']), axis = 1)

    print("**preprocessing successfully")

    df['bnc_count'] = df['bnc_count'] + 1
    return df

In [23]:
train_df = preprocessing(train_df, True)

*********preprocessing...
*********cus info merge..
*********iem target encoding..
*********from stk hist..(bnc_count, qty_change, aet_amt_change, end_price


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


*********calculating ivs past_d..
ivs_past_d_mean file is already.
*********calculating total past_d..
past_d calculating...
*********calculating train hist_d..
**preprocessing successfully


In [24]:
test_df = preprocessing(test_df, False)

*********preprocessing...
*********cus info merge..
*********iem target encoding..
*********from stk hist..(bnc_count, qty_change, aet_amt_change, end_price


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


*********calculating ivs past_d..
ivs_past_d_mean file is already.
*********calculating total past_d..
test_past_d_mean file is already.
**preprocessing successfully


## 학습하기

In [25]:
drop_features = ['act_id', 'iem_cd', 'bse_dt', 'iem_krl_nm']
used_features = ['byn_dt', 'high_mean', 'change_d', 'nsi', 'mean_amount',
       'volume_mean', 'mean_rank', 'volume_d', 'abs_change_mean', 'close_mean',
       'open_d', 'mean_stocks', 'mean_changesratio', 'open_mean', 'dollar',
       'gdp_rate', 'mean_marcap', 'low_mean', 'changesratio_d', 'amount_d',
       'marcap_d', 'stocks_d', 'rank_d', 'gdp', 'sex_dit_cd', 'cus_age_stn_cd',
       'ivs_icn_cd', 'cus_aet_stn_cd', 'mrz_pdt_tp_sgm_cd', 'lsg_sgm_cd',
       'tco_cus_grd_cd', 'tot_ivs_te_sgm_cd', 'mrz_btp_dit_cd', 'btp_cfc_cd',
       'mkt_pr_tal_scl_tp_cd', 'stk_dit_cd', 'iem_cd_te', 'bnc_qty',
       'tot_aet_amt', 'stk_par_pr', 'bnc_count', 'qty_change',
       'aet_amt_change', 'end_price', 'ivs_past_d_mean', 'past_d', 'hist_d']

In [26]:
train_df.columns

Index(['act_id', 'iem_cd', 'byn_dt', 'hold_d', 'high_mean', 'change_d', 'nsi',
       'mean_amount', 'volume_mean', 'mean_rank', 'volume_d',
       'abs_change_mean', 'close_mean', 'open_d', 'mean_stocks',
       'mean_changesratio', 'open_mean', 'dollar', 'gdp_rate', 'mean_marcap',
       'low_mean', 'changesratio_d', 'amount_d', 'marcap_d', 'stocks_d',
       'rank_d', 'gdp', 'sex_dit_cd', 'cus_age_stn_cd', 'ivs_icn_cd',
       'cus_aet_stn_cd', 'mrz_pdt_tp_sgm_cd', 'lsg_sgm_cd', 'tco_cus_grd_cd',
       'tot_ivs_te_sgm_cd', 'mrz_btp_dit_cd', 'iem_krl_nm', 'btp_cfc_cd',
       'mkt_pr_tal_scl_tp_cd', 'stk_dit_cd', 'iem_cd_te', 'bse_dt', 'bnc_qty',
       'tot_aet_amt', 'stk_par_pr', 'bnc_count', 'qty_change',
       'aet_amt_change', 'end_price', 'ivs_past_d_mean', 'past_d', 'hist_d'],
      dtype='object')

In [34]:
train_df = train_df.drop(drop_features, axis = 1)
train_df = train_df.fillna(0)
col = train_df.columns.drop('hold_d')

X_train, X_valid, y_train, y_valid = train_test_split(train_df[col], train_df['hold_d'], test_size = 0.2)

print("**************")
model = xgb.XGBRegressor(eta=0.01, gamma=0.5, max_depth=5, min_child_weight=1, n_estimators=1000, subsample=0.7)

print("===Training....===")
model.fit(X_train, y_train, verbose=True)
print("===Success===")
model.save_model('xgboost_final.json')
print("===model saved..====")

===Training....===
===Success===
===model saved..====


In [35]:
test_df = test_df[used_features]
test_df = test_df.fillna(0)

print("===Predict...===")
y_pred = model.predict(test_df)
print("===Success===")
test_df['y_pred'] = y_pred

===Predict...===
===Success===


예측 값 범위 조정 (예측 값이 hist_d + 146을 초과하지 않고, 0보다 작은 값이 나오지 않도록)

In [36]:
y_pred = test_df.apply(lambda x : x['y_pred'] if x['y_pred'] <= x['hist_d']+146 else x['hist_d']+146, axis = 1)
result = []
for i in y_pred:
    if i < 0 :
        i = 1
    result.append(i)

In [37]:
result

[260.8539733886719,
 481.0,
 226.87863159179688,
 353.1401062011719,
 15.564187049865723,
 163.47300720214844,
 182.1115264892578,
 866.0,
 147.57232666015625,
 656.0,
 215.48802185058594,
 1118.005615234375,
 613.0,
 10.682504653930664,
 34.16969299316406,
 8.857787132263184,
 196.98858642578125,
 120.27728271484375,
 426.0,
 23.530668258666992,
 19.666099548339844,
 1066.3857421875,
 565.0,
 38.149513244628906,
 11.692965507507324,
 115.89386749267578,
 115.53958129882812,
 351.9452209472656,
 612.0,
 426.0,
 118.84575653076172,
 118.13790893554688,
 1002.0,
 235.3580322265625,
 25.224040985107422,
 136.1689453125,
 232.79779052734375,
 1121.6318359375,
 225.8840789794922,
 1067.05126953125,
 1092.4423828125,
 581.0,
 1095.6715087890625,
 1112.2952880859375,
 1052.7152099609375,
 753.0,
 1077.8341064453125,
 1114.8197021484375,
 1092.3258056640625,
 1084.60009765625,
 1107.4056396484375,
 1086.147216796875,
 235.78358459472656,
 792.0,
 815.0,
 252.84942626953125,
 232.87564086914062

## 제출 파일 만들기

In [38]:
submission = pd.read_csv(data_path+"sample_submission.csv")

In [39]:
submission["hold_d"] = np.round(result)
submission.to_csv("stk_hld_test.csv", index = False)