In [93]:
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [63]:
date_start = datetime(2017,10,1)
date_end = datetime(2022,10,10)
# date_end = datetime(2022,3,31)
code_coin = 'KRW-BTC'

def define_data ( start , end , code, timeframe = 'daily', time_start = 9 ) :
    if timeframe == 'daily':
        if time_start == 9:
            data_all = pd.read_pickle("./data/data_coin_daily.pkl")
            data_all = data_all[data_all.coin == code]
            data_all = data_all[(data_all.date >= start)&(data_all.date <= end)]
            data_all = data_all[['open','high','low','close','value','volume','date']]
        else:
            data_hourly = pd.read_pickle('./data/data_coin_hourly.pkl')
            data_hourly = data_hourly[data_hourly.coin == code]
            start_datetime = start.replace(hour=time_start)
            data_hourly = data_hourly[data_hourly.date >= start_datetime]
            data_hourly.index = data_hourly.date
            dict_ohlcv = {'open':'first','high':'max','low':'min','close':'last','value':'sum','volume':'sum','date':'first'}
            data_all = data_hourly.resample('24H',origin=start_datetime).apply(dict_ohlcv)
            data_all = data_all[(data_all.date >= start)&(data_all.date <= end)]
    elif timeframe == 'hourly':
        data_all = pd.read_pickle('./data/data_coin_hourly.pkl')
        data_all = data_all[data_all.coin == code]
        data_all = data_all[(data_all.date >= start)&(data_all.date <= end)]
    else:
        print("Timeframe 확인")
    data_all['return'] = np.log10(data_all.close / data_all.close.shift(1))
    return data_all.reset_index(drop=True)
data_raw_temp = define_data(date_start, date_end, code_coin, 'daily', time_start=9)

In [73]:
def make_ml_dataset(data_raw):
    data_raw['target'] = np.where(data_raw['return'] > np.log10(1), 1, 0)

    data_raw['rtn_l1'] = data_raw['return'].rolling(5).sum().shift(1)
    data_raw['rtn_l2'] = data_raw['return'].rolling(10).sum().shift(1)
    data_raw['rtn_l3'] = data_raw['return'].rolling(20).sum().shift(1)

    data_raw['u_noise'] = 1- np.abs(data_raw['high']-data_raw['close'])/(data_raw['high']-data_raw['low'])
    data_raw['d_noise'] = 1- np.abs(data_raw['close']-data_raw['low'])/(data_raw['high']-data_raw['low'])

    data_raw['u_noise_l1'] = data_raw['u_noise'].rolling(5).mean().shift(1)
    data_raw['d_noise_l1'] = data_raw['d_noise'].rolling(5).mean().shift(1)

    data_raw['value_cng'] = data_raw['value'].pct_change()
    data_raw['value_cng_l1'] = data_raw['value_cng'].rolling(5).mean().shift(1)

    data_raw.dropna(inplace=True)
    ml_dataset = data_raw[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1','value_cng_l1']].to_numpy()
    target = data_raw['target'].to_numpy()
    return data_raw
#     return ml_dataset, target

In [68]:
data_raw = pd.DataFrame()
for i in range(24):
    data_raw_temp = define_data(date_start, date_end, code_coin, 'daily', time_start=i)
    data_raw_temp = make_ml_dataset(data_raw_temp)
    data_raw = pd.concat([data_raw,data_raw_temp],axis=0)

In [72]:
data_raw

Unnamed: 0,open,high,low,close,value,volume,date,return,target,rtn_l1,rtn_l2,rtn_l3,u_noise,d_noise,u_noise_l1,d_noise_l1,value_cng,value_cng_l1
61,10777000.0,12680000.0,10350000.0,12264000.0,4.270351e+11,36564.197420,2017-12-01 00:00:00,0.059248,1,0.053355,0.146006,0.340430,0.821459,0.178541,0.668069,0.331931,-0.188571,0.378964
62,12278000.0,13180000.0,12081000.0,12640000.0,3.772563e+11,29567.672795,2017-12-02 00:00:00,0.013115,1,0.083795,0.225131,0.384653,0.508644,0.491356,0.664831,0.335169,-0.116568,0.212586
63,12640000.0,13487000.0,12538000.0,13304000.0,3.320132e+11,25549.344861,2017-12-03 00:00:00,0.022235,1,0.060118,0.237533,0.411020,0.807165,0.192835,0.609368,0.390632,-0.119927,0.115350
64,13304000.0,13560000.0,12850000.0,13432000.0,3.259701e+11,24396.970990,2017-12-04 00:00:00,0.004158,1,0.058104,0.250255,0.434940,0.819718,0.180282,0.625054,0.374946,-0.018201,0.068808
65,13433000.0,14370000.0,13307000.0,14222000.0,3.571227e+11,25680.120492,2017-12-05 00:00:00,0.024820,1,0.005466,0.252732,0.433185,0.860771,0.139229,0.632198,0.367802,0.095569,-0.078327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,28006000.0,28006000.0,26216000.0,27111000.0,2.471417e+11,9182.809132,2022-09-18 23:00:00,-0.014075,0,-0.021954,0.005525,-0.048130,0.500000,0.500000,0.422937,0.577063,2.268901,-0.247501
1814,27111000.0,27732000.0,26720000.0,26855000.0,1.667403e+11,6115.305205,2022-09-19 23:00:00,-0.004120,0,-0.020921,-0.012194,-0.044280,0.133399,0.866601,0.472301,0.527699,-0.325325,0.208043
1815,26859000.0,27283000.0,26582000.0,27217000.0,1.251117e+11,4653.851171,2022-09-20 23:00:00,0.005815,1,-0.022424,-0.014676,-0.060680,0.905849,0.094151,0.361772,0.638228,-0.249661,0.211706
1816,27217000.0,27974000.0,26199000.0,26951000.0,2.853706e+11,10570.073876,2022-09-21 23:00:00,-0.004265,0,-0.007216,-0.003987,-0.036642,0.423662,0.576338,0.525610,0.474390,1.280926,0.201509


In [50]:
# data_raw.dropna(inplace=True)
# dict_ohlcv = {'open':'first','high':'max','low':'min','close':'last','value':'sum','return':'sum'}
data_raw[['open']] = data_raw[['open']].shift(23)
data_raw[['high']] = data_raw[['high']].rolling(24).max()
data_raw[['low']] = data_raw[['low']].rolling(24).min()
data_raw[['value']] = data_raw[['value']].rolling(24).sum()
data_raw[['return']] = data_raw[['return']].rolling(24).sum()
data_raw[['open','high','low','close','value','return']]

Unnamed: 0,open,high,low,close,value,return
0,,,,4804000.0,,
1,,,,4790000.0,,
2,,,,4790000.0,,
3,,,,4806000.0,,
4,,,,4830000.0,,
...,...,...,...,...,...,...
43571,26523000.0,27698000.0,26478000.0,27572000.0,1.614473e+11,0.016600
43572,26500000.0,27698000.0,26479000.0,27495000.0,1.603059e+11,0.016008
43573,26638000.0,27698000.0,26600000.0,27567000.0,1.605339e+11,0.014855
43574,26890000.0,27698000.0,26650000.0,27483000.0,1.596177e+11,0.009473


In [6]:
data_raw['value'].pct_change()`

0            NaN
1      -0.822377
2       0.455448
3       1.569855
4      -0.267958
          ...   
1783   -0.464833
1784    0.459703
1785    0.081527
1786   -0.404230
1787   -0.923321
Name: value, Length: 1788, dtype: float64

In [80]:
data_raw.sort_values(by=['date'], ascending=True, inplace=True)

data_raw_train = pd.DataFrame()
data_raw_test = pd.DataFrame()

# for i in range(5):
#     df_train = data_raw.iloc[i*8430 : i*8430+6360]
#     df_test = data_raw.iloc[i*8430+6360 : i*8430+(6360+1590)]
#     df_pass = data_raw.iloc[i*8430+(6360+1590) : (i+1)*8430]
#     data_raw_train = pd.concat([data_raw_train, df_train], axis=0)
#     data_raw_test = pd.concat([data_raw_test, df_test], axis=0)

for i in range(10):
    df_train = data_raw.iloc[i*4220 : i*4220+2990]
    df_test = data_raw.iloc[i*4220 +2990: i*4220 +(2990+750)]
    df_pass = data_raw.iloc[i*4220 +(2990+750) : (i+1)*4220 ]
    data_raw_train = pd.concat([data_raw_train, df_train], axis=0)
    data_raw_test = pd.concat([data_raw_test, df_test], axis=0)


data_raw_train_mix = np.random.permutation(data_raw_train[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1', 'value_cng_l1','target']].to_numpy())
data_raw_test_mix = np.random.permutation(data_raw_test[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1', 'value_cng_l1','target']].to_numpy())

train_input  = data_raw_train[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1', 'value_cng_l1']].to_numpy()
test_input  = data_raw_test[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1', 'value_cng_l1']].to_numpy()
train_target = data_raw_train['target'].to_numpy()
test_target = data_raw_test['target'].to_numpy()


In [69]:
# data_raw['target'] = np.where(data_raw['return'] > 0.007987,3,np.where(data_raw['rtn'] > 0.000707, 2,np.where(data_raw['rtn'] > -0.006008, 1, 0)))
data_raw['target'] = np.where(data_raw['return'] > 0, 1, 0)
print('상승비율 : ', np.mean(data_raw['target']))

# data_raw['rtn_l1'] = data_raw['return'].rolling(5).sum().shift(1)
# data_raw['rtn_l2'] = data_raw['return'].rolling(20).sum().shift(1)
# data_raw['rtn_l3'] = data_raw['return'].rolling(60).sum().shift(1)

# # data_raw['noise'] = 1- np.abs(data_raw['Close']-data_raw['Open'])/(data_raw['High']-data_raw['Low'])
# data_raw['u_noise'] = 1- np.abs(data_raw['high']-data_raw['close'])/(data_raw['high']-data_raw['low'])
# data_raw['d_noise'] = 1- np.abs(data_raw['close']-data_raw['low'])/(data_raw['high']-data_raw['low'])

# data_raw['u_noise_l1'] = data_raw['u_noise'].rolling(5).mean().shift(1)
# data_raw['d_noise_l1'] = data_raw['d_noise'].rolling(5).mean().shift(1)

# data_raw['value_cng'] = data_raw['value'].pct_change()
# data_raw['value_cng_l1'] = data_raw['value_cng'].rolling(5).mean().shift(1)

# data_raw.drop(['Open','Close','High','Low'], axis='columns', inplace=True)
data_raw.dropna(inplace=True)

# data = data_raw.drop(['target'], axis='columns').to_numpy()
data = data_raw[['rtn_l1', 'rtn_l2', 'rtn_l3', 'u_noise_l1', 'd_noise_l1', 'value_cng_l1']].to_numpy()
target = data_raw['target'].to_numpy()


train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, shuffle=True, stratify=target, random_state=42)

print('train상승비율 : ', np.mean(train_target))
print('test상승비율 : ', np.mean(test_target))

dt = DecisionTreeClassifier(random_state = 42)

dt.fit(train_input, train_target)
pred = dt.predict(test_input)
print(roc_auc_score(test_target, pred))
print(dt.score(train_input, train_target))
print(dt.score(test_input, test_target))

print(dt.feature_importances_)

상승비율 :  0.5181520501138952
train상승비율 :  0.5181373275989916
test상승비율 :  0.5182109384268596
0.7406282387275369
1.0
0.7408945307865702
[0.18722666 0.228549   0.26691402 0.06733873 0.06173333 0.18823825]


In [17]:
clf.score(train_input, train_target)

0.6714178544636159

In [95]:
from scipy.stats import uniform, randint
# 'bootstrap' : [True],
params = {'n_estimators' : range(10,200,10), 
          'max_depth' : randint(10, 30), 
          'min_samples_leaf' : randint(1, 25), 
          'min_samples_split' : randint(2, 25), 
          'max_features' : [4] }

from sklearn.model_selection import RandomizedSearchCV

my_cv = TimeSeriesSplit(n_splits=5).split(train_input)

# clf = RandomizedSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1, n_iter=100)
clf = RandomizedSearchCV(GradientBoostingClassifier(), params, cv=my_cv, n_jobs=-1, n_iter=100)
clf.fit(train_input, train_target)

print(clf.best_params_)
print(clf.best_score_)

pred_con = clf.predict(test_input)
print(accuracy_score(test_target, pred_con))
print(roc_auc_score(test_target, pred_con))


{'max_depth': 11, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 18, 'n_estimators': 60}
0.49889624724061815
0.4765333333333333
0.47658429752465825


In [90]:
pred_con_inverse = np.where(pred_con == 0, 1, 0)

In [92]:
sum(np.array(data_raw_test['return']))

0.6138371044013343

In [91]:
sum(np.array(data_raw_test['return']) * pred_con_inverse)

7.483575499592382

In [52]:
n_estimators = range(10,200,10)
params = {'bootstrap' : [True], 
          'n_estimators' : n_estimators, 
          'max_depth' : range(4,13,2), 
          'min_samples_leaf' : range(2, 6, 1), 
          'min_samples_split' : range(2, 11, 2), 
          'max_features' : [4] }

my_cv = TimeSeriesSplit(n_splits=5).split(train_input)

clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)
# clf = GridSearchCV(GradientBoostingClassifier(), params, cv=my_cv, n_jobs=-1)
clf.fit(train_input, train_target)

print(clf.best_params_)
print(clf.best_score_)

pred_con = clf.predict(test_input)
print(accuracy_score(test_target, pred_con))
print(roc_auc_score(test_target, pred_con))



{'bootstrap': True, 'max_depth': 6, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 180}
0.8562607795791652
0.8634325784572939
0.8627968060349986


In [82]:
import joblib
# 학습시킨 모델을 현재 경로에 knn_model.pkl 파일로 저장합니다.
joblib.dump(clf.best_estimator_, './model/model4.pkl')


['./model/model4.pkl']

In [30]:
loaded_model = joblib.load('./model/model2.pkl')
score = loaded_model.score(test_input, test_target)
score

0.5269461077844312

In [53]:
clf.best_estimator_.feature_importances_

array([0.62187394, 0.00864812, 0.00484978, 0.16550109, 0.18621364,
       0.01291343])