In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import keras
from keras.models import *
from keras.layers import *
import tensorflow as tf
# Ask TensorFlow to grow its GPU memory allocation on demand (allow_growth)
# and create the session with that configuration.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
from keras import backend as K
# Make Keras use the session configured above.
K.set_session(session)
from exp_smooth import exp_smooth

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Data locations ---------------------------------------------------------
DATASET_ROOT = './0608'
TRAIN_PATH = os.path.join(DATASET_ROOT, 'taetfp.csv')   # ETF price history
AUX_PATH = os.path.join(DATASET_ROOT, 'tasharep.csv')   # auxiliary stock price history

# --- Window / smoothing settings -------------------------------------------
TS_IN = 20        # number of trailing time steps fed to each model
TS_OUT = 5        # number of time steps each model predicts
EXP_ALPHA = 0.3   # alpha passed to exp_smooth

# Price / volume columns used as raw features (open, high, low, close, volume).
fields = ['開盤價(元)', '最高價(元)', '最低價(元)', '收盤價(元)', '成交張數(張)']
# Column name -> position inside `fields`.
fields_dict = {column: position for position, column in enumerate(fields)}

In [3]:
def _load_price_csv(path):
    """Read one price CSV, order it by date then code, and clean the `fields` columns.

    Any value in the `fields` columns that was parsed as a string has its
    commas removed and is converted to float; values of other types are kept
    unchanged.
    """
    # Row order is (date, code); the original author flags this ordering as important.
    frame = pd.read_csv(path).sort_values(by=['日期', '代碼'], ascending=True)
    frame[fields] = frame[fields].applymap(
        lambda value: float(value.replace(',', '')) if type(value) is str else value
    )
    return frame


train_csv = _load_price_csv(TRAIN_PATH)
display(train_csv.head(n=10))

aux_csv = _load_price_csv(AUX_PATH)
display(aux_csv.head(n=10))

Unnamed: 0,代碼,日期,中文簡稱,開盤價(元),最高價(元),最低價(元),收盤價(元),成交張數(張)
0,50,20130102,元大台灣50,46.57,47.13,46.49,46.92,16487.0
1332,51,20130102,元大中型100,22.36,22.66,22.36,22.65,277.0
2664,52,20130102,富邦科技,26.57,26.95,26.57,26.92,26.0
3996,53,20130102,元大電子,19.84,19.84,19.52,19.81,42.0
5328,54,20130102,元大台商50,16.84,16.84,16.54,16.75,20.0
6660,55,20130102,元大MSCI金融,9.85,9.94,9.83,9.91,2210.0
7992,56,20130102,元大高股息,18.45,18.63,18.45,18.55,1623.0
9324,57,20130102,富邦摩台,30.35,30.69,30.35,30.69,69.0
10656,58,20130102,富邦發達,32.45,32.45,32.12,32.12,2.0
11988,59,20130102,富邦金融,24.65,24.65,24.65,24.65,2.0


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,代碼,日期,中文簡稱,開盤價(元),最高價(元),最低價(元),收盤價(元),成交張數(張)
0,1101,20130102,台泥,30.41,30.53,30.18,30.45,6374.0
1332,1102,20130102,亞泥,28.74,28.9,28.7,28.78,2856.0
2664,1103,20130102,嘉泥,12.29,12.33,12.24,12.24,315.0
3996,1104,20130102,環泥,11.87,12.17,11.87,12.06,551.0
5328,1108,20130102,幸福,5.18,5.22,5.16,5.18,447.0
6660,1109,20130102,信大,9.24,9.28,9.19,9.24,104.0
7992,1110,20130102,東泥,13.41,13.46,13.23,13.23,169.0
9324,1201,20130102,味全,35.69,37.74,35.44,37.69,8971.0
10656,1203,20130102,味王,16.61,16.78,16.61,16.73,143.0
11988,1210,20130102,大成,16.84,16.99,16.84,16.87,1373.0


In [4]:
# Distinct ETF codes found in the training CSV, ascending.
codes = sorted(set(train_csv['代碼']))
# Subset of auxiliary stock codes loaded from disk (noted as sorted by the original author).
aux_codes = np.load('codes_subset.npy')
# Only dates that occur in taetfp.csv are used downstream.
days = sorted(set(train_csv['日期']))

In [5]:
# Keep only auxiliary rows whose code is in the chosen subset and whose date
# also appears in the ETF data.
code_in_subset = aux_csv['代碼'].isin(aux_codes)
date_in_etf_days = aux_csv['日期'].isin(days)
aux_csv = aux_csv.loc[code_in_subset & date_in_etf_days]

In [6]:
# One float32 matrix per trading day, holding the `fields` columns of every
# auxiliary row recorded on that day; the matrices are then stacked along a
# leading "day" axis.
split_days = np.asarray(
    [
        np.asarray(aux_csv.loc[aux_csv['日期'] == day, fields], dtype=np.float32)
        for day in days
    ],
    dtype=np.float32,
)

In [7]:
# Dates are assumed (not verified) to be aligned across stocks at this point.
# Axes are (days, stocks per day, features per stock); the recorded run printed (1332, 1387, 5).
print(split_days.shape)

(1332, 1387, 5)


In [8]:
# Flatten each day's (stocks, features) matrix into a single row vector;
# the recorded run went from (1332, 1387, 5) to (1332, 6935).
n_days = split_days.shape[0]
split_days = split_days.reshape(n_days, -1)
print(split_days.shape)

(1332, 6935)


In [9]:
import pickle
from sklearn.manifold import Isomap
# Restore the previously fitted Isomap reducer from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load a pickle
# file that you produced yourself or otherwise trust.
with open('isomap.pickle', 'rb') as isomap_file:
    isomap = pickle.load(isomap_file)

In [10]:
# Dimension reduction: project every flattened day vector with the loaded Isomap.
# The recorded run produced shape (1332, 122).
split_days_reduced = isomap.transform(split_days)
print(split_days_reduced.shape)

(1332, 122)


In [11]:
import warnings

# Build, for every ETF, the last TS_IN rows of its feature matrix:
#   5 raw price/volume fields + reduced auxiliary features  -> 127 columns
#   + exponentially smoothed copy of those columns          -> 254 columns
#   + up/down label of the closing price                    -> 255 columns
# (column counts as seen in the recorded run).
split_train = []
for code in codes:
    etf_rows = train_csv.loc[train_csv['代碼'] == code]
    row = np.asarray(etf_rows[fields], dtype=np.float32)
    n_rows = row.shape[0]

    # The auxiliary features are attached purely by position, counting back
    # from the most recent day. That is only correct when this ETF's dates are
    # exactly the trailing `n_rows` entries of `days`, so make a violation
    # visible instead of silently pairing prices with the wrong day.
    if list(etf_rows['日期']) != days[-n_rows:]:
        warnings.warn(
            'ETF {}: its dates are not the last {} entries of `days`; '
            'auxiliary features may be misaligned'.format(code, n_rows)
        )
    aux_pca_data = split_days_reduced[-n_rows:, :]

    # Label is 1.0 when the close (second-to-last field) rose versus the
    # previous row, else 0.0; the first row has no predecessor and gets 0.
    close = row[:, -2]
    rose = (close[1:] > close[:-1]).astype(np.float32)
    label = np.concatenate([np.zeros(1, dtype=np.float32), rose])[..., np.newaxis]

    row = np.concatenate([row, aux_pca_data], axis=-1)
    # Smoothing is computed on the full history before the window is cut.
    smoothed = exp_smooth(row, alpha=EXP_ALPHA, dtype=np.float32)
    row = np.concatenate([row, smoothed, label], axis=-1)
    split_train.append(row[-TS_IN:])

In [12]:
# Sanity check: list each ETF code with the shape of its input window.
for etf_code, window in zip(codes, split_train):
    print(etf_code, window.shape)

50 (20, 255)
51 (20, 255)
52 (20, 255)
53 (20, 255)
54 (20, 255)
55 (20, 255)
56 (20, 255)
57 (20, 255)
58 (20, 255)
59 (20, 255)
690 (20, 255)
692 (20, 255)
701 (20, 255)
713 (20, 255)
6201 (20, 255)
6203 (20, 255)
6204 (20, 255)
6208 (20, 255)


In [13]:
from keras.regularizers import *

def make_model(ts_in=60, ts_out=5, n_field=255, n_encode=256, n_decode=128, reg_a=0.1):
    """Build the encoder/decoder LSTM network with a regression and a classification head.

    Args:
        ts_in: number of input time steps.
        ts_out: number of output time steps (the encoder state is repeated this many times).
        n_field: number of features per input time step.
        n_encode: units of each bidirectional encoder LSTM (directions are summed).
        n_decode: units of each decoder LSTM.
        reg_a: L2 factor used for the recurrent weights of every LSTM and for
            the kernels of both output layers.

    Returns:
        A Keras ``Model`` taking one input of shape ``(ts_in, n_field)`` and
        producing two outputs per output step: ``'regression'`` (linear) and
        ``'class'`` (sigmoid), in that order.
    """
    inputs = Input(shape=(ts_in, n_field))
    hidden = GaussianNoise(1e-6)(inputs)

    # Encoder: two stacked bidirectional LSTMs; only the first keeps the sequence.
    for keep_sequence in (True, False):
        encoder_lstm = CuDNNLSTM(
            n_encode,
            return_sequences=keep_sequence,
            recurrent_regularizer=l2(reg_a),
        )
        hidden = Bidirectional(encoder_lstm, merge_mode='sum')(hidden)

    # Repeat the encoded vector once per output step, then decode it.
    hidden = RepeatVector(ts_out)(hidden)
    for _ in range(2):
        hidden = CuDNNLSTM(
            n_decode,
            return_sequences=True,
            recurrent_regularizer=l2(reg_a),
        )(hidden)

    # Two heads share the decoder output.
    regression_head = TimeDistributed(
        Dense(1, kernel_regularizer=l2(reg_a)),
        name='regression',
    )(hidden)
    class_head = TimeDistributed(
        Dense(1, kernel_regularizer=l2(reg_a), activation='sigmoid'),
        name='class',
    )(hidden)
    return Model([inputs], [regression_head, class_head])
# Build one model with the notebook's window sizes only to print its layer summary.
make_model(ts_in=TS_IN, ts_out=TS_OUT).summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20, 255)      0                                            
__________________________________________________________________________________________________
gaussian_noise_1 (GaussianNoise (None, 20, 255)      0           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 20, 256)      1050624     gaussian_noise_1[0][0]           
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 256)          1052672     bidirectional_1[0][0]            
__________________________________________________________________________________________________
repeat_vec

In [14]:
from tqdm import tqdm

# Header row of the submission file: for each weekday an up/down flag (ud)
# followed by a closing-price value (cprice).
SUBMISSION_HEADER = 'ETFid,Mon_ud,Mon_cprice,Tue_ud,Tue_cprice,Wed_ud,Wed_cprice,Thu_ud,Thu_cprice,Fri_ud,Fri_cprice\n'

with open('./submission.csv', 'w') as fp:
    fp.write(SUBMISSION_HEADER)
    for model_n, X in tqdm(enumerate(split_train), total=len(split_train)):
        prefix = str(codes[model_n])
        # The id written to the file is the numeric code prefixed with '00'.
        name_ = '00'+prefix
        # One set of weights per ETF, stored as '<code>_model.h5'.
        model = make_model(ts_in=TS_IN, ts_out=TS_OUT)
        model.load_weights(prefix+'_model.h5')
        pred_reg, pred_class = model.predict(X[np.newaxis,...], batch_size=1)
        # Drop the graph of this model before the next one is built.
        K.clear_session()
        # Flatten to 1-D with reshape(-1) rather than np.squeeze: squeeze would
        # return a 0-d array when TS_OUT == 1, which cannot be iterated below.
        pred_reg = pred_reg.reshape(-1)
        pred_class = pred_class.reshape(-1)
        fp.write(name_)
        for p_c, p_r in zip(pred_class, pred_reg):
            # Up/down flag is 1 when the sigmoid output exceeds 0.5, else -1.
            fp.write(',{:d},{:.6f}'.format(1 if p_c>.5 else -1, p_r))
        fp.write('\n')
        

100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:20<00:00,  1.12s/it]


![](https://tbrain.trendmicro.com.tw/Content/img/18etf.png)

#### Todo:
1. Check for missing data / values
2. Do more EDA on dataset
3. Try more baseline models (LSTM, GRU, CNN-1D, XGBoost, SVM, ...) 
4. Add Candlestick chart
5. Add n-fold cross validation
6. 是否天數有對齊?
