## Import package

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from sklearn import preprocessing

%matplotlib inline

## Read Data

In [2]:
train = pd.read_csv('./data/training_set.csv')
submit = pd.read_csv('./data/sampleSubmission.csv')
print(train.shape)
print(submit.shape)

(74278, 7)
(30, 2)


In [3]:
print(train.head())

           Time  Weekday     Open     High      Low    Close      Volume
0  170 05:00:00        0  1.12053  1.12079  1.12050  1.12067  302.690002
1  170 05:10:00        0  1.12066  1.12074  1.12051  1.12070  486.690001
2  170 05:20:00        0  1.12070  1.12071  1.12065  1.12070  212.120000
3  170 05:30:00        0  1.12070  1.12072  1.12050  1.12061  811.989999
4  170 05:40:00        0  1.12060  1.12079  1.12027  1.12029  502.870001


In [4]:
print(train.columns)

Index(['Time', 'Weekday', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')


## 資料預處理

In [5]:
print(train.isna().sum())

Time       0
Weekday    0
Open       0
High       0
Low        0
Close      0
Volume     0
dtype: int64


In [6]:
train['Close'] = round(train['Close'], 5)
train['Close'] = np.log1p(train['Close'])
close = train['Close']

target_list = np.expand_dims(close.values,axis=1) # 展開維度成兩維 (總數,feature數)


In [7]:
print(target_list.shape)

(74278, 1)


In [8]:
time = train['Time'].values

###### plt.figure(figsize = (15,7))
plt.plot(
    np.arange(len(close)),
    close,
    label = 'Close',
)
plt.xticks(np.arange(len(close))[::5000], train['Time'][::5000], rotation = 'vertical') # 如果只是看趨勢，也可以不畫X軸資訊，但如果要加建議設個間隔，避免畫面擠滿文數字
plt.grid()
plt.show()

In [9]:
# drop掉不用的Feature，資料已有時間序列排序，不需要再對時間日期做調整
train = train.drop(['Time','Weekday'], axis=1).values

In [10]:
print(train.shape)

(74278, 5)


In [11]:
def window_train (data, target, window_size,predict_length=1):
    X_train,y_train = [],[]
    for i in range(len(data) - (window_size+predict_length) + 1):
        X_train.append(data[i:i+window_size])   
        y_train.append(target[i+window_size:i+window_size+predict_length])
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    return X_train,y_train

In [12]:
window_size = 15
X,y = window_train(train, target_list, window_size)

In [13]:
X.shape # (總數,window_size,feature數)

(74263, 15, 5)

In [14]:
y.shape

(74263, 1, 1)

In [15]:
# 0:Open,1:High,2:Low,3:Close,4:Volume
X = X[:,:,3]          # 只取Close (第三個column)
print(X.shape)
X = X[:,:,np.newaxis] # 展維度
print(X.shape)

(74263, 15)
(74263, 15, 1)


In [16]:
# 前七萬筆當作訓練資料
# 不切training set, train完後產生預測資料
X_train = X[0:]
y_train = y[0:]

# 七萬筆後當驗證資料
#X_valid = X[70000:]
#y_valid = y[70000:]

print("X_train size: {}".format(X_train.shape))
print("y_train size: {}".format(y_train.shape))
#print("X_test size: {}".format(X_valid.shape))
#print("y_test size: {}".format(y_valid.shape))

X_train size: (74263, 15, 1)
y_train size: (74263, 1, 1)


## 建立、訓練模型

In [17]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, LSTM, SimpleRNN, GRU
from tensorflow.python.keras.layers import TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam, SGD

In [125]:
## OK Case
model = Sequential()
model.add(LSTM(units=32, activation='tanh', return_sequences=True))
#model.add(LSTM(units=16, activation='tanh', return_sequences=True))
model.add(LSTM(units=16, activation='tanh', return_sequences=False))
model.add(Flatten())
model.add(Dense(units=16))
model.add(Dense(units=8))
#model.add(Dense(units=8))
model.add(Dense(units=1))

opt = Adam(lr=0.001, decay=1e-6, clipvalue=5)
model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [126]:
## Case 2 RELU 0.00634 Not good....
# model = Sequential()
# model.add(LSTM(units=32, activation='relu', return_sequences=True))
# model.add(LSTM(units=16, activation='relu', return_sequences=True))
# model.add(LSTM(units=16, activation='relu', return_sequences=False))
# model.add(Flatten())
# model.add(Dense(units=16))
# model.add(Dense(units=8))
# #model.add(Dense(units=8))
# model.add(Dense(units=1))

# opt = Adam(lr=0.001, decay=1e-6, clipvalue=5)
# model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [127]:
# ## Case3 sigmoid Not good.....
# model = Sequential()
# model.add(LSTM(units=32, activation='sigmoid', return_sequences=True))
# #model.add(LSTM(units=16, activation='tanh', return_sequences=True))
# model.add(LSTM(units=16, activation='sigmoid', return_sequences=False))
# model.add(Flatten())
# model.add(Dense(units=16))
# model.add(Dense(units=8))
# #model.add(Dense(units=8))
# model.add(Dense(units=1))

# opt = Adam(lr=0.001, decay=1e-6, clipvalue=5)
# model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [128]:
# ## tanh with SGD Not good....
# model = Sequential()
# model.add(LSTM(units=32, activation='tanh', return_sequences=True))
# #model.add(LSTM(units=16, activation='tanh', return_sequences=True))
# model.add(LSTM(units=16, activation='tanh', return_sequences=False))
# model.add(Flatten())
# model.add(Dense(units=16))
# model.add(Dense(units=8))
# #model.add(Dense(units=8))
# model.add(Dense(units=1))

# opt = SGD(lr=0.001, decay=1e-6, clipvalue=5)
# model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [129]:
## OK Case 0.00074 Very Good !
model = Sequential()
model.add(GRU(units=32, activation='tanh', return_sequences=True))
#model.add(LSTM(units=16, activation='tanh', return_sequences=True))
model.add(GRU(units=16, activation='tanh', return_sequences=False))
model.add(Flatten())
model.add(Dense(units=16))
model.add(Dense(units=8))
#model.add(Dense(units=8))
model.add(Dense(units=1))

opt = Adam(lr=0.001, decay=1e-6, clipvalue=5)
model.compile(loss='mse', optimizer=opt, metrics=['mse'])

In [130]:
#model.fit(X_train, y_train, epochs=15, validation_data=(X_valid,y_valid))
model.fit(X_train, y_train, epochs=2, )
model.save("model.h5")

Train on 74263 samples
Epoch 1/2
Epoch 2/2


In [131]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    multiple                  3264      
_________________________________________________________________
gru_1 (GRU)                  multiple                  2352      
_________________________________________________________________
flatten_9 (Flatten)          multiple                  0         
_________________________________________________________________
dense_27 (Dense)             multiple                  272       
_________________________________________________________________
dense_28 (Dense)             multiple                  136       
_________________________________________________________________
dense_29 (Dense)             multiple                  9         
Total params: 6,033
Trainable params: 6,033
Non-trainable params: 0
____________________________________________________

## 評估模型準確率

In [132]:
valid_pred = model.predict(X_valid)

NameError: name 'X_valid' is not defined

In [133]:
plt.figure(figsize=(15,7))
plt.plot(np.squeeze(valid_pred),label='pred')
plt.plot(np.squeeze(y_valid),label='Original')
plt.legend()
plt.show()

NameError: name 'valid_pred' is not defined

<Figure size 1080x504 with 0 Axes>

## 預測資料

In [134]:
# 取X中最後一組資料筆數
test_data = np.array([i[0] for i in X[-1]])
print(test_data.shape)
print(test_data)

(15,)
[0.75522956 0.75511678 0.75513088 0.75516377 0.75493348 0.75457148
 0.75464671 0.75475015 0.75475955 0.75473134 0.75454797 0.75480186
 0.75469843 0.75459969 0.75458089]


In [135]:
# 將資料最後一筆做串接用於預測下一筆資料
test_data = np.append(test_data,y[-1])
print(test_data.shape)
print(test_data)

(16,)
[0.75522956 0.75511678 0.75513088 0.75516377 0.75493348 0.75457148
 0.75464671 0.75475015 0.75475955 0.75473134 0.75454797 0.75480186
 0.75469843 0.75459969 0.75458089 0.75444452]


In [136]:
for i in range(len(submit)):
    print(i)
    test_X_data = test_data[-len(test_data)+1+i:]
    print(test_X_data)
    
    # 將其reshape成三維shape
    test_X_data = test_X_data[np.newaxis,:,np.newaxis]
    
    # 用最後window_size筆預測下一筆
    test_pred = model.predict(test_X_data)
    print('pred: ',test_pred)

    # append 新預測至最後一筆
    test_data = np.append(test_data, test_pred)
    print('new data: ',test_data)

0
[0.75511678 0.75513088 0.75516377 0.75493348 0.75457148 0.75464671
 0.75475015 0.75475955 0.75473134 0.75454797 0.75480186 0.75469843
 0.75459969 0.75458089 0.75444452]
pred:  [[0.7543746]]
new data:  [0.75522956 0.75511678 0.75513088 0.75516377 0.75493348 0.75457148
 0.75464671 0.75475015 0.75475955 0.75473134 0.75454797 0.75480186
 0.75469843 0.75459969 0.75458089 0.75444452 0.75437462]
1
[0.75513088 0.75516377 0.75493348 0.75457148 0.75464671 0.75475015
 0.75475955 0.75473134 0.75454797 0.75480186 0.75469843 0.75459969
 0.75458089 0.75444452 0.75437462]
pred:  [[0.7543213]]
new data:  [0.75522956 0.75511678 0.75513088 0.75516377 0.75493348 0.75457148
 0.75464671 0.75475015 0.75475955 0.75473134 0.75454797 0.75480186
 0.75469843 0.75459969 0.75458089 0.75444452 0.75437462 0.75432128]
2
[0.75516377 0.75493348 0.75457148 0.75464671 0.75475015 0.75475955
 0.75473134 0.75454797 0.75480186 0.75469843 0.75459969 0.75458089
 0.75444452 0.75437462 0.75432128]
pred:  [[0.7542625]]
new data:

In [137]:
submit['Close'] = test_data[window_size+1:]
submit['Close'] = np.expm1(submit['Close'] )

In [138]:
submit

Unnamed: 0,Time,Close
0,895 19:00:00,1.126281
1,895 19:10:00,1.126168
2,895 19:20:00,1.126043
3,895 19:30:00,1.125913
4,895 19:40:00,1.12578
5,895 19:50:00,1.125647
6,895 20:00:00,1.125513
7,895 20:10:00,1.12538
8,895 20:20:00,1.125246
9,895 20:30:00,1.125113


In [139]:
submit.to_csv('./data/submit.xxx.csv',index=0)