# 神經網路實做

## 拿取加權指數資料

In [1]:
from finlab.data import Data

data = Data()
twii = data.get("發行量加權股價指數")

twii = twii['台股指數'].resample("15T").first().dropna()
twii.head()



date
2006-01-02 09:00:00    6548.34
2006-01-02 09:15:00    6478.09
2006-01-02 09:30:00    6474.88
2006-01-02 09:45:00    6471.12
2006-01-02 10:00:00    6480.50
Name: 台股指數, dtype: float64

## 製作features

In [2]:
import talib
import numpy as np
import pandas as pd

sma = talib.SMA(twii, timeperiod=120)
wma = talib.WMA(twii, timeperiod=120)
mom = talib.MOM(twii, timeperiod=120)
k, d = talib.STOCH  (twii, twii, twii, fastk_period=120, slowk_period=60, slowd_period=60)
k2, d2 = talib.STOCH(twii, twii, twii, fastk_period=240, slowk_period=120, slowd_period=120)
k3, d3 = talib.STOCH(twii, twii, twii, fastk_period=360, slowk_period=180, slowd_period=180)
k4, d4 = talib.STOCH(twii, twii, twii, fastk_period=480, slowk_period=240, slowd_period=240)
k5, d5 = talib.STOCH(twii, twii, twii, fastk_period=640, slowk_period=320, slowd_period=320)
k6, d6 = talib.STOCH(twii, twii, twii, fastk_period=720, slowk_period=360, slowd_period=360)
k7, d7 = talib.STOCH(twii, twii, twii, fastk_period=840, slowk_period=420, slowd_period=420)
k8, d8 = talib.STOCH(twii, twii, twii, fastk_period=960, slowk_period=480, slowd_period=480)

rsi = talib.RSI (twii, timeperiod=120)
rsi2 = talib.RSI(twii, timeperiod=240)
rsi3 = talib.RSI(twii, timeperiod=480)
rsi4 = talib.RSI(twii, timeperiod=640)
rsi5 = talib.RSI(twii, timeperiod=720)
rsi6 = talib.RSI(twii, timeperiod=840)

macd1, macd2, macd3 = talib.MACD(twii, fastperiod=120, slowperiod=60, signalperiod=60)
willr = talib.WILLR(twii, twii, twii, timeperiod=120)
cci = talib.CCI(twii, twii, twii, timeperiod=120)

dataset = pd.DataFrame({
    'RSIb': rsi / 50,
    'RSIb2': rsi2 / 50,
    'RSIb3': rsi3 / 50,
    'RSIb4': rsi4 / 50,
    'RSIb5': rsi5 / 50,
    'RSIb6': rsi6 / 50,
    'MOMb': mom - 0,
    'KDb': k - d,
    'KDb2': k2 - d2,
    'KDb3': k3 - d3,
    'KDb4': k4 - d4,
    'KDb5': k5 - d5,
    'KDb6': k6 - d6,
    'KDb7': k7 - d7,
    'KDb8': k8 - d8,
    
    'a5':   (twii.rolling(5).mean()   / twii),
    'a10':  (twii.rolling(10).mean()  / twii),
    'a20':  (twii.rolling(20).mean()  / twii),
    'a40':  (twii.rolling(40).mean()  / twii),
    'a80':  (twii.rolling(80).mean()  / twii),
    'a160': (twii.rolling(160).mean() / twii),
    'a320': (twii.rolling(320).mean() / twii),
    'a640': (twii.rolling(640).mean() / twii),
    'a720': (twii.rolling(720).mean() / twii),
    'a840': (twii.rolling(840).mean() / twii),
    'a960': (twii.rolling(960).mean() / twii),
    'a1024':(twii.rolling(1024).mean() / twii),
    'b1': twii/twii.shift(50),
    'b2': twii/twii.shift(100),
    'b3': twii/twii.shift(150),
    'b4': twii/twii.shift(200),
    'b5': twii/twii.shift(250),
    'b6': twii/twii.shift(300),
    'b7': twii/twii.shift(350),
    'LINEARREG_SLOPE0': talib.LINEARREG_SLOPE(twii, 60),
    'LINEARREG_SLOPE1': talib.LINEARREG_SLOPE(twii, 120),

    'ADXR0': talib.ADXR(twii, twii, twii, 60),
    'ADXR1': talib.ADXR(twii, twii, twii, 120),
    'ADXR2': talib.ADXR(twii, twii, twii, 240),
    'ADXR3': talib.ADXR(twii, twii, twii, 360),
    'ADXR4': talib.ADXR(twii, twii, twii, 480),
    'ADXR5': talib.ADXR(twii, twii, twii, 640),

    'return': twii.shift(-10) / twii,
})

feature_names = list(dataset.columns[:-1])

## 簡單處理一下

In [3]:
print("before dropping NaN", dataset.shape)
dataset = dataset.dropna()
print("after dropping NaN", dataset.shape)


before dropping NaN (79002, 43)
after dropping NaN (77074, 43)


In [4]:
import lightgbm as lgb
dataset_train = dataset[:'2020']

gbm = lgb.LGBMClassifier(n_estimators=100, random_state=5, learning_rate=0.01)

gbm.fit(dataset_train[feature_names], dataset_train['return'] > 1)


LGBMClassifier(learning_rate=0.01, random_state=5)

## 神經網路Preprocessing

In [5]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

dataset_scaled = ss.fit_transform(dataset)
dataset_scaled = pd.DataFrame(dataset_scaled, columns=dataset.columns, index=dataset.index)
dataset_scaled['return'] = dataset['return']
dataset_scaled.describe()

Unnamed: 0,RSIb,RSIb2,RSIb3,RSIb4,RSIb5,RSIb6,MOMb,KDb,KDb2,KDb3,...,b7,LINEARREG_SLOPE0,LINEARREG_SLOPE1,ADXR0,ADXR1,ADXR2,ADXR3,ADXR4,ADXR5,return
count,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,...,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0,77074.0
mean,1.199013e-16,9.335501e-16,-1.323131e-16,-3.053847e-15,-6.476159e-15,-1.493484e-15,4.6345480000000004e-17,3.725039e-18,3.003655e-17,-5.315311e-18,...,-1.642748e-15,-3.0209410000000003e-17,1.932814e-17,1.102338e-16,-1.162235e-15,-1.507561e-16,-4.859721e-16,7.269761e-16,-6.370998e-16,1.000126
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,...,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,0.008246
min,-4.375234,-4.507799,-4.761163,-4.737261,-4.701149,-4.630243,-8.927808,-2.585161,-2.685509,-2.666064,...,-5.481681,-9.356083,-7.124939,-2.106558,-2.010208,-1.696746,-1.612674,-1.754828,-1.800766,0.924507
25%,-0.6827856,-0.6832568,-0.6702569,-0.6652009,-0.6607103,-0.65205,-0.4593474,-0.6393931,-0.6447801,-0.6281137,...,-0.5309859,-0.4633679,-0.4682248,-0.7436624,-0.6998095,-0.7513924,-0.7019419,-0.6865189,-0.7207257,0.996988
50%,0.09806048,0.1166437,0.1154802,0.1209703,0.1264815,0.1348818,0.09381266,-0.01340082,-0.03109266,-0.006402716,...,0.1207952,0.07941809,0.08620214,-0.1502584,-0.2283676,-0.2733137,-0.2531058,-0.2308046,-0.212391,1.000282
75%,0.7234538,0.7146402,0.6891096,0.6946357,0.6936443,0.691401,0.5750143,0.6560989,0.671493,0.6021685,...,0.5833069,0.5646887,0.5762794,0.5918864,0.4675324,0.5032583,0.4954618,0.5037636,0.5897731,1.003699
max,2.876612,2.818654,3.017061,3.087917,3.104363,3.114618,4.976245,2.725618,2.594254,2.566409,...,4.87277,4.704428,3.988879,4.483136,4.061787,3.826344,3.569066,3.683582,3.524693,1.087726


In [6]:
import tqdm

n = 3 #用前三列

X = []
y = []
indexes = []
dataset_scaled_x = dataset_scaled[feature_names]

for i in tqdm.tqdm_notebook(range(0, len(dataset_scaled)-n)):
    X.append(dataset_scaled_x.iloc[i:i+n].values)
    y.append(dataset_scaled['return'].iloc[i+n-1])
    indexes.append(dataset_scaled.index[i+n-1])
#dataset_scaled.head()

HBox(children=(IntProgress(value=0, max=77071), HTML(value='')))




In [7]:
X[0] 

array([[-1.37446468, -1.25455295, -0.89032254, -0.71868138, -0.65366228,
        -0.58266071, -0.58912117,  0.80838147,  0.48011022, -0.52317071,
        -2.14040379, -2.21283041, -1.86953479, -1.364285  , -0.98016952,
         2.6821913 ,  3.93097803,  3.68139704,  2.85865421,  1.90249953,
         1.32423226,  1.56149231,  1.39305069,  1.2043558 ,  0.94243202,
         0.67314272,  0.57619832, -1.52006797, -0.98476751, -0.79909761,
        -1.41298164, -1.41406975, -1.83160707, -2.0359443 , -0.18640032,
        -0.07321812, -1.25611819, -0.07950386,  0.45722359,  0.48300929,
        -0.43024914, -1.34663473],
       [-1.37446468, -1.25455295, -0.89032254, -0.71868138, -0.65366228,
        -0.58266071, -0.60886166,  0.73303508,  0.46727702, -0.51344402,
        -2.13484891, -2.21902582, -1.87962974, -1.37289659, -0.98912046,
         1.38224702,  3.29977962,  3.4039212 ,  2.79534786,  1.87964381,
         1.31600134,  1.55228147,  1.39161795,  1.20420191,  0.94305593,
         0.67400

In [8]:
import numpy as np
X = np.array(X)
y = np.array(y)

In [9]:
indexes = np.array(indexes)

## 神經網路 Model

In [10]:
from tensorflow import keras
from tensorflow.keras import layers

model = keras.models.Sequential()
#model.add(keras.layers.Dense(100, activation="relu", input_shape=(len(feature_names),)))
model.add(layers.LSTM(100, return_sequences=True, input_shape=X[0].shape))
model.add(layers.LSTM(100))
model.add(layers.Dense(8))
model.add(layers.Dense(1,kernel_initializer="uniform",activation='linear'))

adam = keras.optimizers.Adam(0.0006)

model.compile(optimizer=adam, loss="binary_crossentropy", metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 3, 100)            57200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 8)                 808       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 138,417
Trainable params: 138,417
Non-trainable params: 0
_________________________________________________________________


## 神經網路訓練

In [11]:
# dataset_scaled_train = dataset_scaled[:'2020']

import datetime
X_train = X[indexes < datetime.datetime(2021, 1, 1)]
y_train = y[indexes < datetime.datetime(2021, 1, 1)]

checkpoint_filepath = './tmp/checkpoint_u22'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

history = model.fit(
    X_train,
    y_train > 1,
    batch_size=5000,
    epochs=300,
    validation_split=0.2,
    callbacks=[model_checkpoint_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa6 in position 164: invalid start byte

In [None]:
model.load_weights(checkpoint_filepath)

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'],color='blue')
plt.plot(history.history ['val_loss'],color='red')
plt.xlabel('Times')                    #2
plt.ylabel('Value')            #3

## 回測

In [None]:
ey = model.predict(X)
ey = pd.Series(ey.swapaxes(0,1)[0], index=indexes)
ey.plot()


In [None]:
eq = twii[indexes]
returns = (eq.shift(-1) - eq)

signal = (ey > ey.quantile(0.7)).rolling(10).sum() > 0
signal = signal.shift(1).fillna(False)

eq = (returns[signal]['2021':]).cumsum()
eq.plot()

In [None]:
(signal.astype(int).diff().abs().fillna(0) * 3)['2021':].sum()