# 神經網路實做

## 拿取加權指數資料

In [22]:
from finlab.data import Data

data = Data()
twii = data.get("發行量加權股價指數")

twii = twii['台股指數'].resample("15T").first().dropna()
twii.head()

date
2006-01-02 09:00:00    6548.34
2006-01-02 09:15:00    6478.09
2006-01-02 09:30:00    6474.88
2006-01-02 09:45:00    6471.12
2006-01-02 10:00:00    6480.50
Name: 台股指數, dtype: float64

## 製作features

In [23]:
import talib
import numpy as np
import pandas as pd

sma = talib.SMA(twii, timeperiod=120)
wma = talib.WMA(twii, timeperiod=120)
mom = talib.MOM(twii, timeperiod=120)
k, d = talib.STOCH  (twii, twii, twii, fastk_period=120, slowk_period=60, slowd_period=60)
k2, d2 = talib.STOCH(twii, twii, twii, fastk_period=240, slowk_period=120, slowd_period=120)
k3, d3 = talib.STOCH(twii, twii, twii, fastk_period=360, slowk_period=180, slowd_period=180)
k4, d4 = talib.STOCH(twii, twii, twii, fastk_period=480, slowk_period=240, slowd_period=240)
k5, d5 = talib.STOCH(twii, twii, twii, fastk_period=640, slowk_period=320, slowd_period=320)
k6, d6 = talib.STOCH(twii, twii, twii, fastk_period=720, slowk_period=360, slowd_period=360)
k7, d7 = talib.STOCH(twii, twii, twii, fastk_period=840, slowk_period=420, slowd_period=420)
k8, d8 = talib.STOCH(twii, twii, twii, fastk_period=960, slowk_period=480, slowd_period=480)

rsi = talib.RSI (twii, timeperiod=120)
rsi2 = talib.RSI(twii, timeperiod=240)
rsi3 = talib.RSI(twii, timeperiod=480)
rsi4 = talib.RSI(twii, timeperiod=640)
rsi5 = talib.RSI(twii, timeperiod=720)
rsi6 = talib.RSI(twii, timeperiod=840)

macd1, macd2, macd3 = talib.MACD(twii, fastperiod=120, slowperiod=60, signalperiod=60)
willr = talib.WILLR(twii, twii, twii, timeperiod=120)
cci = talib.CCI(twii, twii, twii, timeperiod=120)

dataset = pd.DataFrame({
    'RSIb': rsi / 50,
    'RSIb2': rsi2 / 50,
    'RSIb3': rsi3 / 50,
    'RSIb4': rsi4 / 50,
    'RSIb5': rsi5 / 50,
    'RSIb6': rsi6 / 50,
    'MOMb': mom - 0,
    'KDb': k - d,
    'KDb2': k2 - d2,
    'KDb3': k3 - d3,
    'KDb4': k4 - d4,
    'KDb5': k5 - d5,
    'KDb6': k6 - d6,
    'KDb7': k7 - d7,
    'KDb8': k8 - d8,
    
    'a5':   (twii.rolling(5).mean()   / twii),
    'a10':  (twii.rolling(10).mean()  / twii),
    'a20':  (twii.rolling(20).mean()  / twii),
    'a40':  (twii.rolling(40).mean()  / twii),
    'a80':  (twii.rolling(80).mean()  / twii),
    'a160': (twii.rolling(160).mean() / twii),
    'a320': (twii.rolling(320).mean() / twii),
    'a640': (twii.rolling(640).mean() / twii),
    'a720': (twii.rolling(720).mean() / twii),
    'a840': (twii.rolling(840).mean() / twii),
    'a960': (twii.rolling(960).mean() / twii),
    'a1024':(twii.rolling(1024).mean() / twii),
    'b1': twii/twii.shift(50),
    'b2': twii/twii.shift(100),
    'b3': twii/twii.shift(150),
    'b4': twii/twii.shift(200),
    'b5': twii/twii.shift(250),
    'b6': twii/twii.shift(300),
    'b7': twii/twii.shift(350),
    'LINEARREG_SLOPE0': talib.LINEARREG_SLOPE(twii, 60),
    'LINEARREG_SLOPE1': talib.LINEARREG_SLOPE(twii, 120),

    'ADXR0': talib.ADXR(twii, twii, twii, 60),
    'ADXR1': talib.ADXR(twii, twii, twii, 120),
    'ADXR2': talib.ADXR(twii, twii, twii, 240),
    'ADXR3': talib.ADXR(twii, twii, twii, 360),
    'ADXR4': talib.ADXR(twii, twii, twii, 480),
    'ADXR5': talib.ADXR(twii, twii, twii, 640),

    'return': twii.shift(-10) / twii,
})

feature_names = list(dataset.columns[:-1])

## 簡單處理一下

In [24]:
print("before dropping NaN", dataset.shape)
dataset = dataset.dropna()
print("after dropping NaN", dataset.shape)


before dropping NaN (79078, 43)
after dropping NaN (77150, 43)


In [25]:
import lightgbm as lgb
dataset_train = dataset[:'2020']

gbm = lgb.LGBMClassifier(n_estimators=100, random_state=5, learning_rate=0.01)

gbm.fit(dataset_train[feature_names], dataset_train['return'] > 1)


LGBMClassifier(learning_rate=0.01, random_state=5)

## 神經網路Preprocessing

In [26]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

dataset_scaled = ss.fit_transform(dataset)
dataset_scaled = pd.DataFrame(dataset_scaled, columns=dataset.columns, index=dataset.index)
dataset_scaled['return'] = dataset['return']
dataset_scaled.describe()

Unnamed: 0,RSIb,RSIb2,RSIb3,RSIb4,RSIb5,RSIb6,MOMb,KDb,KDb2,KDb3,...,b7,LINEARREG_SLOPE0,LINEARREG_SLOPE1,ADXR0,ADXR1,ADXR2,ADXR3,ADXR4,ADXR5,return
count,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,...,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0,77150.0
mean,4.062826e-16,1.479787e-15,-3.589392e-15,-1.782172e-15,-3.785165e-15,-2.73042e-15,2.5189040000000003e-17,2.1703670000000002e-17,1.1821750000000001e-17,-3.73576e-18,...,-3.329057e-15,-1.733646e-16,-5.333675000000001e-17,-2.673227e-16,-3.543561e-16,-3.209962e-16,-7.830130000000001e-17,3.9787e-16,-3.86349e-16,1.000135
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,...,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,0.008257
min,-4.368989,-4.506759,-4.762963,-4.7387,-4.70221,-4.630653,-8.88358,-2.586458,-2.685999,-2.663557,...,-5.480993,-9.322652,-7.097892,-2.105435,-2.011292,-1.696597,-1.612772,-1.755444,-1.801731,0.924507
25%,-0.6821675,-0.6833395,-0.6700367,-0.6656202,-0.6610806,-0.6519681,-0.4595441,-0.6391872,-0.64428,-0.6282222,...,-0.5310826,-0.4636567,-0.4686256,-0.7436176,-0.6998969,-0.7519128,-0.7023951,-0.6870651,-0.7213463,0.996989
50%,0.09697027,0.1165786,0.115696,0.1205676,0.1261177,0.134338,0.09098792,-0.01326469,-0.03171791,-0.0077173,...,0.1203265,0.07747991,0.08381842,-0.1503141,-0.2273822,-0.2738363,-0.253871,-0.231204,-0.2118342,1.000285
75%,0.7223377,0.7145231,0.6887814,0.6946299,0.6937432,0.6916076,0.5701869,0.656076,0.6728323,0.6015924,...,0.5829542,0.5610455,0.5725418,0.5911812,0.4673996,0.5046496,0.4980228,0.5069936,0.590669,1.003707
max,3.246771,2.815948,3.017904,3.08925,3.105732,3.115908,5.31491,2.726409,2.592381,2.559921,...,4.869922,4.700631,4.095958,4.476193,4.063373,3.820162,3.563766,3.680746,3.523598,1.087726


In [None]:
import tqdm

n = 3

X = []
y = []
indexes = []
dataset_scaled_x = dataset_scaled[feature_names]

for i in tqdm.tqdm_notebook(range(0, len(dataset_scaled)-n)):
    X.append(dataset_scaled_x.iloc[i:i+n].values)
    y.append(dataset_scaled['return'].iloc[i+n-1])
    indexes.append(dataset_scaled.index[i+n-1])
#dataset_scaled.head()

In [None]:
indexes[0]

In [7]:
import numpy as np
X = np.array(X)
y = np.array(y)

In [8]:
indexes = np.array(indexes)
indexes

array([Timestamp('2006-06-06 09:15:00'), Timestamp('2006-06-06 09:30:00'),
       Timestamp('2006-06-06 09:45:00'), ...,
       Timestamp('2022-11-15 10:15:00'), Timestamp('2022-11-15 10:30:00'),
       Timestamp('2022-11-15 10:45:00')], dtype=object)

In [9]:
dataset_scaled.index

DatetimeIndex(['2006-06-05 13:30:00', '2006-06-06 09:00:00',
               '2006-06-06 09:15:00', '2006-06-06 09:30:00',
               '2006-06-06 09:45:00', '2006-06-06 10:00:00',
               '2006-06-06 10:15:00', '2006-06-06 10:30:00',
               '2006-06-06 10:45:00', '2006-06-06 11:00:00',
               ...
               '2022-11-14 13:30:00', '2022-11-15 09:00:00',
               '2022-11-15 09:15:00', '2022-11-15 09:30:00',
               '2022-11-15 09:45:00', '2022-11-15 10:00:00',
               '2022-11-15 10:15:00', '2022-11-15 10:30:00',
               '2022-11-15 10:45:00', '2022-11-15 11:00:00'],
              dtype='datetime64[ns]', name='date', length=77150, freq=None)

## 神經網路 Model

In [10]:
from tensorflow import keras
from tensorflow.keras import layers

model = keras.models.Sequential()
#model.add(keras.layers.Dense(100, activation="relu", input_shape=(len(feature_names),)))
model.add(layers.LSTM(100, return_sequences=True, input_shape=X[0].shape))
model.add(layers.LSTM(100))
model.add(layers.Dense(8))
model.add(layers.Dense(1,kernel_initializer="uniform",activation='linear'))

adam = keras.optimizers.Adam(0.0006)

model.compile(optimizer=adam, loss="binary_crossentropy", metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 3, 100)            57200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 8)                 808       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 138,417
Trainable params: 138,417
Non-trainable params: 0
_________________________________________________________________


## 神經網路訓練

In [11]:
indexes

array([Timestamp('2006-06-06 09:15:00'), Timestamp('2006-06-06 09:30:00'),
       Timestamp('2006-06-06 09:45:00'), ...,
       Timestamp('2022-11-15 10:15:00'), Timestamp('2022-11-15 10:30:00'),
       Timestamp('2022-11-15 10:45:00')], dtype=object)

In [12]:
# dataset_scaled_train = dataset_scaled[:'2020']

import datetime
X_train = X[indexes < datetime.datetime(2021, 1, 1)]
y_train = y[indexes < datetime.datetime(2021, 1, 1)]

checkpoint_filepath = './checkpoint_u22'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

history = model.fit(
    X_train,
    y_train > 1,
    batch_size=5000,
    epochs=300,
    validation_split=0.2,
    callbacks=[model_checkpoint_callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
model.load_weights(checkpoint_filepath)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

## 回測

In [None]:
ey = model.predict(X)
ey = pd.Series(ey.swapaxes(0,1)[0], index=indexes)
ey.plot()


In [None]:
eq = twii[indexes]
returns = (eq.shift(-1) - eq)

signal = (ey > ey.quantile(0.6)).rolling(10).sum() > 0
signal = signal.shift(1).fillna(False)

eq = (returns[signal]['2021':]).cumsum()
eq.plot()

In [None]:
(signal.astype(int).diff().abs().fillna(0) * 3)['2021':].sum()