In [2]:
from datetime import datetime
from iexfinance.stocks import get_historical_data
import pandas as pd
from keras.models import Sequential
from keras.layers import Activation, Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import plotly
import timeit
import random
random.seed(7)

Using TensorFlow backend.


In [3]:
# Ticker symbols of the ETFs that the rest of the notebook downloads and models.
etf_list = ['SPY','IVV','VTI','VOO','QQQ','VEA','EFA','IEFA','VWO','AGG','IJH','IEMG','IWM','IJR','VTV','IWF','IWD','VUG','BND','LQD','XLF','VNQ','VIG','EEM','GLD','VB','BSV','VO','TIP','VEU','IVW','DIA','XLK','VYM','VGT','VCSH','MDY','IWB','VCIT','XLV','IWR','XLE','DVY','USMV','EWJ','VGK','PFF','SCHF','SDY','RSP','XLY','ITOT','IVE','SCHX','HYG','SHV','VBR','EMB','SHY','VV','SCHB','XLI','BIV','VT','MBB','BNDX','IWS','VXUS','FLOT','IWO','IXUS','MINT','SCZ','PYZ','MXI','IYM','IXP','RXI','VCR','RHS','VDC','PXI','PXE','IEO','RYF','IYG','KIE','FBT','PTH','IHI','ITA','VIS','ICF','REZ','RWR','PSJ','IGV','RYU','IDU','VPU']
# NOTE(review): disabled leftover below; presumably it reloaded a previously
# saved CSV instead of re-downloading. Confirm before re-enabling or delete it.
# data = pd.read_csv('stats_100_etfs.csv')
# data

In [4]:
def get_iex_data(stock_list, start=datetime(2015,1,1), end=datetime(2019,12,31), token=None):
    """Download historical prices from IEX for each symbol.

    Parameters
    ----------
    stock_list : iterable of str
        Ticker symbols to fetch.
    start, end : datetime
        Date range forwarded to ``get_historical_data``.
    token : str, optional
        IEX API token. When omitted, the ``IEX_TOKEN`` environment variable
        is used. The token is deliberately NOT stored in the notebook: the
        key that used to be embedded here has been published with the source
        and should be revoked.

    Returns
    -------
    list of pandas.DataFrame
        One interpolated frame per symbol, in input order, each with an extra
        ``ticker`` column holding the symbol.

    Raises
    ------
    ValueError
        If no token is passed and ``IEX_TOKEN`` is not set.
    """
    import os  # local import keeps this cell self-contained

    if token is None:
        token = os.environ.get('IEX_TOKEN')
    if not token:
        raise ValueError(
            "No IEX API token: pass token=... or set the IEX_TOKEN environment variable"
        )

    return_list = []
    for symbol in stock_list:
        history = get_historical_data(symbol, start, end, output_format='pandas', token=token)
        # interpolate() fills interior gaps in the numeric columns
        df = pd.DataFrame(history).interpolate()
        df['ticker'] = symbol
        return_list.append(df)
    return return_list

def lstm_clean_data(data):
    """Prepare each price frame for modelling, replacing the list entries in place.

    For every frame: the index is turned back into a column, rows containing
    any missing value are dropped, the ``date`` column is parsed to datetimes
    and made the index, and ``Reg_Target`` is added as the following row's
    ``close`` (so the final row's target is NaN).

    Returns the same list object that was passed in.
    """
    for position, frame in enumerate(data):
        indexed = (
            frame.reset_index()
            .dropna()
            .assign(date=lambda f: pd.to_datetime(f['date']))
            .set_index('date')
        )
        # regression target: the next row's closing price
        data[position] = indexed.assign(Reg_Target=lambda f: f['close'].shift(-1))
    return data

def add_past(etf_list, times):
    """Add one '<n>day return' column per lag in ``times`` to every frame.

    Each column holds the negated ``n``-row difference of ``close`` rounded
    to three decimals, i.e. ``close[t-n] - close[t]``. Despite the column
    name this is a price difference (positive when the price fell), not a
    percentage return. Frames are modified in place and the same list is
    returned.
    """
    for frame in etf_list:
        for n in times:
            change = frame['close'].diff(periods=n).round(3)
            column = '{}day return'.format(n)
            frame[column] = -change
    return etf_list

def lstm_time_test_split(X, n_past, date):
    """Split one ETF frame at ``date`` and scale it for the LSTM.

    Rows with ``date`` earlier than the split form the training set. Of the
    remaining rows, all but the last form the test set and the last row alone
    is the hold-out used to predict the next, unseen day. Both scalers are
    fitted on training rows only.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame indexed by ``date`` with ``ticker``, ``Reg_Target`` and the four
        '<n>day return' columns, plus the feature columns.
    n_past : int
        Size of the middle (time-step) axis of the 3-D feature arrays. The
        rows are reshaped, not windowed, so only ``n_past == 1`` keeps the
        element count consistent.
    date : str or datetime-like
        Split point compared against the ``date`` column.

    Returns
    -------
    tuple
        ``(ticker, x_train, x_test, x_holdout, y_train, y_test, scaler, y_scaler)``
    """
    non_feature_cols = ['date', 'Reg_Target', 'ticker', '1day return', '5day return', '21day return', '252day return']

    def as_sequences(matrix):
        # (rows, features) -> (rows, n_past, features)
        return np.reshape(matrix, (matrix.shape[0], n_past, matrix.shape[1]))

    def target_vector(rows):
        return np.array(rows['Reg_Target']).ravel().astype('float')

    X = X.reset_index()
    ticker = X['ticker'].iloc[0]
    before_split = X[X['date'] < date]
    from_split = X[X['date'] >= date]

    train_features = before_split.drop(columns=non_feature_cols)
    later_features = from_split.drop(columns=non_feature_cols)

    # feature scaler: fitted on the training rows, applied to every subset
    scaler = MinMaxScaler()
    scaler.fit(train_features)
    x_train = as_sequences(scaler.transform(train_features))
    x_test = as_sequences(scaler.transform(later_features[:-1]))
    x_holdout = as_sequences(scaler.transform(later_features[-1:]))

    # target scaler: kept separate so predictions can be inverse-transformed
    y_scaler = MinMaxScaler()
    y_train = target_vector(before_split).reshape(-1, 1)
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)
    # the last post-split row is the hold-out; its Reg_Target is excluded here
    y_test = y_scaler.transform(target_vector(from_split)[:-1].reshape(-1, 1))

    return ticker, x_train, x_test, x_holdout, y_train, y_test, scaler, y_scaler

def build_step_model(x_train, y_train, epoc):
    """Build and fit the two-layer LSTM regressor.

    Architecture: LSTM(50, returning sequences) -> Dropout(0.2) -> LSTM(100)
    -> Dropout(0.2) -> Dense(1) -> ReLU, compiled with MSE loss and Adam.
    Training uses batches of 64, no shuffling, and holds out the last 10% of
    the samples for validation.

    Parameters
    ----------
    x_train : array of shape (samples, time steps, features)
    y_train : array of training targets
    epoc : int
        Number of training epochs.

    Returns
    -------
    (model, history)
        The fitted model and the object returned by ``model.fit``.
    """
    time_steps, n_features = x_train.shape[1], x_train.shape[2]
    stack = [
        LSTM(50, input_shape=(time_steps, n_features), return_sequences=True),
        Dropout(0.2),
        LSTM(100, return_sequences=False),
        Dropout(0.2),
        Dense(1),
        Activation('relu'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(optimizer='adam', loss='mse')
    history = model.fit(
        x_train,
        y_train,
        epochs=epoc,
        batch_size=64,
        validation_split=.1,
        verbose=2,
        shuffle=False,
    )
    return model, history

def yield_preds(model, scaler, x_test, x_holdout, y_test):
    """Predict on the test and hold-out inputs and undo the target scaling.

    Parameters
    ----------
    model : object with ``predict``
        Fitted model.
    scaler : object with ``inverse_transform``
        Scaler that was fitted on the targets.
    x_test, x_holdout : array-like
        Scaled model inputs.
    y_test : array-like
        Unused. Kept so existing call sites keep working; the original code
        inverse-transformed it into a local that was never read.

    Returns
    -------
    (preds, today_pred)
        Test-set predictions and the hold-out prediction, both passed
        through ``scaler.inverse_transform``.
    """
    preds = scaler.inverse_transform(model.predict(x_test))
    today_pred = scaler.inverse_transform(model.predict(x_holdout))
    return preds, today_pred

def run_all_lstms(data, split, epoc):
    """Train one LSTM per frame and collect its predictions.

    Parameters
    ----------
    data : list of pandas.DataFrame
        One prepared frame per ETF.
    split : str or datetime-like
        Train/test split date passed to ``lstm_time_test_split``.
    epoc : int
        Training epochs per model.

    Returns
    -------
    (out, tomorrow)
        ``out`` has one column of test-period predictions per ticker and is
        indexed by the last ``len(out)`` index values of ``data[0]``.
        ``tomorrow`` has one column per ticker holding the hold-out prediction.
    """
    out = pd.DataFrame()
    tomorrow = pd.DataFrame()
    started_at = timeit.default_timer()
    for model_no, etf_frame in enumerate(data):
        (ticker, x_train, x_test, x_holdout,
         y_train, y_test, scaler, y_scaler) = lstm_time_test_split(etf_frame, 1, split)
        print('Model #: {}'.format(model_no))
        model, history = build_step_model(x_train, y_train, epoc)
        preds, future = yield_preds(model, y_scaler, x_test, x_holdout, y_test)
        out[ticker] = preds.flatten()
        tomorrow[ticker] = future.flatten()
    # every ticker's predictions are labelled with the first frame's trailing dates
    trailing_dates = data[0][-len(out):].index
    out = out.set_index(trailing_dates)
    finished_at = timeit.default_timer()
    print('Time: ', finished_at - started_at)
    return out, tomorrow
# Download the price history for every symbol in etf_list.
data = get_iex_data(etf_list)
# lstm_clean_data replaces the list entries in place and returns the same list,
# so `clean_full` and `data` refer to the same object from here on.
clean_full = lstm_clean_data(data)
# Add the 1-, 5-, 21- and 252-row '<n>day return' columns to each frame.
data = add_past(clean_full, [1, 5, 21, 252])

In [5]:
# data
comb = data[0]
x=1
while x != 100:
    comb = comb.append(data[x])
    x += 1
    print (x)
comb


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


Unnamed: 0_level_0,open,high,low,close,volume,ticker,Reg_Target,1day return,5day return,21day return,252day return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-08,204.01,206.16,203.99,205.90,147217784,SPY,204.25,,,,
2015-01-09,206.40,206.42,203.51,204.25,158567288,SPY,202.65,1.65,,,
2015-01-12,204.41,204.60,201.92,202.65,144396067,SPY,202.08,1.60,,,
2015-01-13,204.12,205.48,200.51,202.08,214553306,SPY,200.86,0.57,,,
2015-01-14,199.65,201.10,198.57,200.86,192991092,SPY,199.02,1.22,,,
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,141.59,141.78,141.04,141.62,52726,VPU,141.86,-0.07,-0.30,-2.20,-26.91
2019-12-26,141.85,142.05,141.37,141.86,68987,VPU,142.25,-0.24,0.22,-2.85,-25.53
2019-12-27,142.01,142.27,141.71,142.25,107721,VPU,142.25,-0.39,-0.29,-2.71,-24.96
2019-12-30,141.92,142.34,141.61,142.25,148696,VPU,142.89,-0.00,0.79,-2.43,-24.80


In [6]:
# Write the stacked frame to a CSV file in the working directory.
comb.to_csv('ForeMet_100_etfs.csv', encoding='utf-8')

In [7]:
# Wide table of closing prices: one column per ticker. Columns are assigned one
# at a time so later tickers are aligned to the index set by the first one.
df = pd.DataFrame()
for etf_frame in data:
    symbol = etf_frame['ticker'].iloc[0]
    df[symbol] = etf_frame['close']

In [8]:
# Write the wide close-price table to a CSV file in the working directory.
df.to_csv('eachetf.csv', encoding='utf-8')

In [9]:
# Naive baseline: each row's forecast is the previous row's close (first row becomes NaN).
naive = df.shift(1)

In [10]:
# Display the baseline frame.
naive

Unnamed: 0_level_0,SPY,IVV,VTI,VOO,QQQ,VEA,EFA,IEFA,VWO,AGG,...,ITA,VIS,ICF,REZ,RWR,PSJ,IGV,RYU,IDU,VPU
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-08,,,,,,,,,,,...,,,,,,,,,,
2015-01-09,205.90,207.40,106.15,188.82,103.30,37.39,59.93,54.65,40.30,110.76,...,115.27,105.42,102.00,62.28,95.41,39.12,92.06,79.13,119.65,103.47
2015-01-12,204.25,205.65,105.27,187.23,102.62,37.21,59.64,54.40,40.10,111.03,...,114.51,104.27,102.27,62.43,95.56,38.72,91.17,78.37,118.81,102.73
2015-01-13,202.65,204.09,104.52,185.84,101.55,37.17,59.50,54.31,39.74,111.12,...,113.63,103.48,103.06,63.10,96.32,38.54,90.73,78.51,118.44,102.46
2015-01-14,202.08,203.56,104.28,185.35,101.52,37.35,59.80,54.53,40.02,111.15,...,113.90,103.38,102.84,62.96,96.09,38.11,90.49,78.65,118.92,102.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,321.22,322.61,164.29,295.16,211.81,43.86,69.29,65.07,44.22,112.24,...,226.43,154.95,114.71,73.88,100.07,100.35,233.81,105.23,160.53,141.55
2019-12-26,321.23,322.65,163.44,295.16,211.92,43.82,69.22,65.01,44.18,112.38,...,225.10,154.43,114.99,74.01,100.33,100.37,233.67,105.31,160.72,141.62
2019-12-27,322.94,324.32,164.23,296.67,213.79,44.02,69.52,65.27,44.55,112.48,...,224.90,154.86,115.69,74.26,100.84,100.69,234.87,105.39,160.87,141.86
2019-12-30,322.86,324.26,164.08,296.67,213.61,44.14,69.64,65.43,44.61,112.63,...,224.31,154.59,116.09,74.57,101.08,100.43,235.04,105.67,161.31,142.25


In [None]:
# Train one model per ETF, splitting train/test at '11-2019' and fitting for 6 epochs.
# The plotting cell below reads `mapframe_preds`, so this cell must finish first.
mapframe_preds, future_preds = run_all_lstms(data, '11-2019', 6)

Model #: 0




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 5s - loss: 0.2267 - val_loss: 0.6164
Epoch 2/6
 - 0s - loss: 0.1135 - val_loss: 0.2769
Epoch 3/6
 - 0s - loss: 0.0178 - val_loss: 0.0233
Epoch 4/6
 - 0s - loss: 0.0207 - val_loss: 0.0063
Epoch 5/6
 - 0s - loss: 0.0181 - val_loss: 0.0168
Epoch 6/6
 - 0s - loss: 0.0110 - val_loss: 0.0046
Model #: 1
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 5s - loss: 0.1701 - val_loss: 0.4298
Epoch 2/6
 - 0s - loss: 0.0456 - val_loss: 0.1037
Epoch 3/6
 - 0s - loss: 0.0274 - val_loss: 0.0281
Epoch 4/6
 - 0s - loss: 0.0354 - val_loss: 0.0417
Epoch 5/6
 - 0s - loss: 0.0261 - val_loss: 0.0294
Epoch 6/6
 - 0s - loss: 0.0218 - val_loss: 0.0208
Model #: 2
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 

Model #: 17
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 12s - loss: 0.1550 - val_loss: 0.4656
Epoch 2/6
 - 0s - loss: 0.0461 - val_loss: 0.1199
Epoch 3/6
 - 0s - loss: 0.0101 - val_loss: 0.0078
Epoch 4/6
 - 0s - loss: 0.0178 - val_loss: 0.0141
Epoch 5/6
 - 0s - loss: 0.0121 - val_loss: 0.0118
Epoch 6/6
 - 1s - loss: 0.0084 - val_loss: 0.0047
Model #: 18
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 14s - loss: 0.2413 - val_loss: 0.3600
Epoch 2/6
 - 0s - loss: 0.1076 - val_loss: 0.0672
Epoch 3/6
 - 0s - loss: 0.0206 - val_loss: 0.0019
Epoch 4/6
 - 0s - loss: 0.0088 - val_loss: 0.0175
Epoch 5/6
 - 0s - loss: 0.0109 - val_loss: 0.0067
Epoch 6/6
 - 0s - loss: 0.0074 - val_loss: 0.0053
Model #: 19
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 16s - loss: 0.1262 - val_loss: 0.3254
Epoch 2/6
 - 0s - loss: 0.0428 - val_loss: 0.0558
Epoch 3/6
 - 0s - loss: 0.0109 - val_loss: 0.0129
Epoch 4/6
 - 0s - loss: 0.0075 - val_loss: 0.0304
Epoch 5/6
 - 0s - lo

Epoch 5/6
 - 0s - loss: 0.0156 - val_loss: 0.0110
Epoch 6/6
 - 0s - loss: 0.0112 - val_loss: 0.0041
Model #: 40
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 26s - loss: 0.2230 - val_loss: 0.4328
Epoch 2/6
 - 0s - loss: 0.0569 - val_loss: 0.0657
Epoch 3/6
 - 0s - loss: 0.0094 - val_loss: 0.0015
Epoch 4/6
 - 0s - loss: 0.0160 - val_loss: 0.0145
Epoch 5/6
 - 0s - loss: 0.0101 - val_loss: 0.0071
Epoch 6/6
 - 0s - loss: 0.0096 - val_loss: 0.0056
Model #: 41
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 28s - loss: 0.2863 - val_loss: 0.0278
Epoch 2/6
 - 0s - loss: 0.1070 - val_loss: 0.0066
Epoch 3/6
 - 0s - loss: 0.0163 - val_loss: 0.0286
Epoch 4/6
 - 0s - loss: 0.0097 - val_loss: 0.0127
Epoch 5/6
 - 0s - loss: 0.0093 - val_loss: 0.0154
Epoch 6/6
 - 0s - loss: 0.0079 - val_loss: 0.0121
Model #: 42
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 32s - loss: 0.2634 - val_loss: 0.4224
Epoch 2/6
 - 0s - loss: 0.0752 - val_loss: 0.0641
Epoch 3/6
 - 0s - lo

Epoch 3/6
 - 0s - loss: 0.0266 - val_loss: 0.0025
Epoch 4/6
 - 0s - loss: 0.0113 - val_loss: 0.0169
Epoch 5/6
 - 0s - loss: 0.0111 - val_loss: 0.0106
Epoch 6/6
 - 0s - loss: 0.0084 - val_loss: 0.0058
Model #: 63
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 34s - loss: 0.2401 - val_loss: 0.3747
Epoch 2/6
 - 0s - loss: 0.0750 - val_loss: 0.0644
Epoch 3/6
 - 0s - loss: 0.0082 - val_loss: 8.5631e-04
Epoch 4/6
 - 0s - loss: 0.0162 - val_loss: 0.0065
Epoch 5/6
 - 0s - loss: 0.0115 - val_loss: 0.0041
Epoch 6/6
 - 0s - loss: 0.0096 - val_loss: 0.0024
Model #: 64
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 40s - loss: 0.3875 - val_loss: 0.2650
Epoch 2/6
 - 0s - loss: 0.2004 - val_loss: 0.0476
Epoch 3/6
 - 0s - loss: 0.0444 - val_loss: 0.0096
Epoch 4/6
 - 0s - loss: 0.0125 - val_loss: 0.0020
Epoch 5/6
 - 0s - loss: 0.0153 - val_loss: 5.0637e-04
Epoch 6/6
 - 0s - loss: 0.0107 - val_loss: 6.0887e-04
Model #: 65
Train on 1091 samples, validate on 122 samples
Epoch 1/6

Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 69s - loss: 0.2401 - val_loss: 0.4362
Epoch 2/6
 - 0s - loss: 0.0824 - val_loss: 0.0958
Epoch 3/6
 - 0s - loss: 0.0087 - val_loss: 8.8434e-04
Epoch 4/6
 - 0s - loss: 0.0183 - val_loss: 0.0052
Epoch 5/6
 - 0s - loss: 0.0134 - val_loss: 0.0051
Epoch 6/6
 - 0s - loss: 0.0099 - val_loss: 0.0023
Model #: 86
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 77s - loss: 0.1893 - val_loss: 0.5003
Epoch 2/6
 - 1s - loss: 0.0621 - val_loss: 0.1210
Epoch 3/6
 - 1s - loss: 0.0083 - val_loss: 0.0033
Epoch 4/6
 - 1s - loss: 0.0171 - val_loss: 0.0118
Epoch 5/6
 - 1s - loss: 0.0115 - val_loss: 0.0081
Epoch 6/6
 - 1s - loss: 0.0090 - val_loss: 0.0036
Model #: 87
Train on 1091 samples, validate on 122 samples
Epoch 1/6
 - 68s - loss: 0.2147 - val_loss: 0.2315
Epoch 2/6
 - 0s - loss: 0.0734 - val_loss: 0.0387
Epoch 3/6
 - 0s - loss: 0.0120 - val_loss: 0.0014
Epoch 4/6
 - 0s - loss: 0.0176 - val_loss: 0.0022
Epoch 5/6
 - 0s - loss: 0.01

## Forecasting with LSTM

In [11]:
def lstm_plot(ETFs):
    """Plot true, naive and LSTM-predicted closes for one ETF with error metrics.

    Reads the notebook-level frames ``df`` (true closes), ``naive`` (``df``
    shifted by one row) and ``mapframe_preds`` (LSTM test-period predictions),
    so the cells that create them must have been run first. Metrics are
    computed over the dates in ``mapframe_preds.index``.

    Parameters
    ----------
    ETFs : str
        Ticker / column name to plot; must be a column of all three frames.
    """
    pred_dates = mapframe_preds.index
    actual = df.loc[pred_dates, ETFs].values
    naive_vals = naive.loc[pred_dates, ETFs].values
    lstm_vals = mapframe_preds[ETFs].values

    def _scores(estimate):
        # mean_squared_error returns MSE; take the square root so the value
        # shown under the "RMSE" label really is an RMSE.
        rmse = round(float(np.sqrt(mean_squared_error(actual, estimate))), 3)
        r2 = round(r2_score(actual, estimate), 3)
        mae = round(mean_absolute_error(actual, estimate), 3)
        return rmse, r2, mae

    nav_rmse, nav_r2, nav_mae = _scores(naive_vals)
    rmse, r2, mae = _scores(lstm_vals)

    true = go.Scatter(x=df.index, y=df[ETFs].values, mode = 'markers', name = 'True Value')
    pred = go.Scatter(x=pred_dates, y=lstm_vals, mode = 'markers', name = 'Prediction')
    nav = go.Scatter(x=pred_dates, y=naive_vals, mode = 'markers', name = 'Naive')
    # Fully transparent trace whose only job is to carry the metric text in the legend.
    # Arguments follow the template order: RMSE, R-Squared, MAE for each model.
    fake = go.Scatter(x=['07-2018'], y=df[ETFs].values, opacity = 0, name = '<br>Naive Metrics:<br>RMSE: {}<br>R-Squared: {}<br>MAE: {}<br><br>LSTM Metrics:<br>RMSE: {}<br>R-Squared: {}<br>MAE: {}'.format(nav_rmse,nav_r2,nav_mae,rmse,r2,mae))
    trace = [true, nav, pred, fake]
    # Open the view on the prediction window instead of a fixed 2018 range, so the
    # naive and LSTM points are visible whatever split date was used.
    x_window = [str(pred_dates.min()), str(pred_dates.max())]
    layout = dict(title = "{} Prices".format(ETFs), xaxis = dict(range = x_window), yaxis=dict(autorange=True, showgrid=True))
    fig = dict(data=trace, layout=layout)
    iplot(fig)
# Wire lstm_plot to a widget that offers the entries of etf_list as choices for ETFs.
interact(lstm_plot, ETFs=etf_list)

<function __main__.lstm_plot>