In [367]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [108]:
# Load the fund list; its `fund_name` column supplies the ticker symbols requested below.
vg_funds = pd.read_csv('data/fund_list.csv')

Unnamed: 0,fund_name,asset_class
0,EDF,Bond - Long Term Government
1,BIV,Bond - Inter Term Government
2,VGIT,Bond - Inter Term Investment
3,BLV,
4,VGLT,
5,VMBS,
6,BSV,
7,VTIP,
8,VGSH,
9,BND,


In [341]:
import os

import quandl

# The API key is a credential and must not be committed with the notebook.
# Export QUANDL_API_KEY in the environment before starting the kernel.
_quandl_api_key = os.environ.get('QUANDL_API_KEY')
if not _quandl_api_key:
    raise RuntimeError(
        "Set the QUANDL_API_KEY environment variable before running this notebook"
    )
quandl.ApiConfig.api_key = _quandl_api_key

def get_data(symbols, date_range):
    """Download Sharadar SFP rows for the given tickers, one request per date.

    Parameters
    ----------
    symbols : list of str
        Ticker symbols, passed to Quandl as the ``ticker`` filter.
    date_range : sized iterable of dates
        Dates to query; ``quandl.get_table('SHARADAR/SFP', ...)`` is called
        once for each element.

    Returns
    -------
    pandas.DataFrame
        All returned rows concatenated and indexed by ``(ticker, date)``.
        Every column Quandl returns is kept (the previous index is preserved
        as a column by ``reset_index``); callers drop what they do not need.

    Raises
    ------
    ValueError
        If ``date_range`` is empty.
    """
    if len(date_range) == 0:
        # Fail early with a clear message instead of the opaque
        # "No objects to concatenate" error from pd.concat.
        raise ValueError("date_range must contain at least one date")

    dfs = []
    pbar = tqdm(total=len(date_range))
    try:
        for date in date_range:
            dfs.append(quandl.get_table('SHARADAR/SFP', date=date, ticker=symbols))
            pbar.update(1)
    finally:
        # Close the bar even when a request fails or the cell is interrupted.
        pbar.close()

    data = pd.concat(dfs)
    return data.reset_index().set_index(['ticker', 'date'])

In [204]:
import datetime

# Training period: every calendar day from 2012-01-01 through 2018-01-01.
start_date = datetime.datetime(2012, 1, 1)
end_date = datetime.datetime(2018, 1, 1)
dates = pd.date_range(start=start_date, end=end_date)
symbols = vg_funds['fund_name'].tolist()

# Download everything, then discard the columns that are not used as features.
unused_columns = ['None', 'lastupdated', 'dividends', 'closeunadj']
fund_prices = get_data(symbols, dates).drop(columns=unused_columns)

In [205]:
# Number of (ticker, date) rows downloaded for the training period.
print(len(fund_prices))

79113


In [222]:
# Preview the first rows; the frame is indexed by (ticker, date).
print(fund_prices.head())

                    open    high    low  close     volume
ticker date                                              
BIV    2012-01-03  86.62  87.007  86.62  86.90   379600.0
BLV    2012-01-03  91.24  91.850  90.88  91.51   171000.0
BND    2012-01-03  83.43  83.430  83.18  83.28  2282600.0
BSV    2012-01-03  80.93  80.930  80.75  80.84   786300.0
EDF    2012-01-03  21.28  21.630  21.21  21.63    69000.0


In [392]:
# `lookback` was not defined anywhere before this cell, so on a fresh kernel
# the slice below raised NameError. 5 matches the five rows of returns shown
# in the recorded output.
lookback = 5

# Pick one random (ticker, date) row and show that ticker's last `lookback`
# row-over-row percentage changes up to and including that date.
sample = fund_prices.sample(n=1).iloc[0]
ticker, as_of = sample.name

pre = fund_prices.loc[ticker].loc[:as_of]
# One extra row is kept because pct_change() leaves the first row as NaN.
pre = pre.reset_index().iloc[-lookback - 1:]
pre = pre.drop(['date'], axis=1)
pre = pre.pct_change().dropna()

pre

Unnamed: 0,open,high,low,close,volume
970,-0.009581,-0.004476,0.000537,0.001572,-0.188654
971,0.00694,0.001255,0.002095,-0.003872,-0.301642
972,-0.011174,-0.008668,-0.01355,-0.01355,1.239675
973,-0.010561,-0.011798,-0.011394,-0.010755,0.417847
974,-0.010247,0.004797,-0.001185,0.014316,-0.372913


In [412]:
from sklearn import preprocessing

def create_dataset(fund_prices,
                   n_samples=10000,
                   lookback=60,
                   window=30,
                   random_state=None):
    """Build supervised (history, future close) pairs from a price table.

    Rows of ``fund_prices`` are sampled at random. For each sampled
    ``(ticker, date)`` the input is that ticker's last ``lookback`` rows up to
    and including ``date`` (all columns), and the target is the ``close``
    value of the ``window`` rows that follow. Samples without enough history
    or enough future rows are skipped, so fewer than ``n_samples`` pairs may
    be returned.

    Parameters
    ----------
    fund_prices : pandas.DataFrame
        Indexed by ``(ticker, date)``; must contain a ``close`` column.
    n_samples : int
        Number of rows to draw (without replacement).
    lookback : int
        Number of history rows per input sample.
    window : int
        Number of future rows per target.
    random_state : int or None
        Forwarded to ``DataFrame.sample`` so a run can be reproduced.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_kept, lookback, n_columns)`` and ``y`` with
        shape ``(n_kept, window)``. When no sample qualifies the arrays are
        empty but keep these trailing dimensions.
    """
    samples = fund_prices.sample(n=n_samples, random_state=random_state)

    X = []
    y = []
    # The MultiIndex lookup of one ticker's history is the expensive step and
    # does not depend on the sampled date, so do it once per ticker.
    history_by_ticker = {}

    pbar = tqdm(total=len(samples))
    try:
        for ticker, date in samples.index:
            history = history_by_ticker.get(ticker)
            if history is None:
                history = fund_prices.loc[ticker]
                history_by_ticker[ticker] = history

            # Label slices are inclusive: `pre` ends on the sampled date and
            # `post` starts on it, hence the leading row skipped by iloc[1:].
            pre = history.loc[:date].reset_index().iloc[-lookback:]
            post = history.loc[date:].reset_index().iloc[1:window + 1]

            if len(pre) == lookback and len(post) == window:
                X.append(pre.drop(['date'], axis=1).values)
                y.append(post['close'].values)

            pbar.update(1)
    finally:
        # Close the bar even if the loop is interrupted.
        pbar.close()

    if not X:
        # Keep the trailing dimensions so callers can still read X.shape[1:].
        n_features = fund_prices.shape[1]
        return np.empty((0, lookback, n_features)), np.empty((0, window))
    return np.array(X), np.array(y)

In [408]:
from sklearn import preprocessing

# Fit the scaler on the training prices and keep it: the same fitted scaler
# is applied to the test prices further down.
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(fund_prices)
fund_prices_scaled = pd.DataFrame(
    scaled_values,
    index=fund_prices.index,
    columns=fund_prices.columns,
)

In [422]:
# Build training arrays from the scaled prices: 15-row targets, default lookback.
X_train, y_train = create_dataset(fund_prices_scaled, n_samples=70000, window=15)

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))

KeyboardInterrupt: 

In [424]:
# X is (samples, lookback, columns); y is (samples, window).
print(X_train.shape)
print(y_train.shape)

(951, 60, 5)
(951, 15)


In [415]:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

def build_model(X, y, lstm_units=256, dropout=0.5):
    """Build and compile a two-layer LSTM regressor sized from the data.

    Parameters
    ----------
    X : array with shape (samples, timesteps, features)
        Only the shape is used: it fixes the input shape, and the first LSTM
        layer has ``timesteps`` units.
    y : array with shape (samples, horizon)
        Only the shape is used: the output layer has ``horizon`` units.
    lstm_units : int
        Units in the second LSTM layer.
    dropout : float
        Dropout rate applied after each LSTM layer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with MSE loss and the Adam optimizer. The layer
        summary is printed as a side effect.
    """
    model = Sequential()
    # `units` is the Keras 2 name of the deprecated `output_dim` argument.
    model.add(LSTM(units=X.shape[1],
                   input_shape=(X.shape[1], X.shape[2]),
                   return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout))
    model.add(Dense(y.shape[1]))
    model.add(Activation("linear"))
    model.compile(loss="mse", optimizer="adam")
    model.summary()
    return model

In [416]:
# Size the network from the training arrays (only their shapes are used).
model = build_model(X=X_train, y=y_train)

  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 60, 60)            15840     
_________________________________________________________________
dropout_9 (Dropout)          (None, 60, 60)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 256)               324608    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 15)                3855      
_________________________________________________________________
activation_5 (Activation)    (None, 15)                0         
Total params: 344,303
Trainable params: 344,303
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 5 epochs on shuffled mini-batches of 64 samples.
model.fit(X_train, y_train, batch_size=64, shuffle=True, epochs=5)

Evaluate model

In [None]:
# Predict on the first 20 training samples (an in-sample check, not held-out data).
predictions = model.predict(X_train[:20])

In [None]:
example = 0

# Take the history length from the data instead of hard-coding 60, so the
# plot keeps working when create_dataset is called with another lookback.
n_history = X_train.shape[1]
# Position of 'close' on the feature axis (columns: open, high, low, close, volume).
close_col = 3

real = np.concatenate((X_train[example, :, close_col], y_train[example]))
# NaN over the history part so the prediction is drawn only over the forecast rows.
predicted = np.full(len(real), np.nan)
predicted[n_history:] = predictions[example]

df = pd.DataFrame({'real': real, 'predicted': predicted})
ax = df.plot(title='Scaled close: actual vs. predicted')
ax.set_xlabel('time step')
ax.set_ylabel('scaled close')
plt.show()

In [262]:
# Test period: every calendar day from 2018-01-01 through 2018-06-01.
start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2018, 6, 1)
dates = pd.date_range(start=start_date, end=end_date)
symbols = vg_funds['fund_name'].tolist()

unused_columns = ['None', 'lastupdated', 'dividends', 'closeunadj']
fund_prices_test = get_data(symbols, dates).drop(columns=unused_columns)

# Apply the already-fitted scaler (transform only, no refit) to the test prices.
scaled_test_values = scaler.transform(fund_prices_test)
fund_prices_test_scaled = pd.DataFrame(
    scaled_test_values,
    index=fund_prices_test.index,
    columns=fund_prices_test.columns,
)

100%|██████████| 152/152 [00:43<00:00,  3.95it/s]


In [266]:
# Use the same lookback and forecast window as the training arrays. With the
# defaults, y_test would have 30 columns while the model, built from y_train,
# outputs 15, so the two could not be compared.
X_test, y_test = create_dataset(fund_prices_test_scaled,
                                n_samples=5000,
                                lookback=X_train.shape[1],
                                window=y_train.shape[1])

100%|██████████| 5000/5000 [00:18<00:00, 274.29it/s]
