In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [2]:
vg_funds = pd.read_csv('data/fund_list.csv')

In [3]:
import quandl
quandl.ApiConfig.api_key = 'rb8eEU4go9wAPCe7Woq8'

def get_data(symbols, 
             start_date='2017-01-01',
             end_date='2017-12-31',
             columns=['ticker', 'date', 'close', 'open', 'low', 'high', 'volume']):
    
    """Read stock data (adjusted close) for given symbols from Quandl data."""
    
    data = quandl.get_table('SHARADAR/SFP', 
                            qopts = { 'columns': columns }, 
                            ticker = symbols, 
                            date = { 'gte': '2008-01-01', 'lte': '2017-12-31' }, 
                            paginate=True)
    data = data.set_index(['ticker', 'date'])
    
    return data

In [4]:
import datetime

symbols = list(vg_funds['fund_name'])
fund_prices = get_data(symbols, start_date='2008-01-01', end_date='2017-12-31', )

In [5]:
print(len(fund_prices))

121824


In [6]:
print(fund_prices.head())

                   close   open     low   high     volume
ticker date                                              
BIV    2008-01-02  77.99  78.97  77.418  78.97    63600.0
BLV    2008-01-02  77.56  77.00  76.890  77.63    31600.0
BND    2008-01-02  77.36  78.26  76.910  78.26  2723300.0
BSV    2008-01-02  77.39  77.25  77.120  77.40    24200.0
MGC    2008-01-02  49.34  50.00  49.240  50.00    17700.0


In [7]:
from sklearn.preprocessing import StandardScaler

fund_pct_change = fund_prices.groupby(level=0).pct_change()
fund_pct_change.replace([np.inf, -np.inf], np.nan, inplace=True)
fund_pct_change.ffill(inplace=True)
fund_pct_change.dropna(how='any', inplace=True)

scaler = StandardScaler()
scaler.fit(fund_pct_change)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
from sklearn import preprocessing

def get_trend(cumsum):
    if cumsum > .06: 
#         return 'very high'
        return [0., 0., 0., 0., 0., 1.]
    elif cumsum > .03: 
#         return 'high'
        return [0., 0., 0., 0., 1., 0.]
    elif cumsum >= .005: 
#         return 'above average'
        return [0., 0., 0., 1., 0., 0.]
    elif cumsum > -.02: 
#         return 'below average'
        return [0., 0., 1., 0., 0., 0.]
    elif cumsum > -.05:
#         return 'low'
        return [0., 1., 0., 0., 0., 0.]
    else: 
#         return 'very low'
         return [1., 0., 0., 0., 0., 0.]

def create_dataset(fund_prices, 
                   n_samples=10000, 
                   lookback=60, 
                   window=30):
    
    samples = fund_prices.sample(n=n_samples)
    
    X = []
    y = []
    
    
    pbar = tqdm(total=len(samples))
    for index, row in samples.iterrows():
        
        pre = fund_prices.loc[index[0]].loc[:index[1]]
        pre = pre.reset_index().iloc[-lookback:]
        post = fund_prices.loc[index[0]].loc[index[1]:]
        post = post.reset_index().iloc[1:window+1]
        
        pre = pre.drop(['date'], axis=1)
        pre = pre.values

        post = post.drop(['date'], axis=1)
        post = post['close']
        
        if len(pre) == lookback and len(post) == window:
            X.append(scaler.transform(pre))
            y.append(get_trend(post.cumsum().iloc[-1]))
        
        pbar.update(1)
    
    return np.array(X), np.array(y)

In [9]:
X_train, y_train = create_dataset(fund_pct_change, n_samples=120000, window=15)

HBox(children=(IntProgress(value=0, max=120000), HTML(value='')))

In [10]:
print(X_train.shape)
print(y_train.shape)

(115947, 60, 5)
(115947, 6)


In [22]:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import CuDNNLSTM, InputLayer
# from keras.layers.recurrent import LSTM
from keras.models import Sequential

def build_model(X, y, session=None):
    if session is not None:
        from keras import backend as K
        K.set_session(sess)
    
    model = Sequential()
    # model.add(LSTM(input_shape=(X.shape[1], X.shape[2]), output_dim=X.shape[1], return_sequences = True))
    model.add(CuDNNLSTM(X.shape[1], return_sequences = True))
    model.add(Dropout(0.5))
    # model.add(LSTM(256))
    # model.add(CuDNNLSTM(256))
    # model.add(Dropout(0.5))
    model.add(Dense(y.shape[1]))
    model.add(Activation("softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="sgd")
    # model.summary()
    return model

In [23]:
model = build_model(X=X_train, y=y_train)

In [24]:
model.fit(X_train, y_train, batch_size=128, shuffle=True, epochs=50, verbose=0, callbacks=[TQDMNotebookCallback()])

ValueError: Error when checking target: expected activation_4 to have 3 dimensions, but got array with shape (115947, 6)

Evaluate model

In [None]:
predictions = model.predict(X_train[:20])

In [None]:
example = 0

real = np.concatenate((X_train[example, :, 3], y_train[example]))
predicted =  np.empty((len(real)))
predicted[:] = np.nan
predicted[60:] = predictions[example]

df = pd.DataFrame(np.vstack((real, predicted)).T)
df.plot()
plt.show()

In [262]:
start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2018, 6, 1)
dates = pd.date_range(start_date, end_date)
symbols = list(vg_funds['fund_name'])

fund_prices_test = get_data(symbols, dates)
fund_prices_test.drop(['None', 'lastupdated', 'dividends', 'closeunadj'], axis=1, inplace=True)

fund_prices_test_scaled = pd.DataFrame(scaler.transform(fund_prices_test), 
                                       index=fund_prices_test.index, 
                                       columns=fund_prices_test.columns)

100%|██████████| 152/152 [00:43<00:00,  3.95it/s]


In [266]:
X_test, y_test = create_dataset(fund_prices_test_scaled, n_samples=5000)

100%|██████████| 5000/5000 [00:18<00:00, 274.29it/s]
