In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

In [3]:
# Fund list; its 'fund_name' column is used further down as the list of
# ticker symbols requested from Quandl.
vg_funds = pd.read_csv('data/fund_list.csv')

In [4]:
import os

import quandl

# The API key is read from the environment so the secret never lives in the
# notebook or its version history. Export QUANDL_API_KEY before starting the
# kernel. A key that was ever committed in plain text should be rotated.
_quandl_api_key = os.environ.get('QUANDL_API_KEY')
if not _quandl_api_key:
    raise RuntimeError(
        'Set the QUANDL_API_KEY environment variable before running this notebook.'
    )
quandl.ApiConfig.api_key = _quandl_api_key

def get_data(symbols, date_range):
    """Download the SHARADAR/SFP table from Quandl for the given symbols and dates.

    One ``quandl.get_table`` request is issued per entry of ``date_range`` and
    the per-date frames are concatenated.

    Parameters
    ----------
    symbols : list
        Tickers passed to the table's ``ticker`` filter.
    date_range : sized iterable
        Dates to query; each one is passed to the table's ``date`` filter.

    Returns
    -------
    pandas.DataFrame
        Every column the table returns (no column selection happens here),
        indexed by ``('ticker', 'date')``. The row index of the downloaded
        frames is turned into an ordinary column by ``reset_index``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``date_range`` is empty.
    """
    dfs = []

    # The context manager closes the progress bar even when a request raises.
    with tqdm(total=len(date_range)) as pbar:
        for date in date_range:
            dfs.append(quandl.get_table('SHARADAR/SFP', date=date, ticker=symbols))
            pbar.update(1)

    data = pd.concat(dfs)
    return data.reset_index().set_index(['ticker', 'date'])

In [5]:
import datetime

# Training period: one entry per calendar day between the two bounds.
start_date = datetime.datetime(2012, 1, 1)
end_date = datetime.datetime(2018, 1, 1)
dates = pd.date_range(start=start_date, end=end_date)
symbols = list(vg_funds['fund_name'])

# Download, then discard the columns that are not used as model features.
fund_prices = get_data(symbols, dates).drop(
    columns=['None', 'lastupdated', 'dividends', 'closeunadj']
)

HBox(children=(IntProgress(value=0, max=2193), HTML(value='')))




In [6]:
print(len(fund_prices))

79113


In [7]:
print(fund_prices.head())

                    open    high    low  close     volume
ticker date                                              
BIV    2012-01-03  86.62  87.007  86.62  86.90   379600.0
BLV    2012-01-03  91.24  91.850  90.88  91.51   171000.0
BND    2012-01-03  83.43  83.430  83.18  83.28  2282600.0
BSV    2012-01-03  80.93  80.930  80.75  80.84   786300.0
EDF    2012-01-03  21.28  21.630  21.21  21.63    69000.0


In [9]:
from sklearn.preprocessing import StandardScaler

# Row-over-row relative change of every column, computed separately for each
# ticker (index level 0).
fund_pct_change = fund_prices.groupby(level=0).pct_change()

# Division by a zero previous value produces +/-inf; treat those as missing.
fund_pct_change = fund_pct_change.replace([np.inf, -np.inf], np.nan)

# Forward-fill within each ticker only. Rows of different tickers are
# interleaved (see the head() output above), so an ungrouped ffill() would copy
# another fund's values into the gap.
fund_pct_change = fund_pct_change.groupby(level=0).ffill()

# Whatever is still missing (e.g. the first observation of each ticker, which
# has no previous row) cannot be filled and is dropped.
fund_pct_change = fund_pct_change.dropna(how='any')

scaler = StandardScaler()
scaler.fit(fund_pct_change)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [50]:
from sklearn import preprocessing

def get_trend(cumsum):
    """One-hot encode a cumulative change into one of six trend classes.

    Position of the ``1.`` in the returned six-element list:

        0  'very low'       everything that fails all tests below
        1  'low'            -.05 <  cumsum <= -.02
        2  'below average'  -.02 <  cumsum <  .005
        3  'above average'  .005 <= cumsum <= .03
        4  'high'           .03  <  cumsum <= .06
        5  'very high'      cumsum > .06

    A new list is returned on every call.
    """
    # Tested from the highest class downwards; the first test that holds wins.
    checks = (
        (5, cumsum > .06),
        (4, cumsum > .03),
        (3, cumsum >= .005),
        (2, cumsum > -.02),
        (1, cumsum > -.05),
    )
    hot = next((position for position, passed in checks if passed), 0)
    return [1. if position == hot else 0. for position in range(6)]

def create_dataset(fund_prices,
                   n_samples=10000,
                   lookback=60,
                   window=30,
                   random_state=None):
    """Build (X, y) arrays from randomly sampled (ticker, date) rows.

    For each sampled row, the features are the last ``lookback`` rows of that
    ticker up to and including the sampled date, transformed with the
    notebook-level ``scaler``. The label is ``get_trend`` applied to the
    cumulative sum of the ``'close'`` column over the ``window`` rows that
    follow the sampled date. Samples lacking a full lookback or a full window
    are skipped, so fewer than ``n_samples`` examples may be returned.

    Parameters
    ----------
    fund_prices : pandas.DataFrame
        Frame indexed by (ticker, date) with a ``'close'`` column. In this
        notebook it is called with per-ticker percentage changes.
    n_samples : int
        Number of rows to draw (without replacement) from ``fund_prices``.
    lookback : int
        Number of rows in each input sequence.
    window : int
        Number of following rows summed to produce the label.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the draw can be reproduced.
        ``None`` (default) keeps the previous unseeded behaviour.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks the scaled input windows, ``y`` the one-hot labels.
    """
    samples = fund_prices.sample(n=n_samples, random_state=random_state)

    X = []
    y = []

    # Each ticker's frame is extracted once and reused for all of its samples
    # instead of being looked up twice per sample.
    history_by_ticker = {}

    # The context manager closes the progress bar even if a sample raises.
    with tqdm(total=len(samples)) as pbar:
        for key in samples.index:
            ticker, date = key[0], key[1]

            history = history_by_ticker.get(ticker)
            if history is None:
                history = fund_prices.loc[ticker]
                history_by_ticker[ticker] = history

            # Label slicing is inclusive, so `pre` ends on the sampled date and
            # `post` starts on it; iloc[1:] then skips the sampled date itself.
            pre = history.loc[:date].iloc[-lookback:]
            post = history.loc[date:].iloc[1:window + 1]['close']

            if len(pre) == lookback and len(post) == window:
                X.append(scaler.transform(pre.values))
                y.append(get_trend(post.cumsum().iloc[-1]))

            pbar.update(1)

    return np.array(X), np.array(y)

In [51]:
X_train, y_train = create_dataset(fund_pct_change, n_samples=70000, window=15)

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))

In [54]:
print(X_train.shape)
print(y_train.shape)

(66349, 60, 5)
(66349, 6)


In [56]:
y_train

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [57]:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

def build_model(X, y):
    """Build, compile and summarise a two-layer LSTM classifier sized from the data.

    Parameters
    ----------
    X : array
        Only ``X.shape[1]`` (sequence length) and ``X.shape[2]`` (features per
        step) are read; they define the input shape, and the sequence length
        is also used as the width of the first LSTM layer.
    y : array
        Only ``y.shape[1]`` is read; it sets the number of softmax outputs.

    Returns
    -------
    keras.models.Sequential
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    n_steps, n_features = X.shape[1], X.shape[2]

    model = Sequential()
    # `units` is the current name of the argument formerly spelled
    # `output_dim`; using it avoids the legacy-API warning.
    model.add(LSTM(units=n_steps, input_shape=(n_steps, n_features), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(256))
    model.add(Dropout(0.5))
    model.add(Dense(y.shape[1]))
    model.add(Activation("softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.summary()
    return model

In [58]:
model = build_model(X=X_train, y=y_train)

  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 60, 60)            15840     
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 60)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               324608    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1542      
_________________________________________________________________
activation_1 (Activation)    (None, 6)                 0         
Total params: 341,990
Trainable params: 341,990
Non-trainable params: 0
_________________________________________________________________


In [59]:
model.fit(X_train, y_train, batch_size=64, shuffle=True, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a1513dfd0>

Evaluate model

In [None]:
predictions = model.predict(X_train[:20])

In [None]:
example = 0

# Length of the input sequence, taken from the data instead of hard-coding 60
# so the cell keeps working when a different lookback is used.
n_steps = X_train.shape[1]

# Feature column 3 of the example's input window ('close' in the column order
# printed by fund_prices.head()), followed by the example's one-hot label.
real = np.concatenate((X_train[example, :, 3], y_train[example]))

# NaN over the input window so that only the predicted class probabilities are
# drawn, aligned with the label part of `real`.
predicted = np.full(len(real), np.nan)
predicted[n_steps:] = predictions[example]

df = pd.DataFrame({'real': real, 'predicted': predicted})
df.plot()
plt.show()

In [262]:
start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2018, 6, 1)
dates = pd.date_range(start_date, end_date)
symbols = list(vg_funds['fund_name'])

fund_prices_test = get_data(symbols, dates)
fund_prices_test = fund_prices_test.drop(['None', 'lastupdated', 'dividends', 'closeunadj'], axis=1)

# `scaler` was fitted on per-ticker percentage changes, and create_dataset()
# already applies it to every input window. The test prices therefore go
# through the same percentage-change preprocessing as the training data and
# are NOT scaled here: scaling raw prices with that scaler (and then scaling
# again inside create_dataset) would produce meaningless features and labels.
fund_pct_change_test = (
    fund_prices_test
    .groupby(level=0).pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .groupby(level=0).ffill()   # fill within a ticker only
    .dropna(how='any')
)

# Name kept because the following cell passes it to create_dataset().
fund_prices_test_scaled = fund_pct_change_test

100%|██████████| 152/152 [00:43<00:00,  3.95it/s]


In [266]:
# Use the same label horizon as the training set (window=15); with the default
# window=30 the test labels would describe a different horizon than the one
# the model was trained to predict.
X_test, y_test = create_dataset(fund_prices_test_scaled, n_samples=5000, window=15)

100%|██████████| 5000/5000 [00:18<00:00, 274.29it/s]
