In [86]:
%matplotlib inline
import matplotlib.pyplot as plt

In [87]:
from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras import regularizers

In [88]:
import numpy as np

### Reference
* Blog: [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)

### Load market data from Quandl

In [89]:
import quandl # pip install quandl
import pandas as pd

In [90]:
def qData(tick='XLU', start_date="2003-01-01", end_date="2016-12-31"):
    """Fetch one daily series for an NYSE-listed ticker from Quandl.

    Parameters
    ----------
    tick : str
        Ticker symbol; it is spliced into the Quandl code
        ``GOOG/NYSE_<tick>.4`` (column 4 of that dataset).
    start_date, end_date : str
        Inclusive date range forwarded to ``quandl.get``. The defaults
        reproduce the range this notebook was originally written with.

    Returns
    -------
    Whatever ``quandl.get`` returns for that code. The calling cell treats
    it as a single-column DataFrame whose column is named 'Close'.
    """
    # Other code patterns that were tried: GOOG/NYSE_XLU.4, WIKI/MSFT.4
    qtck = "GOOG/NYSE_{}.4".format(tick)
    return quandl.get(qtck,
                      start_date=start_date,
                      end_date=end_date,
                      collapse="daily")

In [91]:
# Alternative universe, kept for reference only (not used below):
#   MSFT, JPM, INTC, DOW, KO,
#   MCD, CAT, WMT, MMM, AXP,
#   BA, GE, XOM, PG, JNJ
#
# Active universe: every symbol here is downloaded through qData().
TICKERS = [
    'XLU', 'XLF', 'XLK',
    'XLY', 'XLV', 'XLB',
    'XLE', 'XLP', 'XLI',
]

In [92]:
# Reuse the download cache `D` when an earlier run of this cell left one in the
# kernel, so re-running the cell does not hit Quandl again for the same tickers.
try:
    D.keys()
except (NameError, AttributeError):
    # NameError: `D` does not exist yet (fresh kernel).
    # AttributeError: `D` is bound to something that is not dict-like.
    # Any other exception (e.g. KeyboardInterrupt) is allowed to propagate.
    print('create empty Quandl cache')
    D = {}

# Download only the tickers that are not cached yet.
for tckr in TICKERS:
    if tckr not in D:
        print(tckr)
        # Name the 'Close' column after its ticker so each frame carries a
        # distinct column label.
        D[tckr] = qData(tckr).rename(columns={'Close': tckr})

# Sanity check: every cached frame has exactly one column, named after its
# ticker. Comparing plain lists avoids the ambiguous truth value an
# element-wise Index comparison produces when the lengths differ.
for tck in D:
    assert list(D[tck].columns) == [tck], tck

In [93]:
# One (rows, columns) tuple per cached frame, in the cache's iteration order.
for frame in D.values():
    print(frame.shape)

(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)
(3538, 1)


In [94]:
# Build one wide frame with a column per ticker by joining every remaining
# frame onto the first one (DataFrame.join aligns on the index and defaults
# to a left join). Starting the fold from the first frame alone also covers
# a one-element TICKERS list, which the special-cased first join did not.
J = D[TICKERS[0]]
for tck in TICKERS[1:]:
    J = J.join(D[tck])

In [95]:
# Peek at the first rows of the joined frame (head() defaults to 5 rows).
J.head()

Unnamed: 0_level_0,XLU,XLF,XLK,XLY,XLV,XLB,XLE,XLP,XLI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003-01-02,19.6,22.8,15.6,23.98,27.27,20.36,22.8,20.32,21.12
2003-01-03,19.8,22.78,15.66,23.43,27.55,20.25,22.76,20.3,21.19
2003-01-06,20.69,23.55,16.35,23.86,27.85,20.64,22.95,20.45,21.38
2003-01-07,20.24,23.29,16.52,23.73,27.45,20.57,22.2,20.27,21.26
2003-01-08,20.38,23.05,15.98,23.51,27.24,20.04,21.99,20.15,21.0


In [96]:
# Number of missing values in each ticker column.
J.isnull().sum(axis=0)

XLU    0
XLF    0
XLK    0
XLY    0
XLV    0
XLB    0
XLE    0
XLP    0
XLI    0
dtype: int64

In [97]:
# Forward-fill gaps so a missing price does not turn into a NaN return.
J2 = J.ffill()

# Log returns between consecutive rows: log(P_t) - log(P_{t-1}).
# The first row has no predecessor, so its diff is NaN and is sliced off.
LogDiffJ = np.log(J2).diff(periods=1, axis=0).iloc[1:]
print(LogDiffJ.shape)

# Plain numpy array of the returns. The explicit copy matters: `.values` may
# hand back a view onto the frame's own data, and the array is shuffled in
# place further down, which would otherwise reorder LogDiffJ as well.
# (`.values` replaces `as_matrix()`, which newer pandas no longer provides.)
MktData = LogDiffJ.values.copy()
print(MktData.shape)

(3537, 9)
(3537, 9)


In [98]:
# Seeded generator so a fresh "Restart & Run All" always yields the same
# train/test partition.
split_rng = np.random.RandomState(0)

# Shuffle the rows into a *new* array and rebind the name. An in-place
# shuffle could also reorder the DataFrame the array was taken from, because
# that array may be a view onto the frame's data.
MktData = split_rng.permutation(MktData)

# First `split_index` shuffled rows are used for training, the rest for testing.
split_index = 3000
# The factor 100 rescales the log returns (per-column std is then of order 1).
x_train = MktData[:split_index, :] * 100
x_test = MktData[split_index:, :] * 100

In [99]:
# Standard deviation of each column of the scaled training set.
x_train.std(axis=0)

array([ 1.0982247 ,  2.03701577,  1.29596141,  1.32517727,  1.04548284,
        1.53840115,  1.77989192,  0.83855434,  1.3078465 ])

## Linear auto-encoder : like PCA
### We get a linear model by removing activation functions

In [100]:
# Number of input features, taken from the training matrix rather than
# hard-coded (9 columns for the current ticker list).
original_dim = x_train.shape[1]

# Size of the encoded (bottleneck) representation.
encoding_dim = 3

# Set to True to put an L1 penalty on the encoder's activations (sparsity
# constraint). Note that 10e-5 == 1e-4. Keras 2 spells the activity penalty
# `regularizers.l1`; the Keras 1 helper `activity_l1` does not exist there.
use_sparsity = False
activity_penalty = regularizers.l1(10e-5) if use_sparsity else None

# Input placeholder.
input_data = Input(shape=(original_dim,))

# activation=None keeps the layer linear; activity_regularizer=None (the
# Dense default) means no sparsity penalty.
encoded = Dense(encoding_dim, activation=None,
                activity_regularizer=activity_penalty)(input_data)

# "decoded" is the lossy reconstruction of the input (also linear).
decoded = Dense(original_dim, activation=None)(encoded)

# Maps an input to its reconstruction.
autoencoder = Model(inputs=input_data, outputs=decoded)

# Maps an input to its encoded representation.
encoder = Model(inputs=input_data, outputs=encoded)

# Stand-alone decoder: a placeholder for an encoded (encoding_dim-dimensional)
# input, fed through the autoencoder's last layer so the trained weights are
# shared rather than copied.
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

In [101]:
# train autoencoder to reconstruct Stock returns
# use L2 loss
# Reconstruction objective: mean squared error (L2 loss), optimised with Adadelta.
autoencoder.compile(
    loss='mean_squared_error',
    optimizer='adadelta',
)

In [123]:
# The network is trained to reproduce its own input, so x_train is passed as
# both the input and the target; x_test plays the same double role as
# validation data.
autoencoder.fit(
    x_train,
    x_train,
    batch_size=128,
    epochs=50,
    validation_data=(x_test, x_test),
    shuffle=True,
)

Train on 3000 samples, validate on 537 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f17f8af4590>

In [124]:
# Encode the held-out rows down to `encoding_dim` values each, then decode
# them back to the original dimension.
# Note that the rows come from the *test* set, not the training set.
encoded_data = encoder.predict(x_test)
decoded_data = decoder.predict(encoded_data)

In [125]:
# Per-column correlation between the actual test values and their
# reconstruction. np.corrcoef returns a 2x2 matrix; [0, 1] is the
# cross-correlation. The explicit format string prints "index value" under
# both Python 2 and Python 3.
for i in range(original_dim):
    corr = np.corrcoef(x_test[:, i], decoded_data[:, i])[0, 1]
    print('%d %s' % (i, corr))

0 0.769975033646
1 0.99518747652
2 0.901590510518
3 0.947480846807
4 0.82774934141
5 0.900718119215
6 0.989944764408
7 0.854005961685
8 0.930788316892


In [126]:
# Residual left after reconstruction.
decoding_error = x_test - decoded_data

# Per-column correlation between the reconstruction and its residual.
# np.corrcoef returns a 2x2 matrix; [0, 1] is the cross-correlation. The
# explicit format string prints "index value" under both Python 2 and 3.
for i in range(original_dim):
    corr = np.corrcoef(decoded_data[:, i], decoding_error[:, i])[0, 1]
    print('%d %s' % (i, corr))

0 0.11938556286
1 -0.115034789311
2 -0.0466444153257
3 0.241456431721
4 -0.140337077375
5 -0.100318098446
6 0.106878897453
7 0.0370390714887
8 -0.0743364310779
