In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [2]:
# import data/features (refer to Capstone 1 XGBoost model w/ multiple features for feature creation)
train = pd.read_csv('/Users/meiliu/Documents/SpringBoard Related/AAPL_train.csv', index_col=0)
test = pd.read_csv('/Users/meiliu/Documents/SpringBoard Related/AAPL_test.csv', index_col=0)

data = pd.concat([train, test], axis=0)
data.set_index('index', inplace=True)
data.tail()

Unnamed: 0_level_0,date,DCLRDT,DIVAMT,PRC,VOL,OPENPRC,NUMTRD,sprtrn,day,month,...,SPRD_moving_max_10,SPRD_moving_max_15,SPRD_moving_max_50,OCdiff_moving_max_5,OCdiff_moving_max_10,OCdiff_moving_max_15,OCdiff_moving_max_50,PRC_week_encode,PRC_month_encode,PRC_year_encode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1505,2018-12-24,0,0.0,146.83,37249292,148.14999,253083,-0.027112,Monday,12,...,0.05,0.05,0.12001,5.0,5.0,5.0,9.10001,117.691859,121.716203,190.372739
1506,2018-12-26,0,0.0,157.17,58584634,148.3,359309,0.049594,Wednesday,12,...,0.05,0.05,0.12001,5.0,5.0,5.0,9.10001,117.691859,121.716203,190.372739
1507,2018-12-27,0,0.0,156.14999,53117005,155.84,335536,0.008563,Thursday,12,...,0.05,0.05,0.12001,5.0,5.0,5.0,9.10001,117.691859,121.716203,190.372739
1508,2018-12-28,0,0.0,156.23,42291347,157.5,274455,-0.001242,Friday,12,...,0.05,0.05,0.12001,5.0,5.0,5.0,9.10001,117.691859,121.716203,190.372739
1509,2018-12-31,0,0.0,157.74001,35003466,158.53,207618,0.008492,Monday,12,...,0.05,0.05,0.12001,5.0,5.0,5.0,9.10001,110.597714,121.716203,190.372739


## Simple MultiStep LSTM Model Using Only Price

In [3]:
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [4]:
# split price data into train, validation, and test sets
X_train_lstm1 = data['PRC'][:-30]
y_train_lstm1 = data['PRC'][10:-20]

X_val_lstm1 = data['PRC'][-30:-20]
y_val_lstm1 = data['PRC'][-20:-10]

X_test_lstm1 = data['PRC'][-20:-10]
y_test_lstm1 = data['PRC'][-10:]

print(X_train_lstm1.shape, y_train_lstm1.shape, X_val_lstm1.shape, y_val_lstm1.shape, 
      X_test_lstm1.shape, y_test_lstm1.shape)

(1430,) (1430,) (10,) (10,) (10,) (10,)


In [5]:
# reshape data to t=10 timesteps
X_train_lstm1 = np.array(X_train_lstm1).reshape(int(len(X_train_lstm1)/10),10)
y_train_lstm1 = np.array(y_train_lstm1).reshape(int(len(y_train_lstm1)/10),10)

X_val_lstm1 = np.array(X_val_lstm1).reshape(int(len(X_val_lstm1)/10),10)
y_val_lstm1 = np.array(y_val_lstm1).reshape(int(len(y_val_lstm1)/10),10)

X_test_lstm1 = np.array(X_test_lstm1).reshape(1,10)
y_test_lstm1 = np.array(y_test_lstm1).reshape(1,10)

In [6]:
# scale data using MinMax
s = MinMaxScaler()
X_train_lstm1 = s.fit_transform(X_train_lstm1)
y_train_lstm1 = s.transform(y_train_lstm1)

X_val_lstm1 = s.transform(X_val_lstm1)
y_val_lstm1 = s.transform(y_val_lstm1)

X_test_lstm1 = s.transform(X_test_lstm1)
y_test_lstm1 = s.transform(y_test_lstm1)

In [18]:
# reshape X arrays to be 3D for LSTM layer
X_train_lstm1 = np.array(X_train_lstm1).reshape(len(X_train_lstm1),10,1)
X_val_lstm1 = np.array(X_val_lstm1).reshape(len(X_val_lstm1),10,1)
X_test_lstm1 = np.array(X_test_lstm1).reshape(1,10,1)

In [41]:
# LSTM model with single LSTM layer, 10 output nodes
def priceLSTM():
    model = Sequential()
    model.add(LSTM(units=10, activation='relu', input_shape=(10,1)))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.compile(loss='mse', optimizer='adam')
    
    return model

In [42]:
# fit model
model = priceLSTM()

lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', cooldown=0, min_lr=0.0001)
checkpoint = ModelCheckpoint('best_model_weights.hdf5',monitor='val_loss',save_best_only=True)
early_stopping_monitor = EarlyStopping(patience=30)

model.fit(x=X_train_lstm1, y=y_train_lstm1, validation_data=(X_val_lstm1, y_val_lstm1), epochs=100,
          callbacks=[lr_reducer, checkpoint, early_stopping_monitor])

Train on 143 samples, validate on 1 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100

Epoch 00066: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 67/100
Epoch 68/100

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100


<keras.callbacks.callbacks.History at 0x1a40cadef0>

In [43]:
# predict test set
y_pred_lstm1 = model.predict(X_test_lstm1)
y_pred_lstm1 = s.inverse_transform(y_pred_lstm1)

In [44]:
# RMSE calculation function
def rmse(pred, target):
    return np.sqrt(((pred - target) ** 2).mean())

In [45]:
print('LSTM w/ MinMax scaled Price:')
print(y_pred_lstm1, '\n-----')
print(s.inverse_transform(y_test_lstm1), '\n-----')
print('rmse:', rmse(y_pred_lstm1,s.inverse_transform(y_test_lstm1)))

LSTM w/ MinMax scaled Price:
[[165.23593 164.89723 161.01102 162.59477 164.37976 166.57533 168.54662
  161.13947 160.52977 164.9746 ]] 
-----
[[163.94    166.07001 160.89    156.83    150.73    146.83    157.17
  156.14999 156.23    157.74001]] 
-----
rmse: 9.152298596809286


The predictions and prediction accuracy are quite different across runs of the model. I'll use the average prediction RMSE across 30 different runs to estimate the RMSE of the model.

In [60]:
def LSTMfit(n_iter):
    rmse_list = []
    for i in range(n_iter):
        model = priceLSTM()
        lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=0, mode='auto', cooldown=0, min_lr=0.0001)
        checkpoint = ModelCheckpoint('best_model_weights.hdf5',monitor='val_loss',save_best_only=True)
        early_stopping_monitor = EarlyStopping(patience=30)

        model.fit(x=X_train_lstm1, y=y_train_lstm1, validation_data=(X_val_lstm1, y_val_lstm1), epochs=100,
              callbacks=[lr_reducer, checkpoint, early_stopping_monitor], verbose=0)
        y_pred_lstm1 = s.inverse_transform(model.predict(X_test_lstm1))
        rmse_list.append(rmse(y_pred_lstm1,s.inverse_transform(y_test_lstm1)))
    return np.mean(np.array(rmse_list))

In [62]:
avg_rmse = LSTMfit(30)
print('Avg RMSE across 30 iterations: ', avg_rmse)

Avg RMSE across 30 iterations:  11.595778983810366


## Simple Sequential Model w/ Categorical Embedding

### Preprocessing

In [63]:
# identify the target, categorical features, and continuous features
target = ['PRC']
cat_vars = ['DCLRDT', 'month','year','dayofweek', 'qtr', 'day365', 'dayofmonth',
            'weekofyear', 'startend','Announce']

drop=['date','VOL','OPENPRC','NUMTRD','sprtrn','day','HISPRD','SPRD', 'OCdiff']

cont_vars = list(data.columns)

cont_vars.remove(target[0])

for cat in cat_vars:
    cont_vars.remove(cat)

for d in drop:
    cont_vars.remove(d)

In [64]:
# Label encode each categorical feature
encoders = {}
for v in cat_vars:
    le = LabelEncoder()
    le.fit(data[v].values)
    encoders[v] = le
    data.loc[:, v] = le.transform(data[v].values)
    print('{0}: {1}'.format(v, le.classes_))

DCLRDT: [0 1]
month: [ 1  2  3  4  5  6  7  8  9 10 11 12]
year: [2013 2014 2015 2016 2017 2018]
dayofweek: [0 1 2 3 4]
qtr: [1 2 3 4]
day365: [  2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
 182 183 184 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 207 208 209 210 211 212 213 2

In [65]:
# split the data into training, validation, and test sets
X = data[cat_vars + cont_vars][:-110].copy()
y = data[target][:-110].copy()

X_val = data[cat_vars + cont_vars][-110:-10].copy()
y_val = data[target][-110:-10].copy()

X_test = data[cat_vars + cont_vars][-10:].copy()
y_test = data[target][-10:].copy()

In [66]:
# scale continuous variables using scaler fitted on only training data
scaler = MinMaxScaler()
X.loc[:, cont_vars] = scaler.fit_transform(X[cont_vars].values)
X_val.loc[:, cont_vars] = scaler.transform(X_val[cont_vars].values)
X_test.loc[:, cont_vars] = scaler.transform(X_test[cont_vars].values)

# scale target by max target
y_max = np.max(train['PRC'])
y = y/y_max
y_val = y_val/y_max
y_test = y_test/y_max

In [67]:
# change categorical features to type category, continuous features to type float32
for v in cat_vars:
    X[v] = X[v].astype('int').astype('category').cat.as_ordered()
    X_val[v] = X_val[v].astype('int').astype('category').cat.as_ordered()
    X_test[v] = X_test[v].astype('int').astype('category').cat.as_ordered()
for v in cont_vars:
    X[v] = X[v].astype('float32')
    X_val[v] = X_val[v].astype('float32')
    X_test[v] = X_test[v].astype('float32')

In [68]:
# print the shape of each input
print(X.info())

X.shape, X_val.shape, X_test.shape, y.shape, y_val.shape, y_test.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1350 entries, 50 to 1399
Columns: 115 entries, DCLRDT to PRC_year_encode
dtypes: category(10), float32(105)
memory usage: 598.7 KB
None


((1350, 115), (100, 115), (10, 115), (1350, 1), (100, 1), (10, 1))

In [69]:
# input size of categorical variables
cat_sizes = [(c, len(X[c].cat.categories)) for c in cat_vars]
cat_sizes

[('DCLRDT', 2),
 ('month', 12),
 ('year', 6),
 ('dayofweek', 5),
 ('qtr', 4),
 ('day365', 362),
 ('dayofmonth', 31),
 ('weekofyear', 53),
 ('startend', 2),
 ('Announce', 31)]

In [70]:
# embedding size of categorical variables
embedding_sizes = [(c, min(50, (c + 1) // 2)) for _, c in cat_sizes]
embedding_sizes

[(2, 1),
 (12, 6),
 (6, 3),
 (5, 3),
 (4, 2),
 (362, 50),
 (31, 16),
 (53, 27),
 (2, 1),
 (31, 16)]

In [71]:
# create input arrays to feed into the neural network
X_arr = []
X_val_arr = []
X_test_arr = []

for i, v in enumerate(cat_vars):
    print(i,v)
    X_arr.append(X.iloc[:, i])
    X_val_arr.append(X_val.iloc[:, i])
    X_test_arr.append(X_test.iloc[:, i])

X_arr.append(X.iloc[:, len(cat_vars):])
X_val_arr.append(X_val.iloc[:, len(cat_vars):])
X_test_arr.append(X_test.iloc[:, len(cat_vars):])

len(X_arr), len(X_val_arr), len(X_test_arr)

0 DCLRDT
1 month
2 year
3 dayofweek
4 qtr
5 day365
6 dayofmonth
7 weekofyear
8 startend
9 Announce


(11, 11, 11)

### Building Model

In [72]:
from keras.layers import Concatenate, Input, Reshape
from keras.layers.embeddings import Embedding

In [79]:
# create a sequential neural network with an embedding layer
def EmbeddingNet(cat_vars, cont_vars, embedding_sizes):
    inputs = []
    embed_layers = []
    for (c, (in_size, out_size)) in zip(cat_vars, embedding_sizes):
        # define input layer
        i = Input(shape=(1,))
        #define embedding layer
        o = Embedding(in_size, out_size, name=c)(i)
        o = Reshape(target_shape=(out_size,))(o)
        # store layers
        inputs.append(i)
        embed_layers.append(o)
    
    # concat embedding layers
    embed = Concatenate()(embed_layers)

    cont_input = Input(shape=(len(cont_vars),))
    inputs.append(cont_input)
    
    # concat embeddings and continuous variables to input into sequential nn
    x = Concatenate()([embed, cont_input])

    dense = Dense(1000, activation='relu', kernel_initializer='he_normal')(x)
    
    dense = Dense(500, activation='relu', kernel_initializer='he_normal')(dense)
    
    output = Dense(1, activation='sigmoid', kernel_initializer='he_normal')(dense)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model

In [86]:
# initialize model
model = EmbeddingNet(cat_vars, cont_vars, embedding_sizes)

# fit model with training data and evaluate using validation data
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', cooldown=0, min_lr=0.0001)
checkpoint = ModelCheckpoint('best_model_weights.hdf5',monitor='val_loss',save_best_only=True)
early_stopping_monitor = EarlyStopping(patience=20)

model.fit(x=X_arr, y=y, validation_data=(X_val_arr, y_val), epochs=100,
          callbacks=[lr_reducer, checkpoint, early_stopping_monitor])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1350 samples, validate on 100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 26/100
Epoch 27/100


<keras.callbacks.callbacks.History at 0x1a714acf98>

In [90]:
print(model.summary())

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_25 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           (None, 1)            0                                            
____________________________________________________________________________________________

In [91]:
# predict test set
pred = model.predict(x=X_test_arr)
pred = pred*y_max

In [92]:
print('cont. features scaled using MinMax, target scaled using y_max:')
print(pred, '\n-----')
print(y_test*y_max, '\n-----')
print('rmse:', rmse(pred,y_test*y_max))

cont. features scaled using MinMax, target scaled using y_max:
[[167.1573 ]
 [166.65193]
 [167.65251]
 [166.22545]
 [167.9786 ]
 [164.41614]
 [162.69807]
 [162.90753]
 [163.6705 ]
 [163.04926]] 
-----
             PRC
index           
1500   163.94000
1501   166.07001
1502   160.89000
1503   156.83000
1504   150.73000
1505   146.83000
1506   157.17000
1507   156.14999
1508   156.23000
1509   157.74001 
-----
rmse: PRC    9.54597
dtype: float64


In [95]:
# initialize model
def EmbedNetFit(n_iter):
    rmse_list = []
    for i in range(n_iter):
        model = EmbeddingNet(cat_vars, cont_vars, embedding_sizes)

        # fit model with training data and evaluate using validation data
        lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=0, mode='auto', cooldown=0, min_lr=0.0001)
        checkpoint = ModelCheckpoint('best_model_weights.hdf5',monitor='val_loss',save_best_only=True)
        early_stopping_monitor = EarlyStopping(patience=20)

        model.fit(x=X_arr, y=y, validation_data=(X_val_arr, y_val), epochs=100,
                  callbacks=[lr_reducer, checkpoint, early_stopping_monitor], verbose=0)
        pred = model.predict(x=X_test_arr)*y_max
        rmse_list.append(rmse(pred,y_test*y_max))
    return np.mean(np.array(rmse_list))

In [96]:
avg_rmse2 = EmbedNetFit(30)
print('Avg RMSE across 30 iterations: ', avg_rmse2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tenso

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Avg RMSE across 30 iterations:  16.880052797800467


The RMSE is higher than the simple price based LSTM model, but across most runs (not shown above, but just what I observed from running the model multiple times), there is a clear decline in the predicted price from the first prediction date to the last prediction date. The model is capturing the decline in price, which starkly differs from previous models (eg XGBoost, LSTM). However, the prediction is too high. If some recurrent component, where the prediction can use sequential information from previous samples as a baseline for the price, is combined with the sequential model with embedding, the prediction might be able to capture both the decline and the general price level. 

#### Future Steps: Incorporate categorical embedding into RNN or LSTM model with multiple continuous features.