# Maximizing <span style="color:red"> Stock Portfolio ROI </span> using <span style="color:blue"> Deep Learning </span>

## Toy Model

### Data

- Portfolio of stocks.
- Every stock goes on a sequence of increasing or decreasing for X (fixed) days straight.
- Each sequence is linear, w/ a random fixed % change $\in (-1,1)$, with Gaussian noise added.

In [13]:
import numpy as np

In [34]:
def generate_dataset(num_stocks, num_days):
    starting_price = 500.
    seq_length = 5
    data = np.array([[1 for stock in range(num_stocks)]])*starting_price
    labels = np.zeros(num_stocks+1)
    for _ in range(int(num_days/seq_length)):
        percent_change = 2.*np.random.rand(num_stocks) - 1.
        new_label = np.zeros(num_stocks+1)
        if np.max(percent_change) > 0:
            new_label[np.argmax(percent_change)] = 1
        else:
            new_label[-1] = 1
        for day in range(seq_length):
            price_change = 1 + 0.01*percent_change + 0.05/100.*np.random.randn(4)
            data = np.vstack((data,data[-1]*price_change))
            labels = np.vstack((labels, new_label))
    return data, labels    

In [43]:
X_train_toy, Y_train_toy = generate_dataset(4, 2000)

In [44]:
np.save('X_train_toy',X_train_toy)
np.save('Y_train_toy',Y_train_toy)

### Training

In [37]:
import numpy as np
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, TimeDistributed

In [45]:
X_train_toy = np.load('X_train_toy.npy')
Y_train_toy = np.load('Y_train_toy.npy')

mean = X_train_toy.mean(axis=0)
std = X_train_toy.std(axis=0)

X_train_norm = (X_train_toy - mean)/std

In [46]:
timesteps = 10

X_train_seq = []
for batch_start in range(timesteps, X_train_norm.shape[0]):
    X_train_seq.append(X_train_norm[batch_start-timesteps:batch_start,:])
X_train_seq = np.vstack(([np.array([seq]) for seq in X_train_seq]))
Y_train_seq = Y_train_toy[timesteps:]

In [47]:
dataset_size = X_train_seq.shape[0]
split = dataset_size//5

X_test_seq = X_train_seq[:split]
Y_test_seq = Y_train_seq[:split]
X_train_seq = X_train_seq[split:]
Y_train_seq = Y_train_seq[split:]

In [48]:
data_dim = 4
nb_classes = 5

model = Sequential()
model.add(TimeDistributed(Dense(42, activation='relu'), input_shape=(timesteps, data_dim)))
model.add(LSTM(42, return_sequences=True))
model.add(LSTM(42, return_sequences=False))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['categorical_accuracy'])

In [53]:
model.fit(X_train_seq, Y_train_seq, batch_size=50, nb_epoch=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x120093050>

In [56]:
predictions = np.argmax(model.predict_proba(X_train_seq),axis=1)
truth = np.argmax(Y_train_seq,axis=1)
np.mean(predictions==truth)



0.89391086001255493

In [57]:
predictions = np.argmax(model.predict_proba(X_test_seq),axis=1)
truth = np.argmax(Y_test_seq,axis=1)
np.mean(predictions==truth)



0.30904522613065327

In [58]:
from keras.models import load_model

model.save('model_toy.h5')
#model = load_model('model_toy.h5')

## Curriculum Learning

### Hierarchical Dataset Generation

### Training

## Real Model

### Data

In [57]:
import numpy as np
import glob
import datetime

In [58]:
dataset = []
for idx in glob.glob("data/*.csv"):
    if idx == 'data/S&P500.csv': continue
    stock = np.genfromtxt(idx,skip_header=1,usecols=(0,1),delimiter=',',dtype=["S10","f8"])
    data = np.zeros((stock.shape[0],2))
    data[:,0] = np.array([datetime.datetime.strptime(entry[0], "%Y-%m-%d").date().timetuple().tm_yday for entry in stock])
    data[:,1] = np.array([entry[1] for entry in stock])
    dataset.append((idx,data))

In [59]:
num_stocks = 4
data_batches = []
idxs = []

for stock1 in range(0,len(dataset)):
    for stock2 in range(stock1+1,len(dataset)):
        for stock3 in range(stock2+1,len(dataset)):
            for stock4 in range(stock3+1,len(dataset)):
                size = np.min([dataset[stock1][1].shape[0],dataset[stock2][1].shape[0],dataset[stock3][1].shape[0],dataset[stock4][1].shape[0]])
                data_batch = np.hstack((dataset[stock1][1][0:size,:],dataset[stock2][1][0:size,:],dataset[stock3][1][0:size,:],dataset[stock4][1][0:size,:]))
                data_batches.append(data_batch)
                if not idxs:
                    idxs.append(data_batch.shape[0]-2)
                else:
                    idxs.append(idxs[-1] + data_batch.shape[0] - 1)

In [60]:
def label(current, future):
    label = np.zeros(5) # [stock1, stock2, stock3, stock4, Cash]
    profit = np.true_divide(future - current, current)
    idx = np.argmax(profit)
    if profit[idx] <= 0: idx = -1
    label[idx] = 1
    return label

In [61]:
label_batches = []

for batch in data_batches:
    label_batch = np.zeros((batch.shape[0]-1,5))
    for idx in range(batch.shape[0]-1):
        current, future = batch[idx+1][[1,3,5,7]], batch[idx][[1,3,5,7]]
        label_batch[idx] = label(current,future)
    label_batches.append(label_batch)

In [62]:
X_train = np.vstack((batch[1:,:] for batch in data_batches))

In [63]:
X_train.shape

(264809, 8)

In [64]:
Y_train = np.vstack(np.vstack((label for label in label_batch)) for label_batch in label_batches)

In [65]:
Y_train.shape

(264809, 5)

In [66]:
np.save('X_train',X_train)
np.save('Y_train',Y_train)

### Training

In [92]:
import numpy as np
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, TimeDistributed

In [69]:
X_train = np.load('X_train.npy')
Y_train = np.load('Y_train.npy')

mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

X_train_norm = (X_train - mean)/std

In [70]:
input_dim = X_train.shape[1]
output_dim = Y_train.shape[1]
T = 100
X_train_seq = []
Y_train_seq = []
for i in range(len(idxs)):
    if i == 0:
        start_idx = 0
    else:
        start_idx = idxs[i-1] + 1
    end_idx = idxs[i]
    for j in range(start_idx + T - 1, end_idx+1):
        tX = X_train_norm[j - (T - 1): j + 1]
        tY = Y_train[j]
        X_train_seq.append(tX[None,:,:])
        Y_train_seq.append(tY[None,:])
        del tX
        del tY

X_train_seq = np.concatenate(X_train_seq, axis=0)
Y_train_seq = np.concatenate(Y_train_seq, axis=0)

In [109]:
data_dim = 8
timesteps = 100
nb_classes = 5

model = Sequential()
model.add(TimeDistributed(Dense(42, activation='relu'), input_shape=(timesteps, data_dim)))
model.add(LSTM(42, return_sequences=True))
model.add(LSTM(42, return_sequences=False))
model.add(Dense(5,activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['categorical_accuracy'])

# model.fit(X_train_seq[:20], Y_train_seq[:20], batch_size=20, nb_epoch=20)
#score = model.evaluate(X_test, Y_test, batch_size=16)

In [None]:
model.fit(X_train_seq, Y_train_seq, batch_size=50, nb_epoch=10000)

Epoch 1/10000
  3900/252335 [..............................] - ETA: 2477s - loss: 1.5969 - categorical_accuracy: 0.2508

In [114]:
predictions = np.argmax(model.predict_proba(X_train_seq[1000:1500,:,:]),axis=1)
truth = np.argmax(Y_train_seq[1000:1500],axis=1)
np.mean(predictions==truth)



0.17999999999999999