In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

from sklearn.preprocessing import RobustScaler


##### **Load in the data**

In [14]:
# Read the cleaned S&P 500 price history; the first CSV column is used as the
# index and parsed as dates.
raw_stock_data = pd.read_csv(
    'data/SP500_stock_prices_cleaned_with_3month_return.csv',
    index_col=0,
    parse_dates=True,
)
# Distinct ticker symbols, in the order they first appear in the file.
stock_tickers = pd.Series(raw_stock_data['Ticker'].unique())

raw_stock_data.head()

Unnamed: 0_level_0,Ticker,Open,Low,High,Close,Volume,Sector,log_return_3m
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-04,MMM,148.050003,145.399994,148.320007,146.820007,3277200,Industrials,
2016-01-05,MMM,146.820007,145.610001,147.5,147.460007,2688100,Industrials,
2016-01-06,MMM,145.589996,143.419998,145.759995,144.490005,2997100,Industrials,
2016-01-07,MMM,142.520004,140.630005,143.130005,140.970001,3553500,Industrials,
2016-01-08,MMM,141.360001,140.220001,142.5,140.490005,2664000,Industrials,


#### **Preprocess the dataframe**
Turn the dataframe into a dictionary, keyed by ticker, of feature and target tensors.

In [15]:
# import functions
from data_processing import DataFrame_to_Tensors, FindNearestDateIndex, SplitData, CreateSequences

In [16]:

# Create the data sequences for each stock ticker.
# Each ticker's rows (minus the non-numeric 'Ticker' and 'Sector' columns) are
# passed to DataFrame_to_Tensors, and the result is stored under that ticker.
# A single groupby pass replaces re-scanning the whole frame once per ticker;
# sort=False keeps the tickers in order of first appearance.
SPLIT_DATE = '2019-10-01'  # presumably the train/test boundary -- confirm in data_processing

data_sequences_dict = {}
for ticker, ticker_rows in raw_stock_data.groupby('Ticker', sort=False):
    print(ticker)
    df = ticker_rows.drop(columns=['Ticker', 'Sector'])
    data_sequences_dict[ticker] = DataFrame_to_Tensors(df, split_date=SPLIT_DATE)

MMM
AOS
ABT
ABBV
ACN
ATVI
ADM
ADBE
ADP
AAP
AES
AFL
A
APD
AKAM
ALK
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AMD
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
ABC
AME
AMGN
APH
ADI
ANSS
AON
APA
AAPL
AMAT
APTV
ACGL
ANET
AJG
AIZ
T
ATO
ADSK
AZO
AVB
AVY
BKR
BALL
BAC
BBWI
BAX
BDX
WRB
BBY
BIO
TECH
BIIB
BLK
BK
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BG
CHRW
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CTLT
CAT
CBOE
CBRE
CDW
CE
CNC
CNP
CF
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
COO
CPRT
GLW
CSGP
COST
CTRA
CCI
CSX
CMI
CVS
DHI
DHR
DRI
DVA
DE
DAL
XRAY
DVN
DXCM
FANG
DLR
DFS
DISH
DIS
DG
DLTR
D
DPZ
DOV
DTE
DUK
DD
DXC
EMN
ETN
EBAY
ECL
EIX
EW
EA
ELV
LLY
EMR
ENPH
ETR
EOG
EPAM
EQT
EFX
EQIX
EQR
ESS
EL
ETSY
RE
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FDS
FICO
FAST
FRT
FDX
FITB
FRC
FSLR
FE
FIS
FISV
FLT
FMC
F
FTNT
BEN
FCX
GRMN
IT
GEN
GNRC
GD
GE
GIS
GM
GPC
GILD
GL
GPN
GS
HAL
HIG
HAS
HCA
PEAK
HSIC
HSY
HES
HPE
HLT
HOLX
HD
HON
HRL
HST
HPQ
HUM
HBAN
HII
IBM
IEX
IDXX
ITW
ILMN
INCY
PO

In [19]:
# Sanity check: show the training feature/target tensor shapes for one ticker.
tuple(data_sequences_dict['AAPL'][key].shape for key in ('train_features', 'train_targets'))

(torch.Size([879, 63, 6]), torch.Size([879]))

#### Training and test split functions

### **Build the model architecture**

In [20]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_layer_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Size of the output produced by the linear layer.
        dropout: Dropout probability applied between stacked LSTM layers.
            Ignored (treated as 0) when ``num_layers`` is 1.
    """

    def __init__(self, input_dim, hidden_layer_dim, num_layers, output_dim, dropout):
        super().__init__()

        # Kept as attributes so callers can inspect the configured dimensions.
        self.hidden_dim = hidden_layer_dim
        self.num_layers = num_layers

        # nn.LSTM only applies dropout *between* layers; passing a non-zero value
        # with a single layer has no effect and makes PyTorch emit a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0

        # Recurrent part; batch_first=True means inputs are (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_dim,
            hidden_layer_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Fully connected output layer.
        self.fc = nn.Linear(hidden_layer_dim, output_dim)

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) computed from the final time step.
        """
        # When no initial state is passed, nn.LSTM starts from zero hidden/cell
        # states that already match the input's device and dtype, so there is no
        # need to build (and risk mismatching) them by hand.
        out, _ = self.lstm(x)

        # Decode the hidden state of the last time step.
        return self.fc(out[:, -1, :])


In [21]:

# Hyperparameters are based on a paper (see the project's Notion notes).

input_dim, hidden_layer_dim, num_layers, output_dim, dropout = 6, 32, 2, 1, 0.4

model = LSTM(input_dim, hidden_layer_dim, num_layers, output_dim, dropout)
print(model)

# Mean squared error loss.
criteria = nn.MSELoss()

# Optimizer choice, made explicit so that only one optimizer is ever constructed
# (previously Adam was created and then immediately overwritten by SGD).
#   'adam': well rounded, and doesn't need much fine-tuning.
#   'sgd' : could have better convergence and generalization, but would take some
#           fine-tuning to find optimal parameters. We will have to see how long
#           training takes and decide if we can afford a hyperparameter search.
OPTIMIZER_NAME = 'sgd'  # matches the optimizer that was effectively in use

if OPTIMIZER_NAME == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=0.01)
elif OPTIMIZER_NAME == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
else:
    raise ValueError(f"Unknown OPTIMIZER_NAME: {OPTIMIZER_NAME!r}")

LSTM(
  (lstm): LSTM(6, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
