In [62]:
import torch
import torch.nn as nn
import pandas as pd
import os
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

In [44]:
def z_norm(df, col_exclude=None):
    """Performs z-score normalization on all columns of df except col_exclude
    
    inputs:
        df: stock data
        col_exclude: columns to be excluded from normalization
        
    returns:
        df_std: normalized z-score data
    """

    df_std = df.copy()
    cols = list(df.columns)
    cols.remove(col_exclude)
    for c in cols:
        df_std[c] = (df_std[c] - df_std[c].mean()) / df_std[c].std()

    return df_std

In [45]:
def to_sequences(seq_size: int, obs: np.array):
    x = []
    y = []
    for i in range(len(obs) - seq_size):
        window = obs[i:(i + seq_size), :]
        after_window = obs[i + seq_size, :]
        x.append(window)
        y.append(after_window)
    return x, y

In [52]:
# loading data
data_path = 'data'
interval = '1d'
companies = ['AAPL'] #os.listdir(data_path)
df_list = []
test_start_date = pd.to_datetime("2022-03-01")
for co in companies:
    files = os.listdir(os.path.join(data_path, co))
    for f in files:
        if interval in f:
            file = f

    df = pd.read_csv(os.path.join(data_path, co, file))
    df = df.drop(columns=['Unnamed: 0'])

    df = z_norm(df, 'date')
    df['date'] = pd.to_datetime(df['date'])
    # could also add ticker label column

    df_list.append(df)

In [59]:
x_train = []
x_test = []
y_train = []
y_test = []
seq_size = 30 # 30th day will be the prediction
for df in df_list:
    # split each df into train and test timeframes
    df_train = df[df['date'] < test_start_date]
    df_test = df[df['date'] >= test_start_date]

    # drop unnecessary columns
    df_train = df.drop(columns=['date'])
    df_test = df.drop(columns=['date'])

    # convert to 2D numpy arrays of shape (-1, num_cols)
    train = df_train.to_numpy()
    test = df_test.to_numpy()

    # convert to sequences and append to respective training and testing lists
    x, y = to_sequences(seq_size, train)
    for i in range(len(x)):
        x_train.append(x[i])
        y_train.append(y[i])
    x, y = to_sequences(seq_size, test)
    for j in range(len(x)):
        x_test.append(x[i])
        y_test.append(y[i])


x_train = np.vstack(x_train)
y_train = np.vstack(y_train)
x_test = np.vstack(x_test)
y_test = np.vstack(y_test)


x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [60]:
x_train.shape

torch.Size([72120, 19])

In [61]:
y_test.shape

torch.Size([2404, 19])

In [36]:
len(df.columns)

20

In [39]:
df.drop(columns='date').to_numpy().shape

(2434, 19)

In [34]:
df[df['date'] > test_start_date]

Unnamed: 0,date,open,high,low,close,volume,pct_change,close_RSI_14,ADOSC_3_10,AROOND_25,AROONU_25,AROONOSC_25,CCI_14_0.015,CG_14,close_HMA_50,ISA_9,ISB_26,ITS_9,IKS_26,VWAP_D
1952,2022-03-02,1.467021,1.487340,1.471476,1.502690,-0.672981,1.089309,-0.596104,0.707591,1.302202,-0.417201,-0.946970,-0.446298,-0.983596,1.515545,1.662439,1.553527,1.558826,1.633761,1.487310
1953,2022-03-03,1.539030,1.514396,1.517844,1.496872,-0.714855,-0.167325,-0.635560,0.492100,1.188451,-0.526497,-0.946970,-0.141296,-0.806672,1.510124,1.683299,1.554449,1.558826,1.633761,1.509772
1954,2022-03-04,1.468786,1.455746,1.456317,1.442917,-0.617830,-1.081952,-0.987749,0.221406,1.074699,-0.635793,-0.946970,-0.713634,-0.844285,1.503768,1.701872,1.554449,1.553797,1.633761,1.451736
1955,2022-03-07,1.448842,1.446494,1.401746,1.374680,-0.443517,-1.377569,-1.370813,-0.255807,0.960948,-0.745089,-0.946970,-1.097908,-0.979837,1.494215,1.697846,1.562099,1.528298,1.633761,1.407871
1956,2022-03-08,1.368714,1.409140,1.343965,1.341884,0.033852,-0.707117,-1.536119,-0.708693,0.847197,-0.854385,-0.946970,-1.494674,-0.973183,1.481119,1.706355,1.562099,1.517164,1.633761,1.365300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2429,2024-01-25,2.011151,1.991972,2.009341,1.989518,-1.015277,-0.151478,0.353225,0.291933,0.278442,-1.400864,-0.946970,0.718580,0.883301,1.855859,1.790252,1.878453,2.078873,2.029718,1.996990
2430,2024-01-26,1.994384,1.965615,1.988476,1.958661,-1.155866,-0.558825,0.010014,0.034404,0.164691,-1.510160,-0.946970,0.389352,0.764184,1.863877,1.801276,1.878453,2.090006,2.045270,1.970942
2431,2024-01-29,1.954496,1.920929,1.946388,1.946495,-1.120793,-0.256669,-0.120828,0.018129,0.050940,-1.619455,-0.946970,0.066453,0.728426,1.871797,1.823691,1.878453,2.090006,2.051419,1.937955
2432,2024-01-30,1.935612,1.913947,1.908759,1.881432,-1.001019,-1.128587,-0.742393,-0.162676,-0.062811,-1.619455,-0.884935,-0.285407,0.537001,1.877500,1.833847,1.878453,2.090006,2.070317,1.901499


Using basic transformer from https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_10_3_transformer_timeseries.ipynb

In [None]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Model definition using Transformer
class TransformerModel(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

model = TransformerModel()