In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import yfinance as yf
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = yf.download(
    "GOOG AAPL MSFT POLX.L POW.L AMD TSLA F KR UMC GM",
    start="2018-01-01",
    end="2023-01-01",
    group_by="ticker",
)
break_date = dt.date(2022, 1, 1)

# Reshape data
data = (
    data.unstack()
    .reset_index()
    .rename(
        columns={"Date": "date", "level_0": "ticker", "level_1": "measure", 0: "value"}
    )
    .dropna()
)

# Filter to adj close
data = data.loc[
    data["measure"] == "Adj Close", ["date", "ticker", "value"]
].reset_index(drop=True)

# Calculate log diff
data["log_diff"] = data.groupby("ticker")["value"].transform(
    lambda x: np.log(x) - np.log(x.shift())
)

# Split timeseries now to prevent leakage.
data_train = data.loc[data["date"].dt.date < break_date, :].copy()
data_test = data.loc[~data.index.isin(data_train.index), :].copy()


def preprocess_data(data):
    # Moving average
    data["moving_avg"] = data["value"].rolling(5).mean()
    
    # Add flag for whether the value is up in the next weeks moving average is up 10 % or down 10%
    data["sig_delta_up"] = data.groupby("ticker")["value"].transform(
        lambda x: np.where(((x.shift(-5) - x) / x) > 0.1, 1, 0)
    )
    data["sig_delta_down"] = data.groupby("ticker")["value"].transform(
        lambda x: np.where(((x.shift(-5) - x) / x) < -0.1, 1, 0)
    )
    # Remove nas and reset the index
    data = data.dropna().reset_index(drop=True)
    return data

data_train, data_test = map(preprocess_data, [data_train, data_test])

data_train.info(), data_test.info()

[*********************100%***********************]  11 of 11 completed
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11022 entries, 0 to 11021
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   date            11022 non-null  datetime64[ns, UTC]
 1   ticker          11022 non-null  object             
 2   value           11022 non-null  float64            
 3   log_diff        11022 non-null  float64            
 4   moving_avg      11022 non-null  float64            
 5   sig_delta_up    11022 non-null  int64              
 6   sig_delta_down  11022 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(3), int64(2), object(1)
memory usage: 602.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2755 entries, 0 to 2754
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              


(None, None)

In [3]:
def unpack_data(data):
    return {
        ticker: {
            "log_diff": data[(data["ticker"] == ticker)]["log_diff"].values.reshape(
                -1, 1
            ),
            "y": data.loc[
                (data["ticker"] == ticker), ["sig_delta_up", "sig_delta_down"]
            ].values,
        }
        for ticker in data["ticker"].unique()
    }


def get_scale_models(array_train):
    return {
        ticker: MinMaxScaler().fit(array_train[ticker]["log_diff"])
        for ticker in array_train.keys()
    }


def scale_array(array, scalers):
    for ticker in array.keys():
        array[ticker]["log_diff"] = scalers[ticker].transform(array[ticker]["log_diff"])
    return array


def stack_arrays(array):
    x = np.vstack([val["log_diff"] for val in array.values()])
    y = np.vstack([val["y"] for val in array.values()])
    return x, y


array_train, array_test = map(unpack_data, [data_train, data_test])

scalers = get_scale_models(array_train)

array_train, array_test = [
    scale_array(array, scalers) for array in [array_train, array_test]
]

(x_train, y_train), (x_test, y_test) = map(stack_arrays, [array_train, array_test])
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((11022, 1), (11022, 2), (2755, 1), (2755, 2))

In [4]:
lookback = 10


def vectorized_stride(array, sub_window_size=lookback, stride_size=1):
    max_time = array.shape[0]
    start = max_time % sub_window_size
    sub_windows = (
        start
        + np.expand_dims(np.arange(sub_window_size), 0)
        + np.expand_dims(
            np.arange(max_time + 1 - (start + sub_window_size), step=stride_size), 0
        ).T
    )
    return array[sub_windows]


def trim_y(array, sub_window_size=lookback):
    max_time = array.shape[0]
    start = max_time % sub_window_size + sub_window_size - 1
    print(start)
    return array[start:]


X_train = vectorized_stride(x_train).reshape((-1, lookback)).astype("float32")
X_test = vectorized_stride(x_test).reshape((-1, lookback)).astype("float32")
y_train = trim_y(y_train).astype("float32")
y_test = trim_y(y_test).astype("float32")

X_train, y_train, X_test, y_test = [
    torch.from_numpy(array) for array in [X_train, y_train, X_test, y_test]
]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

11
14


(torch.Size([11011, 10]),
 torch.Size([11011, 2]),
 torch.Size([2741, 10]),
 torch.Size([2741, 2]))

In [5]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10, 100),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(100, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 100),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(100, 2),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x


net = Net()
net

Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=10, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.8, inplace=False)
    (3): Linear(in_features=100, out_features=500, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.8, inplace=False)
    (6): Linear(in_features=500, out_features=500, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.8, inplace=False)
    (9): Linear(in_features=500, out_features=500, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.8, inplace=False)
    (12): Linear(in_features=500, out_features=500, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.8, inplace=False)
    (15): Linear(in_features=500, out_features=500, bias=True)
    (16): ReLU()
    (17): Dropout(p=0.8, inplace=False)
    (18): Linear(in_features=500, out_features=100, bias=True)
    (19): ReLU()
    (20): Dropout(p=0.8, inplace=False)
    (21): Linear(in_features=100, out_features=2, bias=True)
    (22): ReLU()
    (23): Dropout(p=0.8, inplace=False)
    (24): Sig

In [6]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)


def train(X, y, model, size=None, batch_size=1000):
    # X is a torch Variable
    permutation = torch.randperm(X.size()[0])

    for i in range(0, X.size()[0], batch_size):
        optimizer.zero_grad()

        indices = permutation[i : i + batch_size]
        batch_x, batch_y = X[indices], y[indices]

        # in case you wanted a semi-full example
        outputs = model.forward(batch_x)
        loss = loss_fn(outputs, batch_y)

        loss.backward()
        optimizer.step()
    loss, current = loss.item(), i * len(X)
    return loss


def test(X, y, model):
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for i in range(len(X)):
            pred = model(X[i])
            test_loss += loss_fn(pred, y[i]).item()
            correct += (pred.round() == y[i]).type(torch.float).sum().item()
    #             if i == 1:
    #                 print(pred.round(), y[i])
    test_loss /= len(X)
    correct /= y.numel()
    return (100 * correct), test_loss

In [None]:
for epoch in range(1001):
    loss = train(X_train, y_train, net, len(X_train))
    accuracy, test_loss = test(X_test, y_test, net)
    if epoch % 50 == 0:
        print(f"Epoch-------------{epoch}")
        print(f"loss: {loss:>7f}")
        print(
            f"Test Error: \n Accuracy: {accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n"
        )

Epoch-------------0
loss: -0.000000
Test Error: 
 Accuracy: 94.3%, Avg loss: 0.079658 

