In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import yfinance as yf
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
data = yf.download("GOOG AAPL MSFT POLX.L POW.L AMD TSLA F KR UMC GM", start="2020-01-01", end="2023-01-01", group_by="ticker")

# Reshape data
data = data.unstack().reset_index().rename(columns={"level_0": "ticker", "level_1": "measure", 0: "value"}).dropna()

# Filter to adj close
data = data.loc[data["measure"]=="Adj Close", ["Date", "ticker", "value"]].reset_index(drop=True)

# Calculate log diff
data["log_diff"] = data.groupby("ticker")["value"].transform(lambda x: np.log(x) - np.log(x.shift()))

# Moving average
data["moving_avg"] = data["value"].rolling(5).mean()

# Add flag for whether the value is up in the next weeks moving average is up 10 % or down 10%
data["sig_delta_up"] = data.groupby("ticker")["value"].transform(lambda x: np.where(((x.shift(-5) - x) / x) > 0.1, 1, 0))
data["sig_delta_down"] = data.groupby("ticker")["value"].transform(lambda x: np.where(((x.shift(-5) - x) / x) < -0.1, 1, 0))

# Remove nas and reset the index
data = data.dropna().reset_index(drop=True)

data.info()

[*********************100%***********************]  11 of 11 completed
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8304 entries, 0 to 8303
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   Date            8304 non-null   datetime64[ns, UTC]
 1   ticker          8304 non-null   object             
 2   value           8304 non-null   float64            
 3   log_diff        8304 non-null   float64            
 4   moving_avg      8304 non-null   float64            
 5   sig_delta_up    8304 non-null   int64              
 6   sig_delta_down  8304 non-null   int64              
 7   is_train        8304 non-null   int64              
dtypes: datetime64[ns, UTC](1), float64(3), int64(3), object(1)
memory usage: 519.1+ KB


In [3]:
scalers = {}
data["scaled_ld"] = None
for ticker in data["ticker"].unique():
    scalers[ticker] = MinMaxScaler((-1, 1)).fit(data[(data["ticker"]==ticker) & (data["is_train"]==1)]["log_diff"].values.reshape(-1, 1))
    data.loc[data["ticker"]==ticker, "scaled_ld"] = scalers[ticker].transform(data[data["ticker"]==ticker]["log_diff"].values.reshape(-1, 1)).reshape(1, -1)[0]

In [9]:
lookback = 10

X = []
y = []
idx = []
for ticker in data["ticker"].unique():
    ticker_data = data[data["ticker"]==ticker]
    length = len(ticker_data["scaled_ld"]) - 1
    start, end = ticker_data.index[lookback], ticker_data.index[length]
    i = 0
    for n in range(lookback-1, length):
        x = data["scaled_ld"].values[n-lookback:n]
        if len(x) == lookback:
            idx.append(i)
            i+=1
            X.append(x)
            y.append(data.loc[n, ["sig_delta_up", "sig_delta_down"]].values)
X = np.array(X).astype("float32")
y = np.array(y).astype("float32")

In [18]:
train_idx = data.loc[idx, "Date"][data["Date"] < data["Date"].max() - dt.timedelta(days=60)].index
val_idx = data.iloc[~train_idx].index

In [19]:
X_test, X_train, y_test, y_train = [torch.tensor(x) for x in train_test_split(train_idx, train_idx, test_size=0.8, shuffle=True, random_state=2)]

X_val, y_val = X[val_idx], y[val_idx]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

IndexError: index 8293 is out of bounds for axis 0 with size 8183

In [6]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10, 100),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(100, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 500),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(500, 100),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(100, 2),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Sigmoid()
        )
        

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x


net = Net()
net

Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=10, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.8, inplace=False)
    (3): Linear(in_features=100, out_features=500, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.8, inplace=False)
    (6): Linear(in_features=500, out_features=500, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.8, inplace=False)
    (9): Linear(in_features=500, out_features=500, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.8, inplace=False)
    (12): Linear(in_features=500, out_features=500, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.8, inplace=False)
    (15): Linear(in_features=500, out_features=500, bias=True)
    (16): ReLU()
    (17): Dropout(p=0.8, inplace=False)
    (18): Linear(in_features=500, out_features=100, bias=True)
    (19): ReLU()
    (20): Dropout(p=0.8, inplace=False)
    (21): Linear(in_features=100, out_features=2, bias=True)
    (22): ReLU()
    (23): Dropout(p=0.8, inplace=False)
    (24): Sig

In [7]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(),  lr=0.001, momentum=0.9)

def train(X, y, model, size=None, batch_size=3000):
    # X is a torch Variable
    permutation = torch.randperm(X.size()[0])

    for i in range(0,X.size()[0], batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X[indices], y[indices]
        
        # in case you wanted a semi-full example
        outputs = model.forward(batch_x)
        loss = loss_fn(outputs, batch_y)

        loss.backward()
        optimizer.step()
    loss, current = loss.item(), i * len(X)
    return loss

def test(X, y, model):
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for i in range(len(X)):
            pred = model(X[i])
            test_loss += loss_fn(pred, y[i]).item()
            correct += (pred.round() == y[i]).type(torch.float).sum().item()
#             if i == 1:
#                 print(pred.round(), y[i])
    test_loss /= len(X)
    correct /= y.numel()
    return (100*correct), test_loss


In [8]:
for epoch in range(201):
    loss = train(X_train, y_train, net, len(X_train))
    accuracy, test_loss = test(X_test, y_test, net)
    if epoch % 50 == 0:
        print(f"Epoch-------------{epoch}")
        print(f"loss: {loss:>7f}")
        print(f"Test Error: \n Accuracy: {accuracy:>0.1f}%, Avg loss: {test_loss:>8f} \n")

Epoch-------------0
loss: 0.277211
Test Error: 
 Accuracy: 46.1%, Avg loss: 0.256931 

Epoch-------------50
loss: 0.250000
Test Error: 
 Accuracy: 87.6%, Avg loss: 0.250000 

Epoch-------------100
loss: 0.250000
Test Error: 
 Accuracy: 87.6%, Avg loss: 0.250000 

Epoch-------------150
loss: 0.250000
Test Error: 
 Accuracy: 87.6%, Avg loss: 0.250000 

Epoch-------------200
loss: 0.250000
Test Error: 
 Accuracy: 87.6%, Avg loss: 0.250000 

