In [1]:
import yfinance as yf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from itertools import product

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# if torch.cuda.is_available():
#     device = torch.device('cuda')
#     print('GPU')
# else:
#     device = torch.device('cpu')
#     print('CPU')
device = torch.device('cpu')

In [3]:
def get_yfinance_data(ticker_list : list[str], start_date : str, end_date : str):
    """ Get data from yfinance for a list of tickers.
    
    It includes Open, High, Low, Close, Adj. Close, and Volume.
    
    Args:
        ticker_list: List of ticker symbols
        start_date: Start date of the data
        end_date: End date of the data

    Returns:
        A dict mapping ticker symbols to dataframes containing the data
        fetched from yfinance
    """
    data_dict = {}

    for ticker_symbol in ticker_list:
        data = yf.download(ticker_symbol, start=start_date, end=end_date)
        data_dict[ticker_symbol] = data

    return data_dict

In [4]:
# Example tickers and dates
stock1 = "GOOGL" #"AAPL"
stock2 = "DBX" #"MSFT"
stock3 = "V" #"TSLA"

tickers = [stock1, stock2, stock3]

start_date = "2021-01-01"
end_date = "2024-01-01"

yfinance_data_dict = get_yfinance_data(tickers, start_date, end_date)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [5]:
# Example dataframe
example_df = yfinance_data_dict[stock1]
example_df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-11-17,136.0,136.059998,133.649994,135.309998,135.309998,37240600
2023-11-20,133.690002,136.660004,133.619995,136.25,136.25,27815500
2023-11-21,136.289993,137.179993,135.960007,136.970001,136.970001,22635300
2023-11-22,137.470001,139.419998,137.470001,138.490005,138.490005,17813900
2023-11-24,138.029999,138.130005,135.990005,136.690002,136.690002,12514300


In [6]:
WINDOW_SIZE = 3

SMA_PERIOD = WINDOW_SIZE
EMA_PERIOD = WINDOW_SIZE
CCI_PERIOD = WINDOW_SIZE
VOLATILITY_PERIOD  = WINDOW_SIZE
ROC_PERIOD  = WINDOW_SIZE

for ticker, ticker_df in yfinance_data_dict.items():
    ticker_df[f"Returns {ticker}"] = (ticker_df["Close"] - ticker_df["Open"]) / ticker_df["Open"]
    
    ticker_df[f"Log Returns {ticker}"] = np.log(ticker_df["Close"]).diff()
    
    ticker_df[f"SMA {ticker}"] = ticker_df["Close"].rolling(window=SMA_PERIOD).mean()
    
    ticker_df[f"EMA {ticker}"] = ticker_df["Close"].ewm(span=EMA_PERIOD, adjust=False).mean()

    # VWAP
    value = ticker_df["Close"] * ticker_df["Volume"]
    cumulative_value = value.cumsum()
    cumulative_volume = ticker_df["Volume"].cumsum()
    ticker_df[f"VWAP {ticker}"] = cumulative_value / cumulative_volume

    # CCI 
    typical_price = (ticker_df["High"] + ticker_df["Low"] + ticker_df["Close"]) / 3
    mean_typical_price = typical_price.rolling(window=CCI_PERIOD).mean()
    mean_deviation = (typical_price - mean_typical_price).abs().rolling(window=CCI_PERIOD).mean()
    ticker_df[f"CCI {ticker}"] = (typical_price - mean_typical_price) / (0.015 * mean_deviation)

    ticker_df[f"Volatility {ticker}"] = ticker_df[f"Returns {ticker}"].rolling(window=VOLATILITY_PERIOD).std()

    ticker_df[f"RoC {ticker}"] = (ticker_df["Close"] / ticker_df["Close"].shift(ROC_PERIOD) - 1) * 100

    ticker_df.rename(columns={'Volume' : f"Volume {ticker}"}, inplace=True)

In [7]:
for ticker_df in yfinance_data_dict.values():
    ticker_df.drop(["Open", "High", "Low", "Close", "Adj Close"], axis = 1, inplace=True)
    ticker_df.dropna(inplace=True)

In [8]:
# Example dataframe after feature construction
example_df.head()

Unnamed: 0_level_0,Volume GOOGL,Returns GOOGL,Log Returns GOOGL,SMA GOOGL,EMA GOOGL,VWAP GOOGL,CCI GOOGL,Volatility GOOGL,RoC GOOGL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-08,35484000,0.011631,0.013152,88.250834,88.724814,87.599331,101.889946,0.008751,3.320594
2021-01-11,34796000,-0.011666,-0.023377,88.807668,88.269657,87.633915,-19.974084,0.019725,1.939197
2021-01-12,29528000,-0.004543,-0.010797,88.1925,87.570577,87.542406,-77.760797,0.011938,-2.080215
2021-01-13,23432000,0.011538,0.005636,87.3495,87.46654,87.526761,-33.705934,0.011887,-2.813391
2021-01-14,29212000,-0.010264,-0.00939,86.926666,87.006269,87.430833,-2.719865,0.011304,-1.444524


In [9]:
full_df = pd.concat(yfinance_data_dict.values(), axis=1)

In [10]:
full_df["Label"] = np.argmax(full_df[[f"Returns {ticker}" for ticker in tickers]].values, axis=1)

In [11]:
full_df.head()

Unnamed: 0_level_0,Volume GOOGL,Returns GOOGL,Log Returns GOOGL,SMA GOOGL,EMA GOOGL,VWAP GOOGL,CCI GOOGL,Volatility GOOGL,RoC GOOGL,Volume DBX,...,Volume V,Returns V,Log Returns V,SMA V,EMA V,VWAP V,CCI V,Volatility V,RoC V,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-08,35484000,0.011631,0.013152,88.250834,88.724814,87.599331,101.889946,0.008751,3.320594,5240100,...,6513000,0.005977,0.007641,213.959997,214.771872,214.948429,45.644519,0.007148,0.438209,0
2021-01-11,34796000,-0.011666,-0.023377,88.807668,88.269657,87.633915,-19.974084,0.019725,1.939197,8838700,...,7353100,-0.005001,-0.011953,214.049998,213.830935,214.640966,-61.14112,0.00685,0.126989,1
2021-01-12,29528000,-0.004543,-0.010797,88.1925,87.570577,87.542406,-77.760797,0.011938,-2.080215,5722900,...,9331600,-0.016389,-0.019111,212.399999,211.345468,213.71976,-138.656181,0.011183,-2.315138,1
2021-01-13,23432000,0.011538,0.005636,87.3495,87.46654,87.526761,-33.705934,0.011887,-2.813391,14511800,...,6688500,-0.000287,0.002343,210.366669,210.347737,213.271824,-53.236638,0.008278,-2.831279,0
2021-01-14,29212000,-0.010264,-0.00939,86.926666,87.006269,87.430833,-2.719865,0.011304,-1.444524,6976100,...,12887500,-0.041364,-0.036433,206.690002,206.103869,211.389589,-86.102763,0.020698,-5.181079,1


In [12]:
# Split and preprocess the data separately to avoid data leakage
def create_dataset_v2(dataset, window_size):
    X, y = [], []
    for i in range(len(dataset)-window_size):
        features, target = dataset[i:i+window_size, :-1], dataset[i+window_size, -1]
        X.append(features)
        y.append(target)
    return torch.tensor(np.array(X), dtype=torch.float), torch.tensor(np.array(y), dtype=torch.long)

X = full_df.to_numpy()

TRAIN_PROPORTION = 0.9
train_size = int(TRAIN_PROPORTION * X.shape[0])

train_features, train_labels = X[:train_size, :-1], X[:train_size, -1].reshape(-1,1)
test_features, test_labels = X[train_size:, :-1], X[train_size:, -1].reshape(-1,1)

train_scaler = RobustScaler() #  MinMaxScaler(feature_range=(-1,1))
train_features_scaled = train_scaler.fit_transform(train_features)
one_hot_train_labels = np.eye(3)[train_labels.astype(int).reshape(-1)] # Add one-hot encoding of labels from previous days as features
train_dataset = np.concatenate([train_features_scaled, one_hot_train_labels, train_labels], axis=1)

test_scaler = RobustScaler() # MinMaxScaler(feature_range=(-1,1))
test_features_scaled = test_scaler.fit_transform(test_features)
one_hot_test_labels = np.eye(3)[test_labels.astype(int).reshape(-1)] # Add one-hot encoding of labels from previous days as features
test_dataset = np.concatenate([test_features_scaled, one_hot_test_labels, test_labels], axis=1)

X_train, y_train = create_dataset_v2(train_dataset, window_size=WINDOW_SIZE)
X_test, y_test = create_dataset_v2(test_dataset, window_size=WINDOW_SIZE)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([649, 3, 30]),
 torch.Size([649]),
 torch.Size([70, 3, 30]),
 torch.Size([70]))

In [13]:
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

In [14]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size = self.input_dim, 
                            hidden_size = self.hidden_dim, 
                            num_layers=self.num_layers, 
                            batch_first=True)
        self.linear = nn.Linear(in_features=self.hidden_dim, out_features=output_dim)
        
    def forward(self, x):
        batch_size = x.shape[0]
        
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).requires_grad_().to(device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).requires_grad_().to(device)
        
        _, (hn, _) = self.lstm(x, (h0, c0)) 
        out = self.linear(hn[-1]) # Last hidden state
            
        return out

In [15]:
OUTPUT_DIM = len(tickers)

def validate_model(batch_size, hidden_dim, learning_rate, num_epochs, display_progress=False):
    model = LSTM(input_dim=X_train.shape[2], hidden_dim=hidden_dim, num_layers=1, output_dim=OUTPUT_DIM).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    torch.manual_seed(1)

    loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)

    for epoch in range(num_epochs+1):
        model.train()
        for X_batch, y_batch in loader:
            y_pred_logits = model(X_batch)
            loss = loss_fn(y_pred_logits, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        if display_progress:
            if epoch % 20 != 0:
                continue
            
            model.eval()
            with torch.no_grad():
                y_pred_logits = model(X_train)
                train_cross_entropy_loss = loss_fn(y_pred_logits, y_train) # CrossEntropyLoss expects logits, not probabilities
                print("Epoch %d: train Cross Entropy Loss %.4f" % (epoch, train_cross_entropy_loss.item()))

    model.eval()
    with torch.no_grad():
        y_pred_logits = model(X_test)
        test_cross_entropy_loss = loss_fn(y_pred_logits, y_test).item()
        y_pred_labels = torch.argmax(y_pred_logits, axis=1) # not necessary to use softmax for accuracy calculation, since max of softmax is also max of probabilities
        accuracy = accuracy_score(y_test.cpu().numpy(), y_pred_labels.cpu().numpy())
        return accuracy, test_cross_entropy_loss

In [16]:
batch_sizes = [64,128] # [16, 32, 64, 128]
hidden_size_values = [64] # [32, 64, 128]
learning_rate_values = [0.01] #[0.1, 0.01, 0.001, 0.0001]
num_epochs_values = [100] # [50, 100, 300, 1000]

best_accuracy = 0
best_hyperparameters = None
i = 0

for (batch_size, hidden_dim, learning_rate, num_epochs) in product(batch_sizes, hidden_size_values, learning_rate_values, num_epochs_values):
    curr_accuracy, curr_loss = validate_model(batch_size, hidden_dim, learning_rate, num_epochs, display_progress=True)
    if curr_accuracy > best_accuracy:
        best_accuracy = curr_accuracy
        best_hyperparameters = (batch_size, hidden_dim, learning_rate, num_epochs)
    i += 1
    if i % 5 == 0:
        print(f"{i} models trained")

print("Best accuracy: %.4f" % best_accuracy)
print("Best hyperparameters: %s" % str(best_hyperparameters))

Epoch 0: train Cross Entropy Loss 1.0611
Epoch 20: train Cross Entropy Loss 0.0106
Epoch 40: train Cross Entropy Loss 0.0016
Epoch 60: train Cross Entropy Loss 0.0007
Epoch 80: train Cross Entropy Loss 0.0004
Epoch 100: train Cross Entropy Loss 0.0003
Epoch 0: train Cross Entropy Loss 1.0636
Epoch 20: train Cross Entropy Loss 0.3008
Epoch 40: train Cross Entropy Loss 0.0092
Epoch 60: train Cross Entropy Loss 0.0028
Epoch 80: train Cross Entropy Loss 0.0015
Epoch 100: train Cross Entropy Loss 0.0009
Best accuracy: 0.3857
Best hyperparameters: (64, 64, 0.01, 100)


In [17]:
batch_size, hidden_dim, learning_rate, num_epochs = best_hyperparameters
model = LSTM(input_dim=X_train.shape[2], hidden_dim=hidden_dim, num_layers=1, output_dim=OUTPUT_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

torch.manual_seed(1)

loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)

for epoch in range(num_epochs+1):
    model.train()
    for X_batch, y_batch in loader:
        y_pred_logits = model(X_batch)
        loss = loss_fn(y_pred_logits, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if epoch % 20 != 0:
        continue
    
    model.eval()
    with torch.no_grad():
        y_pred_logits = model(X_train)
        train_cross_entropy_loss = loss_fn(y_pred_logits, y_train)
        print("Epoch %d: train Cross Entropy Loss %.4f" % (epoch, train_cross_entropy_loss.item()))

model.eval()
with torch.no_grad():
    y_pred_logits = model(X_test)
    y_pred_labels = torch.argmax(y_pred_logits, axis=1) # not necessary to use softmax for accuracy calculation, since max of softmax is also max of probabilities
    print("Testing accuracy %.4f" % accuracy_score(y_test.cpu().numpy(), y_pred_labels.cpu().numpy()))
    print(confusion_matrix(y_test.cpu().numpy(), y_pred_labels.cpu().numpy()))
    print(classification_report(y_test.cpu().numpy(), y_pred_labels.cpu().numpy()))

Epoch 0: train Cross Entropy Loss 1.0614
Epoch 20: train Cross Entropy Loss 0.0100
Epoch 40: train Cross Entropy Loss 0.0016
Epoch 60: train Cross Entropy Loss 0.0007
Epoch 80: train Cross Entropy Loss 0.0004
Epoch 100: train Cross Entropy Loss 0.0003
Testing accuracy 0.4286
[[ 7 11 10]
 [ 5 11  7]
 [ 6  1 12]]
              precision    recall  f1-score   support

           0       0.39      0.25      0.30        28
           1       0.48      0.48      0.48        23
           2       0.41      0.63      0.50        19

    accuracy                           0.43        70
   macro avg       0.43      0.45      0.43        70
weighted avg       0.43      0.43      0.41        70



In [18]:
accuracies = []
for _ in range(10000):
    rand_preds = torch.tensor(np.random.randint(0, len(tickers), size=y_test.shape[0])).to(device)
    # print("Testing accuracy %.4f" % accuracy_score(y_test.cpu().numpy(), rand_preds.cpu().numpy()))
    # print(confusion_matrix(y_test.cpu().numpy(), rand_preds.cpu().numpy()))
    # print(classification_report(y_test.cpu().numpy(), rand_preds.cpu().numpy()))
    accuracies.append(accuracy_score(y_test.cpu().numpy(), rand_preds.cpu().numpy()))

print("Random accuracy: %.4f" % np.mean(accuracies))

Random accuracy: 0.3350
