In [1]:
%cd NYSE-Temporal-Graph-Construction

/teamspace/studios/this_studio/NYSE-Temporal-Graph-Construction


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [14]:
import networkx as nx
from matplotlib import pyplot as plt
from tqdm import tqdm
from os.path import join as join_path
import os
from torch.utils.data import Dataset
import kagglehub
import numpy as np
import pandas as pd
from functools import cache
import torch
from torch import nn
from torch.nn import functional as F

from torch_geometric.nn import GCN, global_mean_pool, GRUAggregation

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Device: cpu


In [3]:
dataset_path = kagglehub.dataset_download("dgawlik/nyse")



In [4]:
fundamentals_df = pd.read_csv(join_path(dataset_path,"fundamentals.csv"))
df = pd.read_csv(join_path(dataset_path, "prices.csv"))
prices_split_adjusted_df = pd.read_csv(join_path(dataset_path, "prices-split-adjusted.csv"))
securities_df = pd.read_csv(join_path(dataset_path, "securities.csv"))

In [5]:
all_symbols = prices_split_adjusted_df['symbol'].unique()

@cache
def symbol_to_int(symbol: str):
    return all_symbols.tolist().index(symbol)

@cache
def int_to_symbol(idx):
    return all_symbols[idx]

print("Num of symbols:", len(all_symbols))

Num of symbols: 501


In [6]:
prices_split_adjusted_df['date'] = pd.to_datetime(prices_split_adjusted_df['date']).dt.date
dates = prices_split_adjusted_df['date'].sort_values().unique()


def any_to_date(date):
    if not isinstance(date, pd._libs.tslibs.timestamps.Timestamp):
        date = pd.to_datetime(date).date()
    return date

@cache
def date_to_int(date):
    date = any_to_date(date)
    return dates.tolist().index(date)

@cache
def int_to_date(idx):
    return dates[idx]

print(dates)

[datetime.date(2010, 1, 4) datetime.date(2010, 1, 5)
 datetime.date(2010, 1, 6) ... datetime.date(2016, 12, 28)
 datetime.date(2016, 12, 29) datetime.date(2016, 12, 30)]


In [7]:
def batch_dates(target_df, _add_missing_symbols=True):

    df = target_df.copy()
        
    df['date'] = df['date'].apply(date_to_int)
    df['symbol'] = df['symbol'].apply(symbol_to_int)


    df.sort_values(['date', 'symbol'], inplace=True)



    for date, group in df.groupby('date'):
        if _add_missing_symbols:
            group = add_missing_symbols(date, group)
        yield date, group

def add_missing_symbols(date, group):
    missing_symbols = [s_int for s in all_symbols if (s_int:=symbol_to_int(s)) not in group['symbol'].unique()]
    if len(missing_symbols) > 0:
        group = pd.concat([group, pd.DataFrame({'date': [date] * len(missing_symbols), 'symbol': missing_symbols})])
        group = group.sort_values(['date', 'symbol'])
        group = group.fillna(0)

    return group


In [10]:
for date, batch in batch_dates(prices_split_adjusted_df.sort_values('date').head(1000), _add_missing_symbols=False):
    print(add_missing_symbols(date, batch).shape)

(501, 7)
(501, 7)
(501, 7)


In [33]:
class Embdedder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim):
        super(Embdedder, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.layers = torch.nn.Sequential()
        if isinstance(hidden_dims, int):
            hidden_dims = [hidden_dims]
        elif hidden_dims is None or len(hidden_dims) == 0:
            self.hidden_dims = []
            self.layers.add_module("linear0", torch.nn.Linear(input_dim, output_dim))
            self.layers.add_module("sigmoid", torch.nn.Sigmoid())
            return

        self.hidden_dims = hidden_dims
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dims[0]),
            torch.nn.ReLU()
        )
        for i in range(len(hidden_dims)):
            if i == len(hidden_dims) - 1:
                self.layers.add_module(f"linear{i}", torch.nn.Linear(hidden_dims[i], output_dim))
                self.layers.add_module(f"sigmoid", torch.nn.Sigmoid())
            else:
                self.layers.add_module(f"linear{i}", torch.nn.Linear(hidden_dims[i], hidden_dims[i + 1]))
                self.layers.add_module(f"relu{i}", torch.nn.ReLU())

    def forward(self, x):
        print(x.shape, x.type())
        return self.layers(x)
            

class GraphConstructor(nn.Module):
    def __init__(self, num_nodes, feature_dim):
        super(GraphConstructor, self).__init__()
        self.num_nodes = num_nodes
        self.feature_dim = feature_dim
        
        # self.embedder = embedder or Embdedder(feature_dim, [feature_dim * 2], feature_dim * 4)
        
        self.feature_dim = feature_dim #self.embedder.output_dim

        self.query = nn.Linear(self.feature_dim, self.feature_dim)
        self.key = nn.Linear(self.feature_dim, self.feature_dim)
        self.value = nn.Linear(self.feature_dim, self.num_nodes)


    def forward(self, x):
        # x: [num_nodes, feature_dim]

        # embeddings = self.embedder(x) # shape: [num_nodes, feature_dim]

        query = self.query(x) # shape: [num_nodes, feature_dim]
        key = self.key(x) # shape: [num_nodes, feature_dim]
        value = self.value(x) # shape: [num_nodes, feature_dim]

        # shape: [num_nodes, num_nodes]
        attention = torch.matmul(query, key.T) / self.feature_dim ** 0.5
        attention = torch.softmax(attention, dim=1)

        # shape: [num_nodes, latent_dim]
        edge_weight = torch.matmul(attention, value)
        edge_weight = torch.sigmoid(edge_weight)
        return edge_weight


class Predictor(nn.Module):
    def __init__(self,  num_nodes, feature_dim, hidden_dim=None):
        super(Predictor, self).__init__()
        self.num_nodes = num_nodes
        self.feature_dim = feature_dim

        self.hidden_dim = hidden_dim or self.feature_dim

        self.graph_constructor = GraphConstructor(num_nodes, self.feature_dim)

        self.threshold = nn.Parameter(torch.tensor(0.5))
    
        self.gcn = GCN(feature_dim, feature_dim, num_layers=2, out_channels=self.hidden_dim, dropout=0.1)
        self.gru = nn.GRUCell(self.hidden_dim, self.hidden_dim)
        self.linear = nn.Linear(self.hidden_dim, 1)


    def forward(self, x, hidden_state=None):
        # x: [date_batch_size, embedding_dim]
        # embeddings = self.embedder(x) # shape: [date_batch_size, latent_dim]

        A = self.graph_constructor(x) # adjacency matrix, shape: [date_batch_size, date_batch_size]
        # print("A", A)
        A = A > self.threshold
        
        # convert Adjacency matrix to edge index
        edge_index = torch.nonzero(A, as_tuple=False).t()

        x = self.gcn(x, edge_index)
        if hidden_state is None:
            hidden_state = torch.zeros(x.shape[0], self.hidden_dim).to(device)
        h_next = self.gru(x, hidden_state)
        y = self.linear(h_next)
        y = torch.tanh(y)

        return y, h_next
        

In [34]:
d = None
for _, batch in batch_dates(prices_split_adjusted_df.sort_values('date').head(1000)):
    d = batch
    break

num_nodes = len(all_symbols)
emb_dim = 5 * 4
embedder = Embdedder(5, [5 * 2], emb_dim)
model = Predictor(num_nodes, emb_dim).to(device)

d = d.drop(columns=['date', 'symbol']).to_numpy()

d = torch.tensor(d).to(device, dtype=torch.float32)

print(d.shape)
y, hidden_state = model(embedder(d))
print("y", y.shape, y)
print("hidden_state", hidden_state.shape)

total_params = sum(p.numel() for p in model.parameters())
print(total_params)




torch.Size([501, 5])
torch.Size([501, 5]) torch.FloatTensor
y torch.Size([501, 1]) tensor([[-0.1575],
        [-0.3256],
        [-0.3256],
        [-0.2298],
        [-0.2224],
        [-0.3256],
        [-0.2121],
        [-0.3256],
        [-0.3256],
        [-0.2298],
        [-0.3256],
        [-0.3256],
        [-0.2237],
        [-0.2060],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.2015],
        [-0.2224],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.3256],
        [-0.2298],
        [-0.3256],
        [-0.2298],
        [-0.3256],
        [-0.3256],
        [-0.2009],
        [-0.2237],
        [-0.2298],
        [-0.2298],
        [-0.2097],
        [-0.2298],
        [-0.2376],
        [-0.2298],
        [-0.2298],
        [-0.2298],
        [-0.3256],
        [-0.3256],
        [-0.2298],
        [-0.2298],
        [-0.2060],
        [-0.2290],
     

In [None]:
def train(predictor, train_loader, optimizer, device):
    loss_fn = torch.nn.MSELoss()
    
    predictor.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = predictor(data)
        loss = criterion(output, data)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss