In [1]:
#Temporal Graph Network 

In [20]:
import pymongo
import pandas as pd
from pytz import timezone
import numpy as np
import matplotlib.pyplot as plt
import time
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

In [6]:
# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["stock_data"]
collection = db["price_data"]

# Query all documents from the collection
results = collection.find({})

# Convert the query results to a pandas DataFrame
data = pd.DataFrame(list(results))

stocks = data.copy()

# Drop unnecessary columns
columns_to_drop = ["_id", "Adj Close"]
stocks = stocks.drop(columns=columns_to_drop)

# Create Change
stocks['Change'] = stocks['Close'] - stocks['Open']

# Drop 'Open' if it's no longer needed
stocks = stocks.drop(columns=['Open', 'High', 'Low'])

# Define rolling window sizes in minutes
window_sizes = {
    '10min': 10,
    '60min': 60,
    '3hr': 3 * 60,  # 3 hours in minutes
    '1day': 24 * 60,  # 1 day in minutes
    '5day': 5 * 24 * 60  # 5 days in minutes
}

# Calculate the moving averages for all stocks
for window_name, minutes in window_sizes.items():
    stocks[f'{window_name}_MA'] = stocks.groupby('ticker')['Close'].rolling(window=minutes, min_periods=1).mean().reset_index(level=0, drop=True)

# Fill NaN values with the first available value if there are any NaNs
stocks.fillna(method='bfill', inplace=True)

# Convert 'Datetime' to Eastern Time
eastern = timezone('US/Eastern')
stocks['Datetime_ET'] = stocks['Datetime'].dt.tz_localize('UTC').dt.tz_convert(eastern)

# Extract time features from 'Datetime_ET'
stocks['hour_of_day'] = stocks['Datetime_ET'].dt.hour + stocks['Datetime_ET'].dt.minute / 60
stocks['hour_of_day_normalized'] = 2 * np.pi * stocks['hour_of_day'] / 24
stocks['hour_sin'] = np.sin(stocks['hour_of_day_normalized'])
stocks['hour_cos'] = np.cos(stocks['hour_of_day_normalized'])

# Now convert 'Datetime' to UNIX timestamp if needed
stocks['Timestamp'] = stocks['Datetime'].astype('int64') // 1e9

# Drop the original 'Datetime' and 'Datetime_ET' if they are no longer needed
stocks = stocks.drop(columns=['Datetime', 'Datetime_ET'])

stocks.sort_values(['ticker', 'Timestamp'], inplace=True)

# Take 5 ticker symbols to validate code

# List of tickers we want to keep
tickers_to_keep = ['AAPL', 'ADBE', 'AMZN', 'MSFT', 'NVDA']

# Create a new DataFrame with only the specified tickers
five_stocks = stocks[stocks['ticker'].isin(tickers_to_keep)].copy()

# Now, five_stocks contains only the data for the five specified tickers
five_stocks.sort_values(['ticker', 'Timestamp'], inplace=True)
five_stocks.head(5)

# now have a dataframe with data for five stocks
columns = five_stocks.columns.tolist()  # Convert Index to a list
print(columns)

  stocks.fillna(method='bfill', inplace=True)


['Close', 'Volume', 'ticker', 'Change', '10min_MA', '60min_MA', '3hr_MA', '1day_MA', '5day_MA', 'hour_of_day', 'hour_of_day_normalized', 'hour_sin', 'hour_cos', 'Timestamp']


In [10]:
# Assuming 'five_stocks' has columns: 'timestamp', 'ticker', 'feature1', 'feature2', ..., 'Close'

# Convert the timestamps to numerical format (e.g., seconds since some start time)
five_stocks['Timestamp'] = pd.to_datetime(five_stocks['Timestamp'])
five_stocks['Timestamp'] = (five_stocks['Timestamp'] - five_stocks['Timestamp'].min()).dt.total_seconds()

# Encode tickers as node IDs
ticker_to_id = {ticker: i for i, ticker in enumerate(five_stocks['ticker'].unique())}
five_stocks['node_id'] = five_stocks['ticker'].map(ticker_to_id)

# Here, we're making an assumption that each stock is connected to itself over time.
# For a more complex network, you would define the edges based on your specific use case.
edges = five_stocks[['node_id', 'node_id', 'Timestamp', 'Close']]


In [15]:
#live by the gpt...
five_stocks['Timestamp'] = pd.to_datetime(five_stocks['Timestamp'], unit='s')

In [17]:
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

# Create a list of edge_index, edge_weight, and feature matrices for each time step
edge_indices = []
edge_weights = []
features = []
targets = []

# Group by daily data
for day, day_df in five_stocks.groupby(five_stocks['Timestamp'].dt.floor('d')):
    node_indices = day_df['node_id'].values
    # Convert list of numpy arrays to a single numpy array before creating the tensor
    edge_index = torch.tensor(np.array([node_indices, node_indices]), dtype=torch.long)
    edge_weight = torch.tensor(day_df['Close'].values, dtype=torch.float)
    feature = torch.tensor(day_df.drop(['Timestamp', 'ticker', 'node_id', 'Close'], axis=1).values, dtype=torch.float)
    target = torch.tensor(day_df['Close'].values, dtype=torch.float).view(-1, 1)
    
    edge_indices.append(edge_index)
    edge_weights.append(edge_weight)
    features.append(feature)
    targets.append(target)
    
# Create a DynamicGraphTemporalSignal
dataset = DynamicGraphTemporalSignal(edge_indices, edge_weights, features, targets)


In [None]:
''' THIS FAILS BECAUSE CANT IMPORT THE MODULE ON AN M2
import torch
from torch_geometric_temporal.nn.recurrent import TGNMemory, LSTM
from torch_geometric.nn import GCNConv

class RecurrentGCN(torch.nn.Module):
    def __init__(self, node_features, periods, memory_dimension, embedding_dimension):
        super(RecurrentGCN, self).__init__()
        self.memory = TGNMemory(
            memory_dimension=memory_dimension,
            input_dimension=embedding_dimension,
            message_dimension=embedding_dimension,
            node_features=node_features,
            periods=periods
        )
        self.embedding_dimension = embedding_dimension
        self.gcn = GCNConv(embedding_dimension, 1)

    def forward(self, x, edge_index, edge_weight):
        h = self.memory(x, edge_index, None, edge_weight)
        h = self.gcn(h, edge_index, edge_weight)
        return h

SO I HAVE TO DUMB IT DOWN
'''

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class LSTMGCN(torch.nn.Module):
    def __init__(self, node_features, lstm_hidden_dim, gcn_output_dim):
        super(LSTMGCN, self).__init__()
        self.lstm = nn.LSTM(node_features, lstm_hidden_dim, batch_first=True)
        self.gcn = GCNConv(lstm_hidden_dim, gcn_output_dim)

    def forward(self, x, edge_index):
        # Reshape for LSTM
        # Assuming x is of shape (num_nodes, num_features)
        x, _ = self.lstm(x.view(len(x), 1, -1))
        
        # Reshape back for GCN
        x = x.view(len(x), -1)
        
        # Apply GCN
        x = self.gcn(x, edge_index)
        return F.relu(x)

In [24]:
columns = five_stocks.columns.tolist()  # Convert Index to a list
print(columns)

['Close', 'Volume', 'ticker', 'Change', '10min_MA', '60min_MA', '3hr_MA', '1day_MA', '5day_MA', 'hour_of_day', 'hour_of_day_normalized', 'hour_sin', 'hour_cos', 'Timestamp', 'node_id']


In [25]:
# Selecting features
feature_columns = ['Volume', 'Change', '10min_MA', '60min_MA', '3hr_MA', 
                   '1day_MA', '5day_MA', 'hour_of_day', 'hour_of_day_normalized', 
                   'hour_sin', 'hour_cos']

# Convert features and target to tensors
features = torch.tensor(five_stocks[feature_columns].values, dtype=torch.float)
targets = torch.tensor(five_stocks['Close'].values, dtype=torch.float).view(-1, 1)


In [27]:
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F

class LSTMGCN(torch.nn.Module):
    def __init__(self, node_features, lstm_hidden_dim, gcn_output_dim):
        super(LSTMGCN, self).__init__()
        self.lstm = nn.LSTM(node_features, lstm_hidden_dim, batch_first=True)
        self.gcn = GCNConv(lstm_hidden_dim, gcn_output_dim)

    def forward(self, x, edge_index):
        # Reshape for LSTM
        x, _ = self.lstm(x.view(len(x), 1, -1))
        
        # Reshape back for GCN
        x = x.view(len(x), -1)
        
        # Apply GCN
        x = self.gcn(x, edge_index)
        return F.relu(x)

# Initialize the model
node_features = len(feature_columns)
lstm_hidden_dim = 64  # You can adjust this
gcn_output_dim = 1    # Assuming you want a single output value per node (e.g., predicted 'Close' value)

model = LSTMGCN(node_features, lstm_hidden_dim, gcn_output_dim)

In [28]:
# Calculate indices for splitting
total_samples = features.shape[0]
train_end = int(total_samples * 0.7)
validate_end = int(total_samples * 0.85)

# Split features and targets
train_features, validate_features, test_features = features[:train_end], features[train_end:validate_end], features[validate_end:]
train_targets, validate_targets, test_targets = targets[:train_end], targets[train_end:validate_end], targets[validate_end:]

In [29]:
def create_edges(num_nodes, k=5):
    edge_index = []
    for i in range(num_nodes):
        for j in range(max(0, i - k), min(num_nodes, i + k + 1)):
            if i != j:
                edge_index.append((i, j))
    return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Create edges for each split
train_edge_index = create_edges(train_features.shape[0])
validate_edge_index = create_edges(validate_features.shape[0])
test_edge_index = create_edges(test_features.shape[0])

In [33]:
import torch.optim as optim

# Assuming your model is already defined and named 'model'
# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Define the loss function
criterion = torch.nn.MSELoss()

# Define the number of epochs
epochs = 100  # Adjust as needed

# Now the training and validation loop
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Clear the gradients
    train_out = model(train_features, train_edge_index)  # Forward pass on train data
    train_loss = criterion(train_out, train_targets)  # Compute the loss on train data
    train_loss.backward()  # Backpropagation
    optimizer.step()  # Update model parameters

    # Validation step
    if epoch % 10 == 0:  # For example, validate every 10 epochs
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            validate_out = model(validate_features, validate_edge_index)  # Forward pass on validation data
            validate_loss = criterion(validate_out, validate_targets)  # Compute the loss on validation data
            print(f'Epoch {epoch}, Train Loss: {train_loss.item()}, Validation Loss: {validate_loss.item()}')


Epoch 0, Train Loss: 110870.03125, Validation Loss: 138658.5
Epoch 10, Train Loss: 108684.34375, Validation Loss: 136408.234375
Epoch 20, Train Loss: 106973.9140625, Validation Loss: 133761.15625
Epoch 30, Train Loss: 105669.515625, Validation Loss: 132056.40625
Epoch 40, Train Loss: 104419.1953125, Validation Loss: 130432.3671875
Epoch 50, Train Loss: 103081.5703125, Validation Loss: 128686.6796875
Epoch 60, Train Loss: 101823.1875, Validation Loss: 127045.25
Epoch 70, Train Loss: 100587.3515625, Validation Loss: 125432.0859375
Epoch 80, Train Loss: 99379.8046875, Validation Loss: 123852.671875
Epoch 90, Train Loss: 98198.671875, Validation Loss: 122304.3984375


In [34]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    test_out = model(test_features, test_edge_index)  # Forward pass on test data
    test_loss = criterion(test_out, test_targets)  # Compute the loss on test data
    print(f'Test Loss: {test_loss.item()}')

    # Optionally, compare the first few predicted and actual values
    for i in range(min(5, test_out.size(0))):  # Just show the first 5 predictions
        print(f'Predicted value: {test_out[i].item()} - Actual value: {test_targets[i].item()}')


Test Loss: 174494.453125
Predicted value: 21.80135154724121 - Actual value: 455.19281005859375
Predicted value: 23.036176681518555 - Actual value: 455.0799865722656
Predicted value: 24.211462020874023 - Actual value: 455.07000732421875
Predicted value: 25.33391571044922 - Actual value: 455.1300048828125
Predicted value: 26.409433364868164 - Actual value: 455.1650085449219
