In [1]:
import pymongo
import pandas as pd
from pytz import timezone
import numpy as np
import matplotlib.pyplot as plt
import time
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split
import torch_geometric
import torch_geometric_temporal 

In [2]:
# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["stock_data"]
collection = db["price_data"]

# Query all documents from the collection
results = collection.find({})

# Convert the query results to a pandas DataFrame
data = pd.DataFrame(list(results))

In [3]:
stocks = data.copy()

# Drop unnecessary columns
columns_to_drop = ["_id", "Adj Close"]
stocks = stocks.drop(columns=columns_to_drop)

# Create Change
stocks['Change'] = stocks['Close'] - stocks['Open']

# Drop 'Open' if it's no longer needed
stocks = stocks.drop(columns=['Open', 'High', 'Low'])

# Define rolling window sizes in minutes
window_sizes = {
    '10min': 10,
    '60min': 60,
    '3hr': 3 * 60,  # 3 hours in minutes
    '1day': 24 * 60,  # 1 day in minutes
    '5day': 5 * 24 * 60  # 5 days in minutes
}

# Calculate the moving averages for all stocks
for window_name, minutes in window_sizes.items():
    stocks[f'{window_name}_MA'] = stocks.groupby('ticker')['Close'].rolling(window=minutes, min_periods=1).mean().reset_index(level=0, drop=True)

# Fill NaN values with the first available value if there are any NaNs
stocks.fillna(method='bfill', inplace=True)

# Convert 'Datetime' to Eastern Time
eastern = timezone('US/Eastern')
stocks['Datetime_ET'] = stocks['Datetime'].dt.tz_localize('UTC').dt.tz_convert(eastern)

# Extract time features from 'Datetime_ET'
stocks['hour_of_day'] = stocks['Datetime_ET'].dt.hour + stocks['Datetime_ET'].dt.minute / 60
stocks['hour_of_day_normalized'] = 2 * np.pi * stocks['hour_of_day'] / 24
stocks['hour_sin'] = np.sin(stocks['hour_of_day_normalized'])
stocks['hour_cos'] = np.cos(stocks['hour_of_day_normalized'])

# Now convert 'Datetime' to UNIX timestamp if needed
stocks['Timestamp'] = stocks['Datetime'].astype('int64') // 1e9

# Drop the original 'Datetime' and 'Datetime_ET' if they are no longer needed
stocks = stocks.drop(columns=['Datetime', 'Datetime_ET'])

  stocks.fillna(method='bfill', inplace=True)


In [4]:
stocks.sort_values(['ticker', 'Timestamp'], inplace=True)

In [5]:
# Take 5 ticker symbols to validate code

# List of tickers we want to keep
tickers_to_keep = ['AAPL', 'ADBE', 'AMZN', 'MSFT', 'NVDA']

# Create a new DataFrame with only the specified tickers
five_stocks = stocks[stocks['ticker'].isin(tickers_to_keep)].copy()

# Now, five_stocks contains only the data for the five specified tickers
five_stocks.sort_values(['ticker', 'Timestamp'], inplace=True)
five_stocks.head(5)

Unnamed: 0,Close,Volume,ticker,Change,10min_MA,60min_MA,3hr_MA,1day_MA,5day_MA,hour_of_day,hour_of_day_normalized,hour_sin,hour_cos,Timestamp
269013,171.110001,1386526,AAPL,-0.110001,171.110001,171.110001,171.110001,171.110001,171.110001,9.5,2.487094,0.608761,-0.793353,1696253000.0
269014,172.300003,467065,AAPL,1.184998,171.705002,171.705002,171.705002,171.705002,171.705002,9.516667,2.491458,0.605294,-0.796002,1696253000.0
269015,172.619995,345839,AAPL,0.304993,172.01,172.01,172.01,172.01,172.01,9.533333,2.495821,0.601815,-0.798636,1696254000.0
269016,172.779999,384766,AAPL,0.160004,172.202499,172.202499,172.202499,172.202499,172.202499,9.55,2.500184,0.598325,-0.801254,1696254000.0
269017,172.990005,345982,AAPL,0.230011,172.360001,172.360001,172.360001,172.360001,172.360001,9.566667,2.504547,0.594823,-0.803857,1696254000.0


# from other notebooks
- now have a dataframe with data for five stocks
- lets start a GNN model

In [6]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

In [7]:
columns = five_stocks.columns.tolist()  # Convert Index to a list
print(columns)

['Close', 'Volume', 'ticker', 'Change', '10min_MA', '60min_MA', '3hr_MA', '1day_MA', '5day_MA', 'hour_of_day', 'hour_of_day_normalized', 'hour_sin', 'hour_cos', 'Timestamp']


In [8]:
# List of columns to be used as features
feature_columns = [
    'Volume', 'Change', '10min_MA', '60min_MA', '3hr_MA', '1day_MA', '5day_MA',
    'hour_of_day_normalized', 'hour_sin', 'hour_cos'
]

# Convert DataFrame to PyTorch tensors
features = torch.tensor(five_stocks[feature_columns].values, dtype=torch.float)
targets = torch.tensor(five_stocks['Close'].values, dtype=torch.float).view(-1, 1)

In [9]:
# Define the edges for nearest neighbors
def nearest_neighbors_edge_index(num_nodes, k=5):
    edge_index = []
    for i in range(num_nodes):
        for j in range(max(0, i-k), min(num_nodes, i+k+1)):
            if i != j:
                edge_index.append((i, j))
    return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Get the edge_index for the nearest neighbors THIS IS WHAT I MADE SMALLER
edge_index = nearest_neighbors_edge_index(features.size(0), k=5)

# Create the graph data structure
data = Data(x=features, edge_index=edge_index, y=targets)

# Define a simple GNN model
class GCN(torch.nn.Module):
    def __init__(self, num_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x.squeeze()  # Adjusted to ensure output shape matches target shape

model = GCN(data.num_features)

# Define your loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [10]:
# Train the model
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, data.y.squeeze())  # Make sure the target tensor is the correct shape
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 273045280.0
Epoch 1, Loss: 194619904.0
Epoch 2, Loss: 132721072.0
Epoch 3, Loss: 85464200.0
Epoch 4, Loss: 56265248.0
Epoch 5, Loss: 36387760.0
Epoch 6, Loss: 22945220.0
Epoch 7, Loss: 13262156.0
Epoch 8, Loss: 6676541.5
Epoch 9, Loss: 2736124.5
Epoch 10, Loss: 769573.875
Epoch 11, Loss: 76176.25
Epoch 12, Loss: 344858.75
Epoch 13, Loss: 1271091.875
Epoch 14, Loss: 2558934.0
Epoch 15, Loss: 3948204.5
Epoch 16, Loss: 4974542.0
Epoch 17, Loss: 5714764.0
Epoch 18, Loss: 6164210.0
Epoch 19, Loss: 6318521.0
Epoch 20, Loss: 6204498.5
Epoch 21, Loss: 5870858.0
Epoch 22, Loss: 5364675.5
Epoch 23, Loss: 4747608.0
Epoch 24, Loss: 4074516.5
Epoch 25, Loss: 3391091.0
Epoch 26, Loss: 2734122.5
Epoch 27, Loss: 2130990.0
Epoch 28, Loss: 1599832.75
Epoch 29, Loss: 1150516.5
Epoch 30, Loss: 785978.4375
Epoch 31, Loss: 503736.84375
Epoch 32, Loss: 297379.96875
Epoch 33, Loss: 157936.84375
Epoch 34, Loss: 75045.09375
Epoch 35, Loss: 37890.88671875
Epoch 36, Loss: 35918.078125
Epoch 37, Los

In [13]:
# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(data)

# Calculate Mean Squared Error
mse = criterion(predictions, data.y.squeeze()).item()
print(f'Mean Squared Error: {mse}')

# Optionally, compare the first few predictions with the actual values
for i in range(min(5, predictions.size(0))):  # Just show the first 5 predictions
    print(f'Predicted value: {predictions[i].item()} - Actual value: {data.y[i].item()}')

Mean Squared Error: 16.28984832763672
Predicted value: 138.2963409423828 - Actual value: 171.11000061035156
Predicted value: 148.4610137939453 - Actual value: 172.3000030517578
Predicted value: 157.609375 - Actual value: 172.6199951171875
Predicted value: 166.02310180664062 - Actual value: 172.77999877929688
Predicted value: 173.89646911621094 - Actual value: 172.99000549316406
