In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn


class LSTMModel(nn.Module):
    """
    A simple LSTM model for sequence data.

    The input sequence is run through a stack of LSTM layers; the output of the
    top layer at the last time step is then projected by a linear layer.
    """

    def __init__(
        self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5
    ):
        """
        :param input_size: The number of expected features in the input (e.g., word embedding dimension).
        :param hidden_size: The number of features in the hidden state h.
        :param num_layers: The number of recurrent layers.
        :param output_size: The number of output features (e.g., number of classes).
        :param dropout_rate: Dropout probability passed to nn.LSTM. It is only
            forwarded when num_layers > 1; for a single layer 0.0 is passed instead.
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # nn.LSTM applies dropout between stacked layers only, and warns when a
        # non-zero value is given for a single-layer network. Passing 0.0 in that
        # case keeps the computation identical and avoids the warning.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0

        # 1. The core LSTM layer
        # batch_first=True makes input/output tensors shaped as (batch_size, seq_len, features)
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # 2. Fully connected layer mapping the hidden representation to the output
        self.fc = nn.Linear(hidden_size, output_size)

        # 3. Optional: an activation (e.g. nn.Sigmoid()) can be added here and applied
        #    at the end of forward() when the task needs bounded outputs.

    def forward(self, x):
        """
        Run a batch of sequences through the network.

        :param x: Tensor of shape (batch_size, seq_len, input_size).
        :return: Tensor of shape (batch_size, output_size), computed from the
            top LSTM layer's output at the last time step.
        """
        batch_size = x.size(0)

        # Zero initial hidden and cell states, each (num_layers, batch_size, hidden_size).
        # new_zeros allocates directly with x's device AND dtype, so the model also
        # works after .double()/.half() instead of failing on a dtype mismatch.
        h0 = x.new_zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = x.new_zeros(self.num_layers, batch_size, self.hidden_size)

        # output: (batch_size, seq_len, hidden_size) - top layer, every time step.
        # The final (hn, cn) states are not needed here.
        output, _ = self.lstm(x, (h0, c0))

        # output[:, -1, :] selects the last time step for every sequence in the batch.
        # For this unidirectional LSTM it equals hn[-1].
        out = self.fc(output[:, -1, :])

        return out

In [None]:
# --- Hyperparameters for the shape smoke test ---
INPUT_DIM = 100  # features per time step (e.g., embedding dimension)
HIDDEN_DIM = 128  # size of the LSTM hidden state
NUM_LAYERS = 2  # stacked LSTM layers
OUTPUT_DIM = 1  # single output: binary classification logit or a regression value
SEQUENCE_LEN = 20  # time steps per sequence (words, minutes, etc.)
BATCH_SIZE = 32  # sequences per batch

# --- Build the network from the hyperparameters above ---
lstm_kwargs = {
    "input_size": INPUT_DIM,
    "hidden_size": HIDDEN_DIM,
    "num_layers": NUM_LAYERS,
    "output_size": OUTPUT_DIM,
}
model = LSTMModel(**lstm_kwargs)

# --- Synthetic batch ---
# Random data shaped (batch, time steps, features) = (32, 20, 100).
dummy_shape = (BATCH_SIZE, SEQUENCE_LEN, INPUT_DIM)
dummy_input = torch.randn(dummy_shape)

# --- Single forward pass ---
predictions = model(dummy_input)

# --- Report shapes ---
# The output should be (32, 1): one prediction with one feature per sequence.
print(f"\nInput shape: {dummy_input.shape}")
print(f"Output shape: {predictions.shape}")


Input shape: torch.Size([32, 20, 100])
Output shape: torch.Size([32, 1])


In [None]:
# Imported at the top of the cell so the dependency is visible before any work starts.
from sklearn.model_selection import train_test_split

# --- Load raw data ---
stock_price_by_min = pd.read_csv("../data/raw/TSLA_5_year_by_min.csv")
stock_price_by_min_2018_2020 = pd.read_csv("../data/raw/TSLA_2018_to_2020.csv")
tweets = pd.read_csv("../data/raw/elonmusk_raw.csv")

# Stack both price files. drop_duplicates() only removes rows that are identical in
# every column.
# NOTE(review): if the two files overlap in time with slightly different values,
# duplicate timestamps would survive here - confirm against the raw data.
stock_price_by_min = pd.concat(
    [stock_price_by_min, stock_price_by_min_2018_2020]
).drop_duplicates()


# Length of the prediction window, in minutes, measured from the tweet's window start.
NUM_MINUTES_AHEAD = 5

tweet_wanted_attributes = ["createdAt", "isRetweet", "isReply", "isQuote", "fullText"]
stock_wanted_attributes = ["ts_event", "open", "close"]

# --- Tweets: keep only original tweets (no retweets, replies or quotes) ---
is_original = (
    tweets["isRetweet"].eq(False)
    & tweets["isReply"].eq(False)
    & tweets["isQuote"].eq(False)
)
# .copy() gives X its own data, so the column assignments below do not write into a
# slice of `tweets` (which would raise SettingWithCopyWarning).
X = tweets.loc[is_original, tweet_wanted_attributes].copy()
print(f"Number of original tweets: {len(X)} / {len(tweets)}")

# converting createdAt to datetime
X["createdAt"] = pd.to_datetime(X["createdAt"])

# Cyclic encoding of day-of-week and hour-of-day, computed on whole columns
# rather than element by element.
day_angle = 2 * np.pi * X["createdAt"].dt.dayofweek / 7
hour_angle = 2 * np.pi * X["createdAt"].dt.hour / 24
X["day_sin"] = np.sin(day_angle)
X["day_cos"] = np.cos(day_angle)
X["hour_sin"] = np.sin(hour_angle)
X["hour_cos"] = np.cos(hour_angle)

# --- Prices: parse timestamps and keep the columns needed for the joins ---
stock_price_by_min["ts_event"] = pd.to_datetime(stock_price_by_min["ts_event"])
y = stock_price_by_min[stock_wanted_attributes]

# The window starts at the tweet time rounded up to the next minute and ends
# NUM_MINUTES_AHEAD minutes later.
X["window_start_time"] = X["createdAt"].dt.ceil("min")
X["window_end_time"] = X["window_start_time"] + pd.Timedelta(minutes=NUM_MINUTES_AHEAD)

# merge_asof requires both sides to be sorted on their join keys.
X = X.sort_values("createdAt").reset_index(drop=True)
y = y.sort_values("ts_event").reset_index(drop=True)

# Keep only tweets whose window starts at or after the first available price bar.
X = X[y["ts_event"].min() <= X["window_start_time"]]
# filter to weekdays and market hours (data is in UTC)
# this range is a little wider to account for Daylight Savings Time changes
X = X[
    (X["createdAt"].dt.dayofweek < 5)
    & (X["createdAt"].dt.hour >= 13)
    & (X["createdAt"].dt.hour < 21)
]

# --- Join prices at the window start and the window end ---
# direction="forward" picks the first bar at or after the key.
# NOTE(review): no `tolerance` is set, so a window that falls after the last bar of a
# session is matched to the next available bar (possibly the next trading day).
# Consider tolerance=pd.Timedelta(...) if that is not intended.
data = pd.merge_asof(
    X, y, left_on="window_start_time", right_on="ts_event", direction="forward"
)
# The overlapping price columns from the two joins get "_start" / "_end" suffixes.
data = pd.merge_asof(
    data,
    y,
    left_on="window_end_time",
    right_on="ts_event",
    direction="forward",
    suffixes=("_start", "_end"),
)

# --- Feature: mean close over the 5 minutes BEFORE each bar ---
# closed="left" excludes the current bar from its own window; min_periods=1 allows
# windows that contain a single bar.
y_for_merge = y.set_index("ts_event").copy()
y_for_merge["avg_last_5_prices"] = (
    y_for_merge["close"].rolling(window="5min", closed="left", min_periods=1).mean()
)
# direction="backward" attaches the value from the latest bar at or before the window start.
data = pd.merge_asof(
    data,
    y_for_merge["avg_last_5_prices"].reset_index(),
    left_on="window_start_time",
    right_on="ts_event",
    direction="backward",
)

# Re-express the trailing average relative to the window-start open, in basis points.
data["avg_last_5_prices"] = (
    10000 * (data["avg_last_5_prices"] - data["open_start"]) / data["open_start"]
)

# Target: % change from window-start open to window-end close, in basis points
# (0.01%) so we don't have to deal with small decimals.
data["bps_change"] = (
    10000 * (data["close_end"] - data["open_start"]) / data["open_start"]
)

# Drop the extra ts_event column created by the last merge_asof
data = data.drop(columns=["ts_event"])

# Drop the raw columns that were only needed to build the features and the target.
data = data.drop(
    columns=[
        "createdAt",
        "isRetweet",
        "isReply",
        "isQuote",
        "ts_event_start",
        "ts_event_end",
        "open_start",
        "open_end",
        "close_start",
        "close_end",
        "window_start_time",
        "window_end_time",
    ]
)
# Reassign instead of inplace=True.
data = data.dropna()

# --- Split into train / validation / test ---
# Select the target by name rather than by column position.
X, y = data.drop(columns=["bps_change"]), data["bps_change"]

# temporal split - no shuffling. 70-15-15 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, shuffle=False
)

# Sanity check: the three splits partition the dataset.
assert len(X_train) + len(X_val) + len(X_test) == len(X)