In [7]:
import pandas as pd
import numpy as np

In [12]:
df_sent_faiss = pd.read_csv(
    "../data/aapl_daily_sentiment_faiss_mean.csv",
    parse_dates=["date"]
)

df_sent_faiss = df_sent_faiss.sort_values("date").reset_index(drop=True)


In [17]:
df_sent_raw = pd.read_csv(
    "../data/aapl_daily_sentiment_raw_mean.csv",
    parse_dates=["date"]
)

df_sent_raw = df_sent_raw.sort_values("date").reset_index(drop=True)

In [21]:
df_sent_raw_w = pd.read_csv(
    "../data/aapl_daily_sentiment_raw_weighted.csv",
    parse_dates=["date"]
)

df_sent_raw_w = df_sent_raw_w.sort_values("date").reset_index(drop=True)

df_sent_faiss_w = pd.read_csv(
    "../data/aapl_daily_sentiment_faiss_weighted.csv",
    parse_dates=["date"]
)

df_sent_faiss_w = df_sent_faiss_w.sort_values("date").reset_index(drop=True)

In [8]:
df_market = pd.read_csv(
    "../data/sp500_features.csv",
    parse_dates=["date"]
)

# filter to AAPL only
df_market = df_market[df_market["ticker"] == "AAPL"] \
    .sort_values("date") \
    .reset_index(drop=True)

# ensure sorted
df_market = df_market.sort_values(["ticker", "date"]).reset_index(drop=True)

# compute next-day log return per ticker
df_market["y_reg"] = (
    np.log(df_market.groupby("ticker")["close"].shift(-1))
    - np.log(df_market["close"])
)

# drop last day per ticker (no target)
df_market = df_market.dropna(subset=["y_reg"]).reset_index(drop=True)


In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]


In [33]:
# build dates alongside sequences
def make_sequences_with_dates(X, y, dates, window):
    Xs, ys, ds = [], [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
        ds.append(dates[i + window])
    return np.array(Xs), np.array(ys), np.array(ds)


In [34]:
dates = df_market["date"].values

X_seq, y_seq, d_seq = make_sequences_with_dates(
    X, y, dates, WINDOW
)

# same split
X_test, y_test, d_test = (
    X_seq[val_end:], 
    y_seq[val_end:], 
    d_seq[val_end:]
)


In [40]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# sentiment columns
sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]

# index sentiment by date
df_sent_idx = df_sent_raw.copy()
df_sent_idx["date"] = pd.to_datetime(df_sent_idx["date"])
df_sent_idx = df_sent_idx.set_index("date")

# index market by date
df_market_idx = df_market.copy()
df_market_idx["date"] = pd.to_datetime(df_market_idx["date"])
df_market_idx = df_market_idx.set_index("date")


In [41]:
# training dates = before validation split
train_dates = d_seq[:val_end]

# testing dates = LSTM test dates
test_dates = d_test


In [42]:
# sentiment features
X_sent_train = (
    df_sent_idx
    .reindex(train_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

X_sent_test = (
    df_sent_idx
    .reindex(test_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

# targets (returns)
y_sent_train = (
    df_market_idx
    .reindex(train_dates)["y_reg"]
    .values
)

y_sent_test = (
    df_market_idx
    .reindex(test_dates)["y_reg"]
    .values
)


In [49]:
lr = LinearRegression()
lr.fit(X_sent_train, y_sent_train)

y_pred_sent = lr.predict(X_sent_test)

mse_sent_only = mean_squared_error(y_sent_test, y_pred_sent)

print("Sentiment-only Linear Test MSE:", mse_sent_only)


Sentiment-only Linear Test MSE: 0.00018013154049815604


In [58]:
df_sent_raw_lag = df_sent_raw.copy()
df_sent_raw_lag[sentiment_cols] = (
    df_sent_raw_lag
    .sort_values("date")[sentiment_cols]
    .shift(1)
)



In [59]:
# use lagged sentiment
df_sent_idx_lag = df_sent_raw_lag.copy()
df_sent_idx_lag["date"] = pd.to_datetime(df_sent_idx_lag["date"])
df_sent_idx_lag = df_sent_idx_lag.set_index("date")

# rebuild train/test sentiment matrices
X_sent_train = (
    df_sent_idx_lag
    .reindex(train_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

X_sent_test = (
    df_sent_idx_lag
    .reindex(test_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

# targets unchanged
y_sent_train = (
    df_market_idx
    .reindex(train_dates)["y_reg"]
    .values
)

y_sent_test = (
    df_market_idx
    .reindex(test_dates)["y_reg"]
    .values
)

# train + eval
lr_lag = LinearRegression()
lr_lag.fit(X_sent_train, y_sent_train)

y_pred_sent_lag = lr_lag.predict(X_sent_test)
mse_sent_lag = mean_squared_error(y_sent_test, y_pred_sent_lag)

print("Sentiment-only (lagged) Test MSE:", mse_sent_lag)


Sentiment-only (lagged) Test MSE: 0.0001801214849332649


In [None]:
print(
    "market preds:", len(y_pred_market),
    "sentiment preds:", len(y_pred_sent),
    "true y:", len(y_test)
)


market preds: 324 sentiment preds: 324 true y: 324


In [28]:
# =========================================
# Market-Only LSTM (Baseline)
# =========================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# -----------------------
# CONFIG
# -----------------------
WINDOW = 90
EPOCHS = 10
BATCH_SIZE = 32
LR = 1e-3

device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

# -----------------------
# SORT DATA
# -----------------------
df = df_market.sort_values("date").reset_index(drop=True)

X = df[market_cols].values
y = df["y_reg"].values

# -----------------------
# SEQUENCE BUILDER
# -----------------------
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = make_sequences(X, y, WINDOW)

# -----------------------
# TRAIN / TEST SPLIT
# -----------------------
n = len(X_seq)
train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_test,  y_test  = X_seq[val_end:],  y_seq[val_end:]

train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

test_ds = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

# -----------------------
# MODEL
# -----------------------
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1]).squeeze()

model_market = LSTMRegressor(input_dim=len(market_cols)).to(device)
optimizer = torch.optim.Adam(model_market.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# -----------------------
# TRAIN
# -----------------------
for epoch in range(EPOCHS):
    model_market.train()
    total_loss = 0.0

    for xb, yb in tqdm(
        train_loader,
        desc=f"Epoch {epoch+1}/{EPOCHS} [Market-only]",
        leave=False
    ):
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model_market(xb), yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {total_loss / len(train_loader):.6f}")

# -----------------------
# EVALUATE
# -----------------------
model_market.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in tqdm(test_loader, desc="Evaluating Market-only", leave=False):
        xb = xb.to(device)
        preds.append(model_market(xb).cpu().numpy())
        trues.append(yb.numpy())

y_pred_market = np.concatenate(preds)
y_true_market = np.concatenate(trues)

mse_market = mean_squared_error(y_true_market, y_pred_market)

print("=================================")
print("Market-only LSTM Test MSE:", mse_market)


Epoch 1/10 [Market-only]:   0%|          | 0/48 [00:00<?, ?it/s]

                                                                          

Epoch 1/10 | Train MSE: 0.000449


                                                                          

Epoch 2/10 | Train MSE: 0.000356


                                                                          

Epoch 3/10 | Train MSE: 0.000356


                                                                          

Epoch 4/10 | Train MSE: 0.000360


                                                                          

Epoch 5/10 | Train MSE: 0.000372


                                                                          

Epoch 6/10 | Train MSE: 0.000352


                                                                          

Epoch 7/10 | Train MSE: 0.000370


                                                                          

Epoch 8/10 | Train MSE: 0.000362


                                                                          

Epoch 9/10 | Train MSE: 0.000359


                                                                           

Epoch 10/10 | Train MSE: 0.000367


                                                              

Market-only LSTM Test MSE: 0.0001916165347211063




In [55]:
# =========================================
# Market + Sentiment LSTM (Early Fusion)
# =========================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error

# -----------------------
# CONFIG
# -----------------------
WINDOW = 90
EPOCHS = 10
BATCH_SIZE = 32
LR = 1e-3

device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]

# -----------------------
# CHOOSE SENTIMENT SOURCE
# -----------------------
# Use ONE of these:
# df_sent = df_sent_faiss.copy()   # ← FAISS sentiment
df_sent = df_sent_raw_w.copy()   # ← RAW sentiment

# -----------------------
# MERGE MARKET + SENTIMENT
# -----------------------
df = df_market.merge(
    df_sent,
    on="date",
    how="left"
).sort_values("date").reset_index(drop=True)

# fill missing sentiment days with zeros
df[sentiment_cols] = df[sentiment_cols].fillna(0.0)

feature_cols = market_cols + sentiment_cols

X = df[feature_cols].values
y = df["y_reg"].values

# -----------------------
# SEQUENCE BUILDER
# -----------------------
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = make_sequences(X, y, WINDOW)

# -----------------------
# TRAIN / VAL / TEST SPLIT
# -----------------------
n = len(X_seq)
train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_test,  y_test  = X_seq[val_end:],  y_seq[val_end:]

train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

test_ds = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

# -----------------------
# LSTM MODEL
# -----------------------
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1]).squeeze()

model = LSTMRegressor(input_dim=len(feature_cols)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# -----------------------
# TRAIN
# -----------------------
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {total_loss / len(train_loader):.6f}")

# -----------------------
# EVALUATE
# -----------------------
model.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds.append(model(xb).cpu().numpy())
        trues.append(yb.numpy())

y_pred = np.concatenate(preds)
y_true = np.concatenate(trues)

mse_fused = mean_squared_error(y_true, y_pred)

print("=================================")
print("Market + Sentiment LSTM Test MSE:", mse_fused)


Epoch 1/10 | Train MSE: 0.000534
Epoch 2/10 | Train MSE: 0.000364
Epoch 3/10 | Train MSE: 0.000360
Epoch 4/10 | Train MSE: 0.000358
Epoch 5/10 | Train MSE: 0.000381
Epoch 6/10 | Train MSE: 0.000361
Epoch 7/10 | Train MSE: 0.000357
Epoch 8/10 | Train MSE: 0.000367
Epoch 9/10 | Train MSE: 0.000360
Epoch 10/10 | Train MSE: 0.000356
Market + Sentiment LSTM Test MSE: 0.00018125210772268474


In [48]:
alpha = 0.5

y_pred_late = alpha * y_pred_market + (1 - alpha) * y_pred_sent

mse_late = mean_squared_error(y_test, y_pred_late)

print("Late Fusion Test MSE:", mse_late)


Late Fusion Test MSE: 0.000183432176223541
