In [1]:
import pandas as pd
import numpy as np

In [3]:
df_sent_faiss = pd.read_csv(
    "../data/aapl_daily_sentiment_faiss_mean.csv",
    parse_dates=["date"]
)

df_sent_faiss = df_sent_faiss.sort_values("date").reset_index(drop=True)


In [4]:
df_sent_raw = pd.read_csv(
    "../data/aapl_daily_sentiment_raw_mean.csv",
    parse_dates=["date"]
)

df_sent_raw = df_sent_raw.sort_values("date").reset_index(drop=True)

In [5]:
df_sent_raw_w = pd.read_csv(
    "../data/aapl_daily_sentiment_raw_weighted.csv",
    parse_dates=["date"]
)

df_sent_raw_w = df_sent_raw_w.sort_values("date").reset_index(drop=True)

df_sent_faiss_w = pd.read_csv(
    "../data/aapl_daily_sentiment_faiss_weighted.csv",
    parse_dates=["date"]
)

df_sent_faiss_w = df_sent_faiss_w.sort_values("date").reset_index(drop=True)

In [7]:
df_market = pd.read_csv(
    "../sp500_features.csv",
    parse_dates=["date"]
)

# filter to AAPL only
df_market = df_market[df_market["ticker"] == "AAPL"] \
    .sort_values("date") \
    .reset_index(drop=True)

# ensure sorted
df_market = df_market.sort_values(["ticker", "date"]).reset_index(drop=True)

# compute next-day log return per ticker
df_market["y_reg"] = (
    np.log(df_market.groupby("ticker")["close"].shift(-1))
    - np.log(df_market["close"])
)

# drop last day per ticker (no target)
df_market = df_market.dropna(subset=["y_reg"]).reset_index(drop=True)


In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]


In [10]:
# build dates alongside sequences
def make_sequences_with_dates(X, y, dates, window):
    Xs, ys, ds = [], [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
        ds.append(dates[i + window])
    return np.array(Xs), np.array(ys), np.array(ds)


In [12]:
display(df_market)

Unnamed: 0,date,ticker,open,high,low,close,volume,sma_10,ema_10,rsi_14,macd,vol_20,y_reg
0,2016-01-22,AAPL,24.657499,25.365000,24.592501,25.355000,263202000,24.516500,24.698870,42.357054,-0.460448,0.025491,-0.019716
1,2016-01-25,AAPL,25.379999,25.382500,24.802500,24.860001,207178000,24.578500,24.728166,39.328277,-0.419185,0.024904,0.005516
2,2016-01-26,AAPL,24.982500,25.219999,24.517500,24.997499,300308000,24.615000,24.777136,44.687492,-0.371111,0.024119,-0.067965
3,2016-01-27,AAPL,24.010000,24.157499,23.334999,23.355000,533478800,24.451500,24.518566,37.931026,-0.460243,0.028055,0.007146
4,2016-01-28,AAPL,23.447500,23.629999,23.097500,23.522499,222715200,24.369000,24.337463,45.560566,-0.511468,0.027384,0.033958
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2244,2024-12-20,AAPL,248.039993,255.000000,245.690002,254.490005,147495300,249.395001,248.879461,75.940140,5.577056,0.008878,0.003060
2245,2024-12-23,AAPL,254.770004,255.649994,253.449997,255.270004,40858800,250.247002,250.041378,73.865387,5.749986,0.008893,0.011413
2246,2024-12-24,AAPL,255.490005,258.209991,255.289993,258.200012,23234700,251.290002,251.524766,76.180663,6.053678,0.008828,0.003171
2247,2024-12-26,AAPL,258.190002,260.100006,257.630005,259.019989,27237100,252.543001,252.887534,76.812098,6.288037,0.008782,-0.013331


In [14]:
X = df_market[market_cols].values.astype(np.float32)
y = df_market["y_reg"].values.astype(np.float32)


In [19]:
dates = df_market["date"].values

X_seq, y_seq, d_seq = make_sequences_with_dates(
    X, y, dates, WINDOW
)

# same split
X_test, y_test, d_test = (
    X_seq[val_end:], 
    y_seq[val_end:], 
    d_seq[val_end:]
)


In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# sentiment columns
sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]

# index sentiment by date
df_sent_idx = df_sent_raw.copy()
df_sent_idx["date"] = pd.to_datetime(df_sent_idx["date"])
df_sent_idx = df_sent_idx.set_index("date")

# index market by date
df_market_idx = df_market.copy()
df_market_idx["date"] = pd.to_datetime(df_market_idx["date"])
df_market_idx = df_market_idx.set_index("date")


In [21]:
# training dates = before validation split
train_dates = d_seq[:val_end]

# testing dates = LSTM test dates
test_dates = d_test


In [22]:
# sentiment features
X_sent_train = (
    df_sent_idx
    .reindex(train_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

X_sent_test = (
    df_sent_idx
    .reindex(test_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

# targets (returns)
y_sent_train = (
    df_market_idx
    .reindex(train_dates)["y_reg"]
    .values
)

y_sent_test = (
    df_market_idx
    .reindex(test_dates)["y_reg"]
    .values
)


In [23]:
lr = LinearRegression()
lr.fit(X_sent_train, y_sent_train)

y_pred_sent = lr.predict(X_sent_test)

mse_sent_only = mean_squared_error(y_sent_test, y_pred_sent)

print("Sentiment-only Linear Test MSE:", mse_sent_only)


Sentiment-only Linear Test MSE: 0.00018013154049815617


In [24]:
df_sent_raw_lag = df_sent_raw.copy()
df_sent_raw_lag[sentiment_cols] = (
    df_sent_raw_lag
    .sort_values("date")[sentiment_cols]
    .shift(1)
)



In [25]:
# use lagged sentiment
df_sent_idx_lag = df_sent_raw_lag.copy()
df_sent_idx_lag["date"] = pd.to_datetime(df_sent_idx_lag["date"])
df_sent_idx_lag = df_sent_idx_lag.set_index("date")

# rebuild train/test sentiment matrices
X_sent_train = (
    df_sent_idx_lag
    .reindex(train_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

X_sent_test = (
    df_sent_idx_lag
    .reindex(test_dates)[sentiment_cols]
    .fillna(0.0)
    .values
)

# targets unchanged
y_sent_train = (
    df_market_idx
    .reindex(train_dates)["y_reg"]
    .values
)

y_sent_test = (
    df_market_idx
    .reindex(test_dates)["y_reg"]
    .values
)

# train + eval
lr_lag = LinearRegression()
lr_lag.fit(X_sent_train, y_sent_train)

y_pred_sent_lag = lr_lag.predict(X_sent_test)
mse_sent_lag = mean_squared_error(y_sent_test, y_pred_sent_lag)

print("Sentiment-only (lagged) Test MSE:", mse_sent_lag)


Sentiment-only (lagged) Test MSE: 0.000180121484933265


In [26]:
print(
    "market preds:", len(y_pred_market),
    "sentiment preds:", len(y_pred_sent),
    "true y:", len(y_test)
)


market preds: 324 sentiment preds: 324 true y: 324


In [17]:
# =========================================
# Market-Only LSTM (Baseline)
# =========================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# -----------------------
# CONFIG
# -----------------------
WINDOW = 90
EPOCHS = 10
BATCH_SIZE = 32
LR = 1e-3

device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

# -----------------------
# SORT DATA
# -----------------------
df = df_market.sort_values("date").reset_index(drop=True)

X = df[market_cols].values
y = df["y_reg"].values

# -----------------------
# SEQUENCE BUILDER
# -----------------------
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = make_sequences(X, y, WINDOW)

# -----------------------
# TRAIN / TEST SPLIT
# -----------------------
n = len(X_seq)
train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_test,  y_test  = X_seq[val_end:],  y_seq[val_end:]

train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

test_ds = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

# -----------------------
# MODEL
# -----------------------
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1]).squeeze()

model_market = LSTMRegressor(input_dim=len(market_cols)).to(device)
optimizer = torch.optim.Adam(model_market.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# -----------------------
# TRAIN
# -----------------------
for epoch in range(EPOCHS):
    model_market.train()
    total_loss = 0.0

    for xb, yb in tqdm(
        train_loader,
        desc=f"Epoch {epoch+1}/{EPOCHS} [Market-only]",
        leave=False
    ):
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model_market(xb), yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {total_loss / len(train_loader):.6f}")

# -----------------------
# EVALUATE
# -----------------------
model_market.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in tqdm(test_loader, desc="Evaluating Market-only", leave=False):
        xb = xb.to(device)
        preds.append(model_market(xb).cpu().numpy())
        trues.append(yb.numpy())

y_pred_market = np.concatenate(preds)
y_true_market = np.concatenate(trues)

mse_market = mean_squared_error(y_true_market, y_pred_market)

print("=================================")
print("Market-only LSTM Test MSE:", mse_market)


                                                                         

Epoch 1/10 | Train MSE: 0.000541


                                                                         

Epoch 2/10 | Train MSE: 0.000385


                                                                         

Epoch 3/10 | Train MSE: 0.000368


                                                                         

Epoch 4/10 | Train MSE: 0.000362


                                                                         

Epoch 5/10 | Train MSE: 0.000364


                                                                         

Epoch 6/10 | Train MSE: 0.000358


                                                                         

Epoch 7/10 | Train MSE: 0.000381


                                                                         

Epoch 8/10 | Train MSE: 0.000364


                                                                         

Epoch 9/10 | Train MSE: 0.000359


                                                                          

Epoch 10/10 | Train MSE: 0.000362


                                                                       

Market-only LSTM Test MSE: 0.00021118602307979017




In [27]:
# =========================================
# Market + Sentiment LSTM (Early Fusion)
# =========================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error

# -----------------------
# CONFIG
# -----------------------
WINDOW = 90
EPOCHS = 10
BATCH_SIZE = 32
LR = 1e-3

device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]

# -----------------------
# CHOOSE SENTIMENT SOURCE
# -----------------------
# Use ONE of these:
# df_sent = df_sent_faiss.copy()   # ← FAISS sentiment
df_sent = df_sent_raw_w.copy()   # ← RAW sentiment

# -----------------------
# MERGE MARKET + SENTIMENT
# -----------------------
df = df_market.merge(
    df_sent,
    on="date",
    how="left"
).sort_values("date").reset_index(drop=True)

# fill missing sentiment days with zeros
df[sentiment_cols] = df[sentiment_cols].fillna(0.0)

feature_cols = market_cols + sentiment_cols

X = df[feature_cols].values
y = df["y_reg"].values

# -----------------------
# SEQUENCE BUILDER
# -----------------------
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = make_sequences(X, y, WINDOW)

# -----------------------
# TRAIN / VAL / TEST SPLIT
# -----------------------
n = len(X_seq)
train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_test,  y_test  = X_seq[val_end:],  y_seq[val_end:]

train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

test_ds = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32)
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

# -----------------------
# LSTM MODEL
# -----------------------
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1]).squeeze()

model = LSTMRegressor(input_dim=len(feature_cols)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# -----------------------
# TRAIN
# -----------------------
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {total_loss / len(train_loader):.6f}")

# -----------------------
# EVALUATE
# -----------------------
model.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds.append(model(xb).cpu().numpy())
        trues.append(yb.numpy())

y_pred = np.concatenate(preds)
y_true = np.concatenate(trues)

mse_fused = mean_squared_error(y_true, y_pred)

print("=================================")
print("Market + Sentiment LSTM Test MSE:", mse_fused)


Epoch 1/10 | Train MSE: 0.000392
Epoch 2/10 | Train MSE: 0.000385
Epoch 3/10 | Train MSE: 0.000372
Epoch 4/10 | Train MSE: 0.000354
Epoch 5/10 | Train MSE: 0.000358
Epoch 6/10 | Train MSE: 0.000357
Epoch 7/10 | Train MSE: 0.000367
Epoch 8/10 | Train MSE: 0.000361
Epoch 9/10 | Train MSE: 0.000362
Epoch 10/10 | Train MSE: 0.000362
Market + Sentiment LSTM Test MSE: 0.0001814055722206831


In [50]:
# =========================================
# Market + Sentiment LSTM (Early Fusion) - Shiva's update
# =========================================

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# -----------------------
# CONFIG
# -----------------------
WINDOW = 90
EPOCHS = 10
BATCH_SIZE = 32
LR = 1e-3

device = "cuda" if torch.cuda.is_available() else "cpu"

market_cols = [
    "open", "high", "low", "close", "volume",
    "sma_10", "ema_10", "rsi_14", "macd", "vol_20"
]

sentiment_cols = [
    "polarity", "intensity", "relevance",
    "short_term", "long_term",
    "volatility", "novelty", "credibility"
]

# -----------------------
# CHOOSE SENTIMENT SOURCE
# -----------------------
# df_sent = df_sent_faiss.copy()
df_sent = df_sent_raw_w.copy()   # best for you so far

# -----------------------
# MERGE MARKET + SENTIMENT
# -----------------------
df = df_market.merge(
    df_sent,
    on="date",
    how="left"
).sort_values("date").reset_index(drop=True)

# fill missing sentiment days with zeros
df[sentiment_cols] = df[sentiment_cols].fillna(0.0)

feature_cols = market_cols + sentiment_cols

X = df[feature_cols].values
y = df["y_reg"].values


# -----------------------
# SEQUENCE BUILDER
# -----------------------
def make_sequences(X, y, window):
    Xs, ys = [], []
    for i in range(len(X) - window):
        Xs.append(X[i:i + window])
        ys.append(y[i + window])
    return np.array(Xs, dtype=np.float32), np.array(ys, dtype=np.float32)

X_seq, y_seq = make_sequences(X, y, WINDOW)

# -----------------------
# TRAIN / TEST SPLIT
# -----------------------
n = len(X_seq)
train_end = int(0.70 * n)
val_end   = int(0.85 * n)   # keep identical split logic

X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_test,  y_test  = X_seq[val_end:],  y_seq[val_end:]

# -----------------------
# Scaling
# -----------------------
scaler = StandardScaler()

B, T, F = X_train.shape
X_train_2d = X_train.reshape(B*T, F)
X_test_2d  = X_test.reshape(X_test.shape[0]*T, F)

X_train = scaler.fit_transform(X_train_2d).reshape(B, T, F)
X_test  = scaler.transform(X_test_2d).reshape(X_test.shape[0], T, F)

# -----------------------
# DataLoaders
# -----------------------
train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_ds  = TensorDataset(torch.tensor(X_test),  torch.tensor(y_test))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

# -----------------------
# LSTM MODEL
# -----------------------
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        out, _ = self.lstm(x)        # (B, T, H)
        last = out[:, -1, :]         # last timestep output
        return self.fc(last).squeeze(-1)

model = LSTMRegressor(input_dim=len(feature_cols)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# -----------------------
# TRAIN
# -----------------------
for epoch in range(EPOCHS):
    model.train()
    losses = []

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        losses.append(loss.item())

    print(f"Epoch {epoch+1}/{EPOCHS} | Train MSE: {np.mean(losses):.6f}")

# -----------------------
# EVALUATE
# -----------------------
model.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        preds.append(model(xb).cpu().numpy())
        trues.append(yb.numpy())

y_pred = np.concatenate(preds)
y_true = np.concatenate(trues)

mse_fused = mean_squared_error(y_true, y_pred)

print("=================================")
print("Market + Sentiment LSTM Test MSE:", mse_fused)


Epoch 1/10 | Train MSE: 0.001498
Epoch 2/10 | Train MSE: 0.000472
Epoch 3/10 | Train MSE: 0.000462
Epoch 4/10 | Train MSE: 0.000446
Epoch 5/10 | Train MSE: 0.000429
Epoch 6/10 | Train MSE: 0.000441
Epoch 7/10 | Train MSE: 0.000436
Epoch 8/10 | Train MSE: 0.000426
Epoch 9/10 | Train MSE: 0.000430
Epoch 10/10 | Train MSE: 0.000413
Market + Sentiment LSTM Test MSE: 0.00018012789951171726


In [40]:
display(df)

Unnamed: 0,date,ticker,open,high,low,close,volume,sma_10,ema_10,rsi_14,...,vol_20,y_reg,polarity,intensity,relevance,short_term,long_term,volatility,novelty,credibility
0,2016-01-22,AAPL,24.657499,25.365000,24.592501,25.355000,263202000,24.516500,24.698870,42.357054,...,0.025491,-0.019716,0.500,0.699999,0.949999,0.300,0.200,0.599999,0.549999,0.899999
1,2016-01-25,AAPL,25.379999,25.382500,24.802500,24.860001,207178000,24.578500,24.728166,39.328277,...,0.024904,0.005516,-0.200,0.549999,0.849999,-0.300,-0.150,0.649999,0.699999,0.949999
2,2016-01-26,AAPL,24.982500,25.219999,24.517500,24.997499,300308000,24.615000,24.777136,44.687492,...,0.024119,-0.067965,-0.550,0.750000,0.950000,-0.500,-0.325,0.650000,0.550000,0.925000
3,2016-01-27,AAPL,24.010000,24.157499,23.334999,23.355000,533478800,24.451500,24.518566,37.931026,...,0.028055,0.007146,0.150,0.650000,0.900000,-0.075,-0.025,0.625000,0.525000,0.925000
4,2016-01-28,AAPL,23.447500,23.629999,23.097500,23.522499,222715200,24.369000,24.337463,45.560566,...,0.027384,0.033958,-0.025,0.700000,0.925000,-0.025,0.000,0.600000,0.550000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2244,2024-12-20,AAPL,248.039993,255.000000,245.690002,254.490005,147495300,249.395001,248.879461,75.940140,...,0.008878,0.003060,0.000,0.000000,0.000000,0.000,0.000,0.000000,0.000000,0.000000
2245,2024-12-23,AAPL,254.770004,255.649994,253.449997,255.270004,40858800,250.247002,250.041378,73.865387,...,0.008893,0.011413,0.000,0.000000,0.000000,0.000,0.000,0.000000,0.000000,0.000000
2246,2024-12-24,AAPL,255.490005,258.209991,255.289993,258.200012,23234700,251.290002,251.524766,76.180663,...,0.008828,0.003171,0.000,0.000000,0.000000,0.000,0.000,0.000000,0.000000,0.000000
2247,2024-12-26,AAPL,258.190002,260.100006,257.630005,259.019989,27237100,252.543001,252.887534,76.812098,...,0.008782,-0.013331,0.000,0.000000,0.000000,0.000,0.000,0.000000,0.000000,0.000000


In [44]:
print(feature_cols)

['open', 'high', 'low', 'close', 'volume', 'sma_10', 'ema_10', 'rsi_14', 'macd', 'vol_20', 'polarity', 'intensity', 'relevance', 'short_term', 'long_term', 'volatility', 'novelty', 'credibility']


In [51]:
print(y_pred)

[0.00142076 0.00141831 0.00142216 0.00143077 0.0014392  0.00144262
 0.00143906 0.00143074 0.00142192 0.00141816 0.00142381 0.00143866
 0.00146138 0.00149086 0.00152299 0.00155284 0.0015746  0.00158476
 0.00158446 0.00157826 0.00156845 0.00155412 0.00153832 0.00152571
 0.00151778 0.00151246 0.00150447 0.00149275 0.00147999 0.00146645
 0.00144856 0.00142486 0.00139948 0.00137832 0.00136361 0.00135408
 0.00134779 0.00134429 0.00134325 0.00134491 0.00134924 0.00135334
 0.00135427 0.00134958 0.00134262 0.00133661 0.00133167 0.00132554
 0.00131535 0.00129827 0.00127525 0.00124752 0.00121622 0.00118189
 0.00114664 0.00111227 0.0010812  0.00105321 0.00102828 0.00100578
 0.00098531 0.0009649  0.00094392 0.00092234 0.00090196 0.00088104
 0.00086121 0.00084676 0.00083981 0.00083983 0.00084691 0.00086006
 0.00087665 0.0008945  0.00091069 0.00092226 0.00092991 0.00093458
 0.00093827 0.0009454  0.00096291 0.00099289 0.00103246 0.00107696
 0.00112044 0.00116006 0.00119231 0.00121655 0.00123383 0.0012

In [45]:
display(X_train[0])

array([[-1.0587894 , -1.047562  , -1.0568208 , ...,  0.80139726,
         0.5717702 ,  0.67822677],
       [-1.0426335 , -1.0471755 , -1.0520644 , ...,  0.9771232 ,
         1.0812593 ,  0.7899822 ],
       [-1.051522  , -1.0507643 , -1.0585195 , ...,  0.97712445,
         0.571771  ,  0.7341055 ],
       ...,
       [-1.0529196 , -1.0515926 , -1.0552919 , ..., -1.3073167 ,
        -1.2963566 , -1.3333716 ],
       [-1.0542612 , -1.053028  , -1.0518379 , ...,  0.44994482,
         1.4209186 ,  0.7899822 ],
       [-1.0533669 , -1.0534146 , -1.0542728 , ..., -1.3073167 ,
        -1.2963566 , -1.3333716 ]], shape=(90, 18), dtype=float32)

In [34]:
alpha = 0.5

y_pred_late = alpha * y_pred_market + (1 - alpha) * y_pred_sent

mse_late = mean_squared_error(y_test, y_pred_late)

print("Late Fusion Test MSE:", mse_late)


Late Fusion Test MSE: 0.00018709084511628487
