In [15]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

ROOT = Path("..").resolve()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.data.dataset import make_time_splits

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

In [16]:
PROJECT_ROOT = Path.cwd().resolve().parent  # adjust if needed

DATA_PATH = PROJECT_ROOT / "data" / "processed" / "daily_merged.parquet"
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH exists:", DATA_PATH.exists())


PROJECT_ROOT: C:\Users\KDP only\Documents\ANN_Final_Project\spy-ann
DATA_PATH exists: True


In [17]:
df = pd.read_parquet(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])

print("=== DATA PREVIEW ===")
display(df.head())
print("\nShape:", df.shape)
print("Date range:", df["date"].min(), "→", df["date"].max())

print("\nColumns:", df.columns.tolist())


=== DATA PREVIEW ===


Unnamed: 0,date,Close,High,Low,Open,Volume,ret_1d,log_ret_1d,ma_close_5,ma_close_20,vol_5,vol_20,future_price,future_ret_1d,label_up
0,2010-02-02,83.059364,83.217386,81.930636,82.216584,216327900,0.012104,0.012031,82.055548,84.347997,0.012653,0.010585,82.645493,-0.004983,0
1,2010-02-03,82.645493,83.134609,82.404697,82.683113,172730700,-0.004983,-0.004995,82.055548,84.205024,0.012873,0.010574,80.094604,-0.030865,0
2,2010-02-04,80.094604,82.04354,80.079552,82.005919,356715700,-0.030865,-0.031352,81.734995,83.931498,0.018783,0.012403,80.260124,0.002067,1
3,2010-02-05,80.260124,80.425666,78.694953,80.184871,493585800,0.002067,0.002064,81.625131,83.648186,0.018457,0.012344,79.68071,-0.007219,0
4,2010-02-08,79.68071,80.764291,79.62051,80.320322,224166900,-0.007219,-0.007245,81.148059,83.321606,0.015917,0.01227,80.681526,0.01256,1



Shape: (3753, 15)
Date range: 2010-02-02 00:00:00 → 2024-12-30 00:00:00

Columns: ['date', 'Close', 'High', 'Low', 'Open', 'Volume', 'ret_1d', 'log_ret_1d', 'ma_close_5', 'ma_close_20', 'vol_5', 'vol_20', 'future_price', 'future_ret_1d', 'label_up']


In [18]:
target_col = "future_ret_1d"

drop_cols = ["date", "future_price", "label_up", target_col]
feature_cols = [c for c in df.columns if c not in drop_cols]

print("\nFeature columns:", feature_cols)
print("Num features:", len(feature_cols))

X_all = df[feature_cols].values.astype("float32")
y_all = df[target_col].values.astype("float32")

print("X_all shape:", X_all.shape)
print("y_all shape:", y_all.shape)
print("y_all stats: mean={:.6f}, std={:.6f}".format(y_all.mean(), y_all.std()))



Feature columns: ['Close', 'High', 'Low', 'Open', 'Volume', 'ret_1d', 'log_ret_1d', 'ma_close_5', 'ma_close_20', 'vol_5', 'vol_20']
Num features: 11
X_all shape: (3753, 11)
y_all shape: (3753,)
y_all stats: mean=0.000576, std=0.010741


In [19]:
train_end = "2018-12-31"
val_end = "2021-12-31"

splits = make_time_splits(df, train_end=train_end, val_end=val_end)

train_idx = splits.train_idx
val_idx = splits.val_idx
test_idx = splits.test_idx

print("Train size:", len(train_idx))
print("Val size:", len(val_idx))
print("Test size:", len(test_idx))

X_train, X_val, X_test = X_all[train_idx], X_all[val_idx], X_all[test_idx]
y_train, y_val, y_test = y_all[train_idx], y_all[val_idx], y_all[test_idx]


Train size: 2244
Val size: 757
Test size: 752


In [20]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

X_train_s.shape, X_val_s.shape, X_test_s.shape


((2244, 11), (757, 11), (752, 11))

In [21]:
class MLPRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dims=(64, 32)):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.ReLU())
            prev = h
        layers.append(nn.Linear(prev, 1))  # scalar output
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(-1)  # (batch,)
    
    
    def train_mlp_reg(
        Xtr, ytr,
        Xval, yval,
        Xte, yte,
        hidden_dims=(64, 32),
        num_epochs=50,
        batch_size=64,
        lr=1e-3,
        weight_decay=0.0,
        verbose=True,
    ):
        input_dim = Xtr.shape[1]
        model = MLPRegressor(input_dim, hidden_dims).to(device)

        train_ds = TensorDataset(
            torch.from_numpy(Xtr).float(),
            torch.from_numpy(ytr).float()
        )
        val_ds = TensorDataset(
            torch.from_numpy(Xval).float(),
            torch.from_numpy(yval).float()
        )
        test_ds = TensorDataset(
            torch.from_numpy(Xte).float(),
            torch.from_numpy(yte).float()
        )

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
        test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

        criterion = nn.MSELoss()
        optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        best_val_loss = float("inf")
        best_state = None

        for epoch in range(1, num_epochs + 1):
            # ----- train -----
            model.train()
            total_loss = 0.0
            n = 0
            for xb, yb in train_loader:
                xb = xb.to(device)
                yb = yb.to(device)

                optim.zero_grad()
                preds = model(xb)
                loss = criterion(preds, yb)
                loss.backward()
                optim.step()

                total_loss += loss.item() * yb.size(0)
                n += yb.size(0)
            train_loss = total_loss / n

            # ----- val -----
            model.eval()
            total_loss_val = 0.0
            n_val = 0
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb = xb.to(device)
                    yb = yb.to(device)
                    preds = model(xb)
                    loss = criterion(preds, yb)
                    total_loss_val += loss.item() * yb.size(0)
                    n_val += yb.size(0)
            val_loss = total_loss_val / n_val

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_state = model.state_dict()

            if verbose and (epoch % 10 == 0 or epoch == 1):
                print(f"Epoch {epoch:02d} | train_loss={train_loss:.6f} | val_loss={val_loss:.6f}")

        if best_state is not None:
            model.load_state_dict(best_state)

        # ----- test metrics -----
        model.eval()
        preds_list = []
        y_list = []
        with torch.no_grad():
            for xb, yb in test_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                preds = model(xb)
                preds_list.append(preds.cpu().numpy())
                y_list.append(yb.cpu().numpy())

        y_true = np.concatenate(y_list)
        y_pred = np.concatenate(preds_list)

        mse = np.mean((y_pred - y_true) ** 2)
        mae = np.mean(np.abs(y_pred - y_true))
        corr = pearsonr(y_true, y_pred)[0] if y_true.std() > 0 and y_pred.std() > 0 else np.nan

        # directional accuracy
        dir_true = (y_true > 0).astype(int)
        dir_pred = (y_pred > 0).astype(int)
        dir_acc = (dir_true == dir_pred).mean()

        print("\n=== MLP Regression Test Metrics ===")
        print(f"MSE: {mse:.8f}")
        print(f"MAE: {mae:.8f}")
        print(f"Corr(y_true, y_pred): {corr:.4f}")
        print(f"Directional accuracy (sign(pred) vs sign(true)): {dir_acc:.4f}")
        print(f"Mean(true): {y_true.mean():.6f}, Mean(pred): {y_pred.mean():.6f}")

        return model, (mse, mae, corr, dir_acc)
    
    mlp_model, mlp_metrics = train_mlp_reg(
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
    hidden_dims=(64, 32),
    num_epochs=50,
    batch_size=64,
    lr=1e-3,
    weight_decay=1e-4,
    verbose=True,
    )
    mlp_metrics



Epoch 01 | train_loss=0.001006 | val_loss=0.000537
Epoch 10 | train_loss=0.000094 | val_loss=0.000305
Epoch 20 | train_loss=0.000088 | val_loss=0.000481
Epoch 30 | train_loss=0.000091 | val_loss=0.000252
Epoch 40 | train_loss=0.000086 | val_loss=0.000192
Epoch 50 | train_loss=0.000087 | val_loss=0.000186

=== MLP Regression Test Metrics ===
MSE: 0.00013306
MAE: 0.00885392
Corr(y_true, y_pred): -0.0371
Directional accuracy (sign(pred) vs sign(true)): 0.4681
Mean(true): 0.000391, Mean(pred): -0.002280


In [22]:
def build_windows(X, y, window_size):
    X_seq, y_seq = [], []
    T = len(X)
    for t in range(T - window_size):
        X_seq.append(X[t:t+window_size])
        y_seq.append(y[t+window_size])
    return np.stack(X_seq), np.array(y_seq, dtype="float32")


In [24]:
class SeqDataset(Dataset):
    def __init__(self, X_seq, y_seq):
        self.X = torch.from_numpy(X_seq).float()
        self.y = torch.from_numpy(y_seq).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class GRURegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: (batch, seq_len, input_dim)
        out, h_n = self.gru(x)
        last_hidden = h_n[-1]  # (batch, hidden_dim)
        out = self.fc(last_hidden)
        return out.squeeze(-1)
    
def train_gru_reg(
    Xtr, ytr,
    Xval, yval,
    Xte, yte,
    window_size=30,
    hidden_dim=32,
    num_layers=1,
    num_epochs=30,
    batch_size=64,
    lr=1e-3,
    weight_decay=1e-4,
    verbose=True,
):
# build windows separately for train/val/test
    Xtr_seq, ytr_seq = build_windows(Xtr, ytr, window_size)
    Xval_seq, yval_seq = build_windows(Xval, yval, window_size)
    Xte_seq, yte_seq = build_windows(Xte, yte, window_size)

    print(f"Train windows: {Xtr_seq.shape[0]}, Val: {Xval_seq.shape[0]}, Test: {Xte_seq.shape[0]}")

    train_ds = SeqDataset(Xtr_seq, ytr_seq)
    val_ds   = SeqDataset(Xval_seq, yval_seq)
    test_ds  = SeqDataset(Xte_seq, yte_seq)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    input_dim = Xtr.shape[1]
    model = GRURegressor(input_dim, hidden_dim=hidden_dim, num_layers=num_layers).to(device)

    criterion = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_loss = float("inf")
    best_state = None

    for epoch in range(1, num_epochs + 1):
        # ---- train ----
        model.train()
        tot_loss = 0.0
        n = 0
        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            optim.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optim.step()

            tot_loss += loss.item() * yb.size(0)
            n += yb.size(0)
        train_loss = tot_loss / n

        # ---- val ----
        model.eval()
        tot_val = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                preds = model(xb)
                loss = criterion(preds, yb)
                tot_val += loss.item() * yb.size(0)
                n_val += yb.size(0)
        val_loss = tot_val / n_val

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()

        if verbose and (epoch % 5 == 0 or epoch == 1):
            print(f"Epoch {epoch:02d} | train_loss={train_loss:.6f} | val_loss={val_loss:.6f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    # ---- test metrics ----
    model.eval()
    preds_list, y_list = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            preds_list.append(preds.cpu().numpy())
            y_list.append(yb.cpu().numpy())

    y_true = np.concatenate(y_list)
    y_pred = np.concatenate(preds_list)

    mse = np.mean((y_pred - y_true) ** 2)
    mae = np.mean(np.abs(y_pred - y_true))
    corr = pearsonr(y_true, y_pred)[0] if y_true.std() > 0 and y_pred.std() > 0 else np.nan

    dir_true = (y_true > 0).astype(int)
    dir_pred = (y_pred > 0).astype(int)
    dir_acc = (dir_true == dir_pred).mean()

    print("\n=== GRU Regression Test Metrics ===")
    print(f"MSE: {mse:.8f}")
    print(f"MAE: {mae:.8f}")
    print(f"Corr(y_true, y_pred): {corr:.4f}")
    print(f"Directional accuracy (sign(pred) vs sign(true)): {dir_acc:.4f}")
    print(f"Mean(true): {y_true.mean():.6f}, Mean(pred): {y_pred.mean():.6f}")

    return model, (mse, mae, corr, dir_acc)

gru_model, gru_metrics = train_gru_reg(
    X_train_s, y_train,
    X_val_s, y_val,
    X_test_s, y_test,
    window_size=30,
    hidden_dim=32,
    num_layers=1,
    num_epochs=30,
    batch_size=64,
    lr=1e-3,
    weight_decay=1e-4,
    verbose=True,
)
gru_metrics



Train windows: 2214, Val: 727, Test: 722
Epoch 01 | train_loss=0.009207 | val_loss=0.003508
Epoch 05 | train_loss=0.000130 | val_loss=0.000802
Epoch 10 | train_loss=0.000104 | val_loss=0.000933
Epoch 15 | train_loss=0.000099 | val_loss=0.000982
Epoch 20 | train_loss=0.000093 | val_loss=0.000694
Epoch 25 | train_loss=0.000092 | val_loss=0.000541
Epoch 30 | train_loss=0.000096 | val_loss=0.000508

=== GRU Regression Test Metrics ===
MSE: 0.00232785
MAE: 0.03998495
Corr(y_true, y_pred): 0.0024
Directional accuracy (sign(pred) vs sign(true)): 0.5526
Mean(true): 0.000499, Mean(pred): 0.034328


(np.float32(0.0023278547),
 np.float32(0.039984953),
 np.float32(0.0024249603),
 np.float64(0.5526315789473685))