# Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader
import math
import numpy as np
import pandas as pd
import random
from collections import defaultdict

def set_seed(seed=1337):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

class TimeSeriesTransformer(nn.Module):
    def __init__(self, d_in, window, d_model=128, n_heads=4, n_layers=3, dropout=0.1):
        super().__init__()
        self.window = window
        self.proj = nn.Linear(d_in, d_model)
        self.pos  = nn.Parameter(torch.randn(1, window, d_model) * 0.01)
        enc = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads,
            dim_feedforward=4*d_model, dropout=dropout,
            batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc, num_layers=n_layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):                # x: (B, L, d_in)
        z = self.proj(x) + self.pos[:, :x.size(1)]
        z = self.encoder(z)              # (B, L, d_model)
        z = z[:, -1, :]                  # use last step
        return self.head(z).squeeze(-1)  # (B,)

# Download and Prepare Stock Data

In [2]:
import argparse
import sys
from pathlib import Path
import pandas as pd
import yfinance as yf
import numpy as np

In [3]:
def download_prices(tickers, start=None, end=None, interval="1d", adjust=False):
    """Download OHLCV data from Yahoo Finance."""
    data = yf.download(tickers=tickers, start=start, end=end, interval=interval,
                       auto_adjust=adjust, group_by='ticker', threads=True, progress=False)
    if data.empty:
        raise ValueError("No data returned. Check tickers/interval/date range.")
    return data

def save_prices(data: pd.DataFrame, tickers, out_path: Path, wide=False):
    """Save downloaded prices to CSV in wide or tidy format."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if isinstance(tickers, (list, tuple)) and len(tickers) > 1:
        if wide:
            data.to_csv(out_path)
        else:
            tidy = data.stack(level=0).reset_index()
            tidy = tidy.rename(columns={"level_1": "Ticker"})
            tidy.to_csv(out_path, index=False)
    else:
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = [' '.join(col).strip() for col in data.columns.values]
        data.to_csv(out_path)
    return out_path

def maybe_save_actions(tickers, start, end, out_dir: Path, which: str):
    """Save dividend or split history for given tickers."""
    for t in tickers:
        tk = yf.Ticker(t)
        if which == "dividends":
            df = tk.dividends
        else:
            df = tk.splits
        if df is None or df.empty:
            continue
        df = df.loc[(df.index >= (start or df.index.min())) & (df.index <= (end or df.index.max()))]
        file = out_dir / f"{t.lower()}_{which}.csv"
        df.to_csv(file, header=[which.capitalize()])
        print(f"Saved {which} for {t}: {file}")

def fetch_and_save(tickers, start=None, end=None, interval="1d", adjust=False,
                   dividends=False, splits=False, out="prices.csv", wide=False):
    """
    Fetch historical price data from Yahoo Finance and save to CSV.

    Parameters:
      tickers (list[str] or str): One or more ticker symbols.
      start, end (str or None): YYYY-MM-DD date range.
      interval (str): Data interval, e.g., '1d', '1wk', '1mo'.
      adjust (bool): Adjust OHLC for dividends/splits.
      dividends (bool): Save dividend history to separate CSVs.
      splits (bool): Save split history to separate CSVs.
      out (str): Output CSV path for price data.
      wide (bool): Save multi-ticker data in wide format (default False).
    """
    if isinstance(tickers, str):
        tickers = [tickers]
    out_path = Path(out)
    data = download_prices(tickers, start=start, end=end, interval=interval, adjust=adjust)
    saved_file = save_prices(data, tickers, out_path, wide=wide)
    print(f"Saved prices to: {saved_file.resolve()}")
    if dividends:
        maybe_save_actions(tickers, start, end, out_path.parent, "dividends")
    if splits:
        maybe_save_actions(tickers, start, end, out_path.parent, "splits")
    return saved_file

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
tickers = [
  "NVDA","MSFT","AAPL","AMZN","META","AVGO","GOOGL","GOOG","TSLA","ELV",
  "JPM","WMT","ORCL","V","LLY","MA","NFLX","XOM","COST","JNJ",
  "PLTR","HD","ABBV","PG","BAC","CVX","KO","AMD","TMUS","GE",
  "UNH","CSCO","PM","WFC","MS","CRM","ABT","LIN","IBM","GS",
  "MCD","AXP","MRK","DIS","RTX","T","PEP","INTU","UBER","CAT",
  "VZ","TMO","NOW","BA","BKNG","BLK","TXN","SCHW","C","ANET",
  "ISRG","SPGI","QCOM","GEV","AMGN","ACN","BSX","DHR","ADBE","NEE",
  "TJX","GILD","SYK","PGR","PFE","LOW","COF","HON","ETN","MU",
  "BX","APH","DE","UNP","AMAT","KKR","LRCX","CMCSA","ADP","COP",
  "MDT","PANW","ADI","KLAC","SNPS","NKE","MO","INTC","CB"
]

fetch_and_save(
    tickers=tickers,
    start="2019-01-01",
    end="2024-12-31",
    interval="1d",
    adjust=True,
    out="./drive/MyDrive/stock-data.csv"
)

  tidy = data.stack(level=0).reset_index()


Saved prices to: /content/drive/MyDrive/stock-data.csv


PosixPath('drive/MyDrive/stock-data.csv')

# Data Preprocessing

In [6]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

# ====== 1) Feature engineering per ticker ======
def add_features(g: pd.DataFrame) -> pd.DataFrame:
    """Add new features to the dataset like """
    g = g.sort_values("Date").copy()
    g["logp"]  = np.log(g["Close"])
    g["ret1"]  = g["logp"].diff()                       # next-day target base
    g["hl_pct"] = (g["High"] - g["Low"]) / g["Close"]   # intraday range
    g["vol_log"] = np.log(g["Volume"].replace(0, np.nan)).ffill()
    g["vol_chg"] = g["vol_log"].diff()

    # time (cyclical) encodings
    dow = g["Date"].dt.dayofweek
    month = g["Date"].dt.month
    g["dow_sin"]   = np.sin(2*np.pi * dow/7)
    g["dow_cos"]   = np.cos(2*np.pi * dow/7)
    g["mth_sin"]   = np.sin(2*np.pi * (month-1)/12)
    g["mth_cos"]   = np.cos(2*np.pi * (month-1)/12)

    # drop first row (diff) NaNs
    g = g.dropna().reset_index(drop=True)
    return g

# ====== 2) Split dates (pure temporal split) ======
def temporal_split(df, train_end, val_end):
    """
    Split the dataframe into three seperate dataframes (i.e train, val, and test) based on train_end and val_end times
    """
    train = df[df["Date"] <= pd.to_datetime(train_end)]
    val   = df[(df["Date"] > pd.to_datetime(train_end)) & (df["Date"] <= pd.to_datetime(val_end))]
    test  = df[df["Date"] > pd.to_datetime(val_end)]
    return train, val, test

# ====== 3) Per-ticker standardization using train stats only ======
def fit_scaler(train_df, feature_cols):
    """
    return the mean and standard deiviation of each of the given columns of the given df
    """
    stats = (train_df[feature_cols].mean(), train_df[feature_cols].std().replace(0,1.0))
    return stats

def apply_scaler(df, feature_cols, stats):
    """
    Standardise the given columns of the given df using the given mean and standard deviation which was retrieved from the training set
    """
    mu, sd = stats
    df = df.copy()
    df.loc[:, feature_cols] = (df[feature_cols] - mu) / sd
    return df

# ====== 4) Window maker ======
def make_windows(feat_df, feature_cols, target_col, window=64, horizon=1):
    """
    This creates a training and corresponding target arrays.

    Uses window number of rows to predict target_col of current timestamp plus horizon target.

    I.E, give window=64, target_col='ret1', and horizon=1, predict ret1 of t+1 using the t-64 rows.

    Returns arrays X:(N,L,D), y:(N,), where (N=number of windows, L=window length (horixon), D=number of features (len(feature_cols)))
    Predict target_col at t+horizon using features up to t (inclusive).
    """
    vals = feat_df[feature_cols].values
    target = feat_df[target_col].shift(-horizon).values  # predict next-day return
    # last 'horizon' targets become NaN; drop them
    valid_upto = len(feat_df) - horizon
    X, y = [], []
    for t in range(window, valid_upto):
        X.append(vals[t-window:t, :])
        y.append(target[t-1])  # target at current t (i.e., next-day)
    X = np.stack(X).astype(np.float32)
    y = np.array(y, dtype=np.float32)
    return X, y

# ====== 5) Dataset wrapper ======
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        return self.X[i], self.y[i]

# Build Dataset

In [7]:
def build_datasets_with_scalers(
    df_long,
    window=64,
    horizon=1,
    train_end="2022-12-31",
    val_end="2023-12-31",
    tickers=None,
    task="regression",
):
    df = df_long.copy()
    df["Date"] = pd.to_datetime(df["Date"])
    if tickers is not None:
        df = df[df["Ticker"].isin(tickers)]

    # add to the dataset new features
    df_feat = (
        df.groupby("Ticker", group_keys=False)
          .apply(add_features)
          .reset_index(drop=True)
    )

    feature_cols = ["ret1", "hl_pct", "vol_chg", "dow_sin", "dow_cos", "mth_sin", "mth_cos"]
    target_col = "ret1"

    # split data into train, validation, and test dfs based on datetime.
    # I.E, the beginning of the dataset to the train_end datetime will be the training set, that point to the val_end datetime will be the validation set. The rest will be the testing set
    train_long, val_long, test_long = temporal_split(df_feat, train_end, val_end)

    Xy = {"train": {"X": [], "y": []}, "val": {"X": [], "y": []}, "test": {"X": [], "y": []}}
    scalers = {}   # ticker -> (mu, sd)

    for ticker, g_train in train_long.groupby("Ticker"):
        g_val  = val_long[val_long["Ticker"] == ticker]
        g_test = test_long[test_long["Ticker"] == ticker]
        if len(g_train) < window + horizon + 1:
            continue

        # get the mean and standard deviation of the training set
        stats = fit_scaler(g_train, feature_cols)
        scalers[ticker] = stats

        # standardise all 3 dfs using the mean and standard deviation of the training set that we retrieved above
        g_train_s = apply_scaler(g_train, feature_cols, stats)
        g_val_s   = apply_scaler(g_val,   feature_cols, stats) if len(g_val)  else g_val
        g_test_s  = apply_scaler(g_test,  feature_cols, stats) if len(g_test) else g_test

        for name, g_s in [("train", g_train_s), ("val", g_val_s), ("test", g_test_s)]:
            if len(g_s) >= window + horizon + 1:
                X, y = make_windows(g_s, feature_cols, target_col, window, horizon)
                if task == "direction":
                    y = (y > 0).astype(np.float32)
                Xy[name]["X"].append(X); Xy[name]["y"].append(y)

    def _stack(split):
        if not Xy[split]["X"]:
            return None, None
        X = np.concatenate(Xy[split]["X"], axis=0).astype(np.float32)
        y = np.concatenate(Xy[split]["y"], axis=0).astype(np.float32)
        return X, y

    X_train, y_train = _stack("train")
    X_val,   y_val   = _stack("val")
    X_test,  y_test  = _stack("test")

    train_ds = TimeSeriesDataset(X_train, y_train) if X_train is not None else None
    val_ds   = TimeSeriesDataset(X_val,   y_val)   if X_val   is not None else None
    test_ds  = TimeSeriesDataset(X_test,  y_test)  if X_test  is not None else None

    meta = {
        "feature_cols": feature_cols,
        "d_in": len(feature_cols),
        "window": window,
        "horizon": horizon,
        "task": task,
        "tickers": sorted(train_long["Ticker"].unique().tolist()),
        "train_end": str(pd.to_datetime(train_end).date()),
        "val_end": str(pd.to_datetime(val_end).date()),
    }
    return train_ds, val_ds, test_ds, meta, scalers

# Training + validation + early stopping

In [8]:
def train_model(
    train_loader, val_loader, meta,
    d_model=128, n_heads=4, n_layers=3, dropout=0.1,
    epochs=50, lr=2e-3, weight_decay=1e-4, patience=5, device=None, seed=1337
):
    set_seed(seed)
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model = TimeSeriesTransformer(
        d_in=meta["d_in"], window=meta["window"],
        d_model=d_model, n_heads=n_heads, n_layers=n_layers, dropout=dropout
    ).to(device)

    criterion = nn.MSELoss() if meta["task"] == "regression" else nn.BCEWithLogitsLoss()
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2)

    best_val = float("inf")
    best_state = None
    wait = 0

    for epoch in range(1, epochs+1):
        # ---- train
        model.train()
        tr_loss, n_tr = 0.0, 0
        for xb, yb in train_loader:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            tr_loss += loss.item() * xb.size(0)
            n_tr += xb.size(0)
        tr_loss /= max(1, n_tr)

        # ---- validate
        model.eval()
        vl_loss, n_vl = 0.0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device); yb = yb.to(device)
                out = model(xb)
                loss = criterion(out, yb)
                vl_loss += loss.item() * xb.size(0)
                n_vl += xb.size(0)
        vl_loss /= max(1, n_vl)
        scheduler.step(vl_loss)

        print(f"epoch {epoch:03d} | train {tr_loss:.4f} | val {vl_loss:.4f} | lr {optimizer.param_groups[0]['lr']:.2e}")

        if vl_loss + 1e-8 < best_val:
            best_val = vl_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("early stopping.")
                break

    model.load_state_dict(best_state)
    model.to(device)
    return model

# Test evaluation

In [9]:
def evaluate(model, data_loader, meta, device=None):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for xb, yb in data_loader:
            xb = xb.to(device)
            out = model(xb).cpu().numpy()
            y_pred.append(out)
            y_true.append(yb.numpy())
    y_true = np.concatenate(y_true); y_pred = np.concatenate(y_pred)

    if meta["task"] == "regression":
        mse = np.mean((y_pred - y_true)**2)
        mae = np.mean(np.abs(y_pred - y_true))
        print(f"Test MSE: {mse:.6f} | MAE: {mae:.6f}")
        return {"mse": mse, "mae": mae, "y_true": y_true, "y_pred": y_pred}
    else:
        prob = 1/(1+np.exp(-y_pred))
        pred = (prob > 0.5).astype(np.float32)
        acc = (pred == y_true).mean()
        print(f"Test Accuracy: {acc:.4f}")
        return {"acc": acc, "y_true": y_true, "y_pred": y_pred, "prob": prob}

# Inference helper for a specific ticker

In [10]:
def latest_window_for_ticker(df_long, ticker, meta, scalers):
    """
    This rebuilds features for a ticker, applies the train-split scaler you used, and returns the last window to predict the next day.
    """
    df = df_long[df_long["Ticker"] == ticker].copy()
    if df.empty or ticker not in scalers:
        raise ValueError(f"No data or scaler for {ticker}")

    df["Date"] = pd.to_datetime(df["Date"])
    g = add_features(df)
    mu, sd = scalers[ticker]
    feat = g[meta["feature_cols"]].copy()
    feat = (feat - mu) / sd
    if len(feat) < meta["window"]:
        raise ValueError(f"Not enough history for window={meta['window']}")
    x = feat.values[-meta["window"]:, :].astype(np.float32)  # (L, D)
    x = torch.from_numpy(x).unsqueeze(0)  # (1, L, D)
    return x

# End-to-end usage

In [11]:
import pandas as pd
import numpy as np

df = pd.read_csv("./drive/MyDrive/stock-data.csv") # read data from csv file

# Build datasets/loaders
train_ds, val_ds, test_ds, meta, scalers = build_datasets_with_scalers(
    df,                       # your tidy df (Date, Ticker, OHLCV)
    window=64, horizon=1,
    train_end="2022-12-31",
    val_end="2023-12-31",
    tickers=None,             # or subset like ["AAPL","MSFT","NVDA"]
    task="regression"         # or "direction"
)


BATCH = 256
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False)

  .apply(add_features)


In [12]:
# Train
model = train_model(train_loader, val_loader, meta,
                    d_model=128, n_heads=4, n_layers=3, dropout=0.1,
                    epochs=50, lr=2e-3, patience=6)



epoch 001 | train 1.1440 | val 0.5770 | lr 2.00e-03
epoch 002 | train 0.9298 | val 0.5603 | lr 2.00e-03
epoch 003 | train 0.8813 | val 0.5617 | lr 2.00e-03
epoch 004 | train 0.8543 | val 0.5843 | lr 2.00e-03
epoch 005 | train 0.8327 | val 0.5562 | lr 2.00e-03
epoch 006 | train 0.8131 | val 0.5712 | lr 2.00e-03
epoch 007 | train 0.7876 | val 0.5729 | lr 2.00e-03
epoch 008 | train 0.7611 | val 0.5871 | lr 1.00e-03
epoch 009 | train 0.7048 | val 0.6239 | lr 1.00e-03
epoch 010 | train 0.6741 | val 0.5874 | lr 1.00e-03
epoch 011 | train 0.6499 | val 0.6670 | lr 5.00e-04
early stopping.


In [13]:
# Evaluate
_ = evaluate(model, test_loader, meta)

Test MSE: 0.661646 | MAE: 0.565058


In [None]:
# Inference for a ticker (predict next-day return)
device = "cuda" if torch.cuda.is_available() else "cpu"
x_latest = latest_window_for_ticker(df, "AAPL", meta, scalers).to(device)
with torch.no_grad():
    pred_ret = model(x_latest).item()
print("Predicted next day return for AAPL:", pred_ret)

Predicted next-day return for AAPL: 0.07556751370429993
