In [21]:
import os
import math
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [22]:
BASE_DIR = Path.cwd().parent   
DATA_PATH = BASE_DIR / "dataset" / "card_usage" / "card_subway_with_timeseries_features.csv"
OUT_DIR = BASE_DIR / "ml_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

DATE_COL = "date"
TARGET_COL = "total_flow"

SEQ_LEN = 14
BATCH_SIZE = 512
EPOCHS = 20
LR = 1e-3
DEVICE = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Device: mps


In [None]:
df = pd.read_csv(DATA_PATH, parse_dates=[DATE_COL], low_memory=False)

# Time split 
split_date = df[DATE_COL].quantile(0.8)
train_df = df[df[DATE_COL] <= split_date].copy()
val_df   = df[df[DATE_COL] >  split_date].copy()

print("Train date range:", train_df[DATE_COL].min(), "->", train_df[DATE_COL].max())
print("Val date range  :", val_df[DATE_COL].min(), "->", val_df[DATE_COL].max())

# keep only stations seen in training
train_stations = set(train_df["station_key"].unique())
val_stations = set(val_df["station_key"].unique())
unseen = sorted(list(val_stations - train_stations))
print(f"Val stations total: {len(val_stations)} | unseen in train: {len(unseen)}")

val_df = val_df[val_df["station_key"].isin(train_stations)].copy()
print("Val rows after filtering unseen stations:", len(val_df))

Train date range: 2015-01-01 00:00:00 -> 2023-10-20 00:00:00
Val date range  : 2023-10-21 00:00:00 -> 2025-11-30 00:00:00
Val stations total: 653 | unseen in train: 11
Val rows after filtering unseen stations: 616355


In [24]:

# Per-station normalisation 
# Compute mu/sigma on train dataset only
stats = train_df.groupby("station_key")[TARGET_COL].agg(["mean", "std"]).reset_index()
stats["std"] = stats["std"].replace(0, np.nan)  # avoid div by zero
stats = stats.fillna({"std": 1.0})

train_df = train_df.merge(stats, on="station_key", how="left")
val_df   = val_df.merge(stats, on="station_key", how="left")

# Normalize target for stable training
train_df["flow_norm"] = (train_df[TARGET_COL] - train_df["mean"]) / train_df["std"]
val_df["flow_norm"]   = (val_df[TARGET_COL]   - val_df["mean"])   / val_df["std"]

# Fill any missing station stats in val (rare if val contains new stations)
val_df["mean"] = val_df["mean"].fillna(val_df[TARGET_COL].mean())
val_df["std"]  = val_df["std"].fillna(val_df[TARGET_COL].std() if val_df[TARGET_COL].std() > 0 else 1.0)
val_df["flow_norm"] = (val_df[TARGET_COL] - val_df["mean"]) / val_df["std"]

In [25]:
# Build sequences per station
# Dynamically select only features that exist in the DataFrame
possible_features = [
    "flow_norm", "is_weekend", "day_of_week_num", "day_of_month", "week_of_year",
    "flow_lag_1", "flow_lag_7", "flow_lag_14", "flow_roll_mean_7", "flow_roll_mean_14",
    "flow_roll_std_7", "flow_roll_std_14", "flow_ratio", "flow_diff"
]
if "latitude" in df.columns and "longitude" in df.columns:
    possible_features += ["latitude", "longitude"]

FEATURE_COLS = [col for col in possible_features if col in df.columns]
print("Using features:", FEATURE_COLS)

class SubwaySeqDataset(Dataset):
    """
    Each item:
      X: (SEQ_LEN, F) features for days t-SEQ_LEN+1..t
      y: scalar flow_norm at day t+1
      meta: station_key, date_of_y, mean, std (for denorm)
    """
    def __init__(self, frame: pd.DataFrame, seq_len: int, feature_cols: list[str]):
        self.seq_len = seq_len
        self.feature_cols = feature_cols

        self.samples = []
        for st, g in frame.groupby("station_key"):
            g = g.sort_values(DATE_COL).reset_index(drop=True)
            if len(g) < seq_len + 1:
                continue

            feats = g[feature_cols].to_numpy(dtype=np.float32)
            y = g["flow_norm"].to_numpy(dtype=np.float32)
            mu = g["mean"].to_numpy(dtype=np.float32)
            sd = g["std"].to_numpy(dtype=np.float32)

            # keep date as plain string (safe for collate)
            dates = g[DATE_COL].dt.strftime("%Y-%m-%d").astype(str).to_numpy()

            for t in range(seq_len - 1, len(g) - 1):
                X = feats[t - (seq_len - 1): t + 1]
                target = y[t + 1]
                meta = {
                    "station_key": str(st),
                    "date_y": dates[t + 1],
                    "mean": float(mu[t + 1]),
                    "std": float(sd[t + 1]),
                }
                self.samples.append((X, target, meta))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        X, y, meta = self.samples[idx]
        return torch.from_numpy(X), torch.tensor(y, dtype=torch.float32), meta


train_ds = SubwaySeqDataset(train_df, SEQ_LEN, FEATURE_COLS)
val_ds   = SubwaySeqDataset(val_df,   SEQ_LEN, FEATURE_COLS)

print("Train samples:", len(train_ds))
print("Val samples  :", len(val_ds))

def custom_collate(batch):
    Xs, ys, metas = zip(*batch)
    return torch.stack(Xs), torch.stack(ys), list(metas)

# smaller batch helps avoid "predict-mean" collapse
BATCH_SIZE = 128

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=custom_collate)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)


Using features: ['is_weekend', 'day_of_week_num', 'day_of_month', 'week_of_year', 'flow_lag_1', 'flow_lag_7', 'flow_lag_14', 'flow_roll_mean_7', 'flow_roll_mean_14', 'flow_roll_std_7', 'flow_roll_std_14', 'flow_ratio', 'flow_diff', 'latitude', 'longitude']
Train samples: 2477991
Val samples  : 607448


In [None]:
# Model: LSTM
class LSTMRegressor(nn.Module):
    def __init__(self, input_size: int, hidden_size: int = 128, num_layers: int = 2, dropout: float = 0.0):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.head = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.head(last).squeeze(-1)


LR = 3e-4
model = LSTMRegressor(input_size=len(FEATURE_COLS)).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.MSELoss()

def eval_loader(loader):
    model.eval()
    preds_norm, trues_norm, metas = [], [], []
    with torch.no_grad():
        for X, y, meta in loader:
            X = X.to(DEVICE, dtype=torch.float32)
            y = y.to(DEVICE, dtype=torch.float32)

            yhat = model(X)

            preds_norm.append(yhat.detach().cpu().numpy())
            trues_norm.append(y.detach().cpu().numpy())

        
            if isinstance(meta, dict):
                metas.append(meta)
            else:
                metas.extend(list(meta))

    preds_norm = np.concatenate(preds_norm)
    trues_norm = np.concatenate(trues_norm)

    mu = np.array([m["mean"] for m in metas], dtype=np.float32)
    sd = np.array([m["std"] for m in metas], dtype=np.float32)

    preds = preds_norm * sd + mu
    trues = trues_norm * sd + mu

    mae = float(np.mean(np.abs(preds - trues)))
    rmse = float(math.sqrt(np.mean((preds - trues) ** 2)))
    return mae, rmse, preds, trues, metas

# Sanity check: baseline "predict tomorrow = today"
def eval_persistence(loader):
    preds_norm, trues_norm, metas = [], [], []
    for X, y, meta in loader:
        pred = X[:, -1, 0]  # last timestep's flow_norm
        preds_norm.append(pred.numpy())
        trues_norm.append(y.numpy())
        metas.extend(meta)

    preds_norm = np.concatenate(preds_norm)
    trues_norm = np.concatenate(trues_norm)

    mu = np.array([m["mean"] for m in metas], dtype=np.float32)
    sd = np.array([m["std"] for m in metas], dtype=np.float32)
    preds = preds_norm * sd + mu
    trues = trues_norm * sd + mu

    mae = float(np.mean(np.abs(preds - trues)))
    rmse = float(math.sqrt(np.mean((preds - trues) ** 2)))
    return mae, rmse

print("Persistence baseline (val):", eval_persistence(val_loader))

history = {"train_loss": [], "val_mae": [], "val_rmse": []}

for epoch in range(1, EPOCHS + 1):
    model.train()
    running = 0.0

    for X, y, _ in train_loader:
        X = X.to(DEVICE, dtype=torch.float32)
        y = y.to(DEVICE, dtype=torch.float32)

        optimizer.zero_grad()
        yhat = model(X)
        loss = criterion(yhat, y)
        loss.backward()

        # helps stabilize training / avoid collapse
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        running += float(loss.item())

    train_loss = running / max(1, len(train_loader))
    val_mae, val_rmse, _, _, _ = eval_loader(val_loader)

    history["train_loss"].append(train_loss)
    history["val_mae"].append(val_mae)
    history["val_rmse"].append(val_rmse)

    print(f"Epoch {epoch}/{EPOCHS} | train_loss={train_loss:.5f} | val_MAE={val_mae:,.2f} | val_RMSE={val_rmse:,.2f}")

val_mae, val_rmse, preds, trues, metas = eval_loader(val_loader)
print("\nFinal Validation")
print(f"MAE  : {val_mae:,.2f}")
print(f"RMSE : {val_rmse:,.2f}")


Persistence baseline (val): (10885.0859375, 19490.745291034924)


In [None]:
# Training curve
plt.figure()
plt.plot(history["train_loss"], label="train MSE loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("LSTM Training Loss")
plt.legend()
plt.tight_layout()
plt.savefig(OUT_DIR / "lstm_training_loss.png", dpi=200)
plt.close()

# Scatterplot: predicted vs actual
plt.figure()
# sample to make plot lighter if huge
idx = np.random.choice(len(preds), size=min(30000, len(preds)), replace=False)
plt.scatter(trues[idx], preds[idx], s=3)
minv = min(trues[idx].min(), preds[idx].min())
maxv = max(trues[idx].max(), preds[idx].max())
plt.plot([minv, maxv], [minv, maxv])
plt.xlabel("Actual next-day total_flow")
plt.ylabel("Predicted next-day total_flow")
plt.title("Predicted vs Actual (Validation)")
plt.tight_layout()
plt.savefig(OUT_DIR / "lstm_pred_vs_actual_scatter.png", dpi=200)
plt.close()

# Time series plot for one station (top-busy station in val)
meta_df = pd.DataFrame({
    "station_key": [m["station_key"] for m in metas],
    "date_y": pd.to_datetime([m["date_y"] for m in metas]),
    "pred": preds,
    "true": trues,
})
# pick station with most rows
pick_station = meta_df["station_key"].value_counts().index[0]
st_df = meta_df[meta_df["station_key"] == pick_station].sort_values("date_y")

plt.figure(figsize=(10, 4))
plt.plot(st_df["date_y"], st_df["true"], label="Actual")
plt.plot(st_df["date_y"], st_df["pred"], label="Predicted")
plt.xlabel("Date")
plt.ylabel("Total Flow")
plt.title(f"Next-day Forecast for Station {pick_station}")
plt.legend()
plt.tight_layout()
plt.savefig(OUT_DIR / "lstm_station_timeseries.png", dpi=200)
plt.close()

# Save a small CSV of predictions for reporting
meta_df.to_csv(OUT_DIR / "lstm_val_predictions.csv", index=False, encoding="utf-8-sig")

print("\nSaved outputs to:", OUT_DIR)
print("- lstm_training_loss.png")
print("- lstm_pred_vs_actual_scatter.png")
print("- lstm_station_timeseries.png")
print("- lstm_val_predictions.csv")



Saved outputs to: /Users/melvinang/Documents/NUS/Y4 Winter/Team7/IEE3593_Team7/ml_outputs
- lstm_training_loss.png
- lstm_pred_vs_actual_scatter.png
- lstm_station_timeseries.png
- lstm_val_predictions.csv
