In [1]:
"""
QML Model — Level 2: Swaption Surface Missing Cell Reconstruction
Architecture: Quantum Masked Autoencoder with MerLin CircuitBuilder

Problem (corrected understanding):
    The test set has ARBITRARY individual cells missing — not full maturity rows.
    Any (tenor, maturity) combination can be missing.
    e.g. "Tenor:5; Maturity:0.083", "Tenor:15; Maturity:0.25", "Tenor:10; Maturity:0.5"

    The training data is fully observed — we simulate missing cells by randomly
    masking cells during training (masked autoencoder approach).

Pipeline:
    1. Load Level 2 training data (fully observed, 489 rows × 224 cells)
    2. Each training step: randomly mask M cells → set to 0
    3. Input  = masked surface (224) + binary mask (224) → 448 features
    4. PCA compress 448 → 16 + re-scale to [0,1]
    5. Quantum circuit (CircuitBuilder, 16 modes, 4 photons)
    6. LexGrouping → 32 features
    7. Classical readout → 224 (full surface reconstruction)
    8. Loss computed ONLY on the masked cells (so model learns to impute)
    9. At test time: fill NaN cells with 0, build mask, run model,
       use predictions for the NaN positions only

Key insight:
    The binary mask is essential — it tells the quantum circuit WHICH cells
    are observed vs missing, preventing the model from confusing "zero volatility"
    with "missing value".
"""


'\nQML Model — Level 2: Swaption Surface Missing Cell Reconstruction\nArchitecture: Quantum Masked Autoencoder with MerLin CircuitBuilder\n\nProblem (corrected understanding):\n    The test set has ARBITRARY individual cells missing — not full maturity rows.\n    Any (tenor, maturity) combination can be missing.\n    e.g. "Tenor:5; Maturity:0.083", "Tenor:15; Maturity:0.25", "Tenor:10; Maturity:0.5"\n\n    The training data is fully observed — we simulate missing cells by randomly\n    masking cells during training (masked autoencoder approach).\n\nPipeline:\n    1. Load Level 2 training data (fully observed, 489 rows × 224 cells)\n    2. Each training step: randomly mask M cells → set to 0\n    3. Input  = masked surface (224) + binary mask (224) → 448 features\n    4. PCA compress 448 → 16 + re-scale to [0,1]\n    5. Quantum circuit (CircuitBuilder, 16 modes, 4 photons)\n    6. LexGrouping → 32 features\n    7. Classical readout → 224 (full surface reconstruction)\n    8. Loss comput

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from datasets import load_dataset

import merlin as ML
from merlin import LexGrouping, MeasurementStrategy, ComputationSpace
from merlin.builder import CircuitBuilder


In [3]:
N_PCA_COMPONENTS  = 16    # PCA: 448 (surface + mask) → 16
N_MODES           = 16    # Quantum circuit modes (≤ 20 QPU hard limit)
N_PHOTONS         = 4
N_GROUPED_OUTPUTS = 32    # LexGrouping output
MASK_RATIO        = 0.15  # Randomly mask 15% of cells per training sample
                          # (~34 cells out of 224) — simulates realistic missing patterns
TRAIN_SPLIT       = 0.85
EPOCHS            = 100
LR                = 5e-4
BATCH_SIZE        = 16
DEVICE            = torch.device("cpu")



In [4]:
# ─────────────────────────────────────────────
# 1. LOAD DATA
# ─────────────────────────────────────────────

print("Loading Level 2 dataset...")
ds = load_dataset(
    "Quandela/Challenge_Swaptions",
    data_files="level-2_Missing_data_prediction/train_level2.csv",
    split="train",
)
df = ds.to_pandas()
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df = df.sort_values("Date").reset_index(drop=True)

feature_cols = [c for c in df.columns if c != "Date"]
raw_data = df[feature_cols].values.astype(np.float32)  # (489, 224)

print(f"Raw data shape: {raw_data.shape}")
print(f"Feature columns: {len(feature_cols)}")


Loading Level 2 dataset...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Raw data shape: (489, 224)
Feature columns: 224


In [5]:
# ─────────────────────────────────────────────
# 2. SCALE THE SURFACE
# ─────────────────────────────────────────────

# MinMaxScaler on the surface values → [0, 1]
# Important: masked cells will be set to 0, which is a valid value in this range
# so the mask channel is essential to distinguish "truly zero" from "missing"
surface_scaler = MinMaxScaler()
data_scaled = surface_scaler.fit_transform(raw_data).astype(np.float32)  # (489, 224)

print(f"Surface scaled to [0,1]")

# ─────────────────────────────────────────────
# 3. BUILD MASKED TRAINING PAIRS
# ─────────────────────────────────────────────
#
# For each row we create: (masked_surface + mask, full_surface)
# The mask is a binary vector: 1 = observed, 0 = missing
# We stack [masked_surface | mask] → 448-dim input
#
# We pre-generate all masked samples here (one mask per row).
# At each epoch we could re-generate for more variety, but pre-generation
# is simpler and still effective for 489 rows.

np.random.seed(42)
n_samples  = len(data_scaled)
n_features = data_scaled.shape[1]  # 224

masked_inputs = np.zeros((n_samples, n_features * 2), dtype=np.float32)
full_targets  = data_scaled.copy()

mask_indices_list = []  # store which cells were masked per row (for loss computation)

for i in range(n_samples):
    row = data_scaled[i].copy()

    # Randomly select cells to mask
    n_mask = max(1, int(n_features * MASK_RATIO))
    mask_idx = np.random.choice(n_features, n_mask, replace=False)
    mask_indices_list.append(mask_idx)

    # Binary mask: 1=observed, 0=missing
    mask = np.ones(n_features, dtype=np.float32)
    mask[mask_idx] = 0.0

    # Zero out the masked cells
    masked_row = row.copy()
    masked_row[mask_idx] = 0.0

    # Concatenate: [masked_surface (224) | binary_mask (224)] → (448,)
    masked_inputs[i] = np.concatenate([masked_row, mask])

print(f"\nMasked input shape : {masked_inputs.shape}  (224 surface + 224 mask)")
print(f"Target shape       : {full_targets.shape}")
print(f"Avg cells masked   : {int(n_features * MASK_RATIO)} / {n_features} "
      f"({MASK_RATIO*100:.0f}%)")

# ─────────────────────────────────────────────
# 4. PCA ON THE MASKED INPUTS
# ─────────────────────────────────────────────
# PCA on 448 → 16 to fit quantum mode limit

pca = PCA(n_components=N_PCA_COMPONENTS)
X_pca = pca.fit_transform(masked_inputs).astype(np.float32)   # (489, 16)

# Re-scale PCA outputs to [0,1] for angle encoding
pca_scaler = MinMaxScaler()
X_pca = pca_scaler.fit_transform(X_pca).astype(np.float32)

print(f"\nPCA explained variance: {pca.explained_variance_ratio_.sum()*100:.1f}%")
print(f"X after PCA: {X_pca.shape}")

# ─────────────────────────────────────────────
# 5. TRAIN / VAL SPLIT  (chronological)
# ─────────────────────────────────────────────

split   = int(n_samples * TRAIN_SPLIT)

X_train = torch.tensor(X_pca[:split],        device=DEVICE)
Y_train = torch.tensor(full_targets[:split],  device=DEVICE)
X_val   = torch.tensor(X_pca[split:],         device=DEVICE)
Y_val   = torch.tensor(full_targets[split:],  device=DEVICE)

# Store mask indices for computing loss only on masked cells
mask_train = mask_indices_list[:split]
mask_val   = mask_indices_list[split:]

train_loader = DataLoader(
    TensorDataset(X_train, Y_train, torch.arange(split)),
    batch_size=BATCH_SIZE, shuffle=False
)

print(f"\nTrain: {len(X_train)} samples | Val: {len(X_val)} samples")


Surface scaled to [0,1]

Masked input shape : (489, 448)  (224 surface + 224 mask)
Target shape       : (489, 224)
Avg cells masked   : 33 / 224 (15%)

PCA explained variance: 29.0%
X after PCA: (489, 16)

Train: 415 samples | Val: 74 samples


In [6]:
# ─────────────────────────────────────────────
# 6. BUILD THE QUANTUM CIRCUIT
# ─────────────────────────────────────────────

builder = CircuitBuilder(n_modes=N_MODES)
builder.add_entangling_layer(trainable=True, name="U1")
builder.add_angle_encoding(
    modes=list(range(N_MODES)),
    name="input",
    scale=np.pi,
)
builder.add_rotations(trainable=True, name="theta")
builder.add_superpositions(depth=2, trainable=True)

quantum_core = ML.QuantumLayer(
    input_size=N_MODES,
    builder=builder,
    n_photons=N_PHOTONS,
    measurement_strategy=MeasurementStrategy.probs(ComputationSpace.UNBUNCHED),
)

print(f"\nQuantum layer Fock output size : {quantum_core.output_size}")
print(f"After LexGrouping              : {N_GROUPED_OUTPUTS}")



Quantum layer Fock output size : 1820
After LexGrouping              : 32


In [7]:
# ─────────────────────────────────────────────
# 7. FULL HYBRID MODEL
# ─────────────────────────────────────────────
#
#   [16]  PCA of (masked surface + binary mask)
#     │
#   QuantumLayer (16 modes, 4 photons)
#     │
#   LexGrouping(Fock → 32)
#     │
#   Linear(32→256) + BN + ReLU + Dropout(0.3)
#   Linear(256→256) + BN + ReLU + Dropout(0.3)
#   Linear(256→224) + Sigmoid
#     │
#   [224] full reconstructed surface
#        → at test time, only the missing cell predictions are used

class QRCMaskedAutoencoder(nn.Module):
    def __init__(self, output_size: int):
        super().__init__()

        self.quantum = nn.Sequential(
            quantum_core,
            LexGrouping(quantum_core.output_size, N_GROUPED_OUTPUTS),
        )

        self.readout = nn.Sequential(
            nn.Linear(N_GROUPED_OUTPUTS, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(p=0.3),

            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(p=0.3),

            nn.Linear(256, output_size),
            nn.Sigmoid(),   # output in [0,1], same scale as MinMax-scaled targets
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.quantum(x)
        return self.readout(x)


model = QRCMaskedAutoencoder(output_size=n_features).to(DEVICE)

total_params     = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal params     : {total_params:,}")
print(f"Trainable params : {trainable_params:,}")



Total params     : 133,148
Trainable params : 133,148


In [8]:
# ─────────────────────────────────────────────
# 8. CUSTOM LOSS — only on masked cells
# ─────────────────────────────────────────────
#
# We only penalize the model on cells it couldn't see.
# Computing loss on observed cells too would be trivial (they're given)
# and would drown out the imputation signal.

def masked_mse_loss(pred, target, mask_indices_batch, batch_indices):
    """MSE computed only on the masked (missing) cells."""
    loss = torch.tensor(0.0, device=pred.device, requires_grad=True)
    count = 0
    for j, global_idx in enumerate(batch_indices):
        idx = mask_train[global_idx.item()]
        loss = loss + ((pred[j, idx] - target[j, idx]) ** 2).mean()
        count += 1
    return loss / max(count, 1)


In [10]:
# ─────────────────────────────────────────────
# 9. TRAIN
# ─────────────────────────────────────────────

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=10,
)

best_val_loss = float("inf")
best_state    = None

print("\nTraining (loss computed on masked cells only)...")
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_loss = 0.0

    for xb, yb, idxb in train_loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = masked_mse_loss(pred, yb, mask_train, idxb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * len(xb)

    epoch_loss /= len(X_train)

    # Validation: MSE on masked cells
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val)
    val_loss = 0.0
    for j in range(len(X_val)):
        idx = mask_val[j]
        val_loss += ((val_pred[j, idx] - Y_val[j, idx]) ** 2).mean().item()
    val_loss /= len(X_val)

    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = {k: v.clone() for k, v in model.state_dict().items()}

    print(f"  Epoch {epoch:3d}/{EPOCHS} | "
              f"train MSE: {epoch_loss:.6f} | "
              f"val MSE (masked): {val_loss:.6f} | "
              f"best: {best_val_loss:.6f}")

model.load_state_dict(best_state)
print(f"\nRestored best model (val MSE: {best_val_loss:.6f})")



Training (loss computed on masked cells only)...
  Epoch   1/100 | train MSE: 0.063059 | val MSE (masked): 0.051280 | best: 0.051280
  Epoch   2/100 | train MSE: 0.050502 | val MSE (masked): 0.044997 | best: 0.044997
  Epoch   3/100 | train MSE: 0.046689 | val MSE (masked): 0.043441 | best: 0.043441
  Epoch   4/100 | train MSE: 0.044442 | val MSE (masked): 0.036650 | best: 0.036650
  Epoch   5/100 | train MSE: 0.042307 | val MSE (masked): 0.033283 | best: 0.033283
  Epoch   6/100 | train MSE: 0.041009 | val MSE (masked): 0.030646 | best: 0.030646
  Epoch   7/100 | train MSE: 0.039811 | val MSE (masked): 0.028870 | best: 0.028870
  Epoch   8/100 | train MSE: 0.039084 | val MSE (masked): 0.027447 | best: 0.027447
  Epoch   9/100 | train MSE: 0.037719 | val MSE (masked): 0.025152 | best: 0.025152
  Epoch  10/100 | train MSE: 0.036142 | val MSE (masked): 0.024148 | best: 0.024148
  Epoch  11/100 | train MSE: 0.035290 | val MSE (masked): 0.023803 | best: 0.023803
  Epoch  12/100 | train MS

In [11]:
# ─────────────────────────────────────────────
# 10. EVALUATE  (on masked cells only, original scale)
# ─────────────────────────────────────────────

model.eval()
with torch.no_grad():
    val_pred_np = model(X_val).numpy()

val_true_np = Y_val.numpy()

# Collect only the masked cell predictions and targets
all_pred, all_true = [], []
for j in range(len(X_val)):
    idx = mask_val[j]
    pred_orig = surface_scaler.inverse_transform(val_pred_np[j:j+1])[:, idx]
    true_orig = surface_scaler.inverse_transform(val_true_np[j:j+1])[:, idx]
    all_pred.append(pred_orig.flatten())
    all_true.append(true_orig.flatten())

all_pred = np.concatenate(all_pred)
all_true = np.concatenate(all_true)

rmse = np.sqrt(mean_squared_error(all_true, all_pred))
mae  = np.mean(np.abs(all_true - all_pred))

print(f"\n{'='*55}")
print(f"VALIDATION RESULTS — masked cells only (original scale)")
print(f"{'='*55}")
print(f"  Overall RMSE : {rmse:.6f}")
print(f"  Overall MAE  : {mae:.6f}")
print(f"  (Volatility range ≈ 0.02 – 0.45)")



VALIDATION RESULTS — masked cells only (original scale)
  Overall RMSE : 0.014704
  Overall MAE  : 0.010172
  (Volatility range ≈ 0.02 – 0.45)


In [14]:
# ─────────────────────────────────────────────
# 11. TEST SET INFERENCE  (how to use at test time)
# ─────────────────────────────────────────────
#
# When you receive the test set with NaN values:
#
#   test_row = {col: value or NaN}
#
# Step 1: build the masked surface and binary mask
# Step 2: run through PCA pipeline
# Step 3: run through model
# Step 4: use model predictions ONLY for the NaN positions

def predict_missing(row_dict: dict, model, surface_scaler, pca, pca_scaler,
                    feature_cols, device):
    """
    row_dict: {column_name: value_or_nan}
    Returns: dict of {column_name: predicted_value} for only the missing cells
    """
    model.eval()

    surface   = np.array([row_dict.get(c, np.nan) for c in feature_cols],
                          dtype=np.float32)
    mask      = (~np.isnan(surface)).astype(np.float32)
    missing   = np.where(np.isnan(surface))[0]

    # Fill missing with 0 (neutral value in [0,1] scaled space)
    surface_filled = surface.copy()
    surface_filled[np.isnan(surface_filled)] = 0.0

    # Scale observed cells
    # We scale the full row — zeros in missing positions won't affect the fit
    surface_scaled = surface_scaler.transform(surface_filled[np.newaxis, :])[0]
    surface_scaled[missing] = 0.0   # re-zero after scaling

    # Build input: [scaled_surface | mask]
    inp = np.concatenate([surface_scaled, mask])[np.newaxis, :].astype(np.float32)

    # PCA + re-scale
    inp_pca = pca.transform(inp)
    inp_pca = pca_scaler.transform(inp_pca).astype(np.float32)

    with torch.no_grad():
        pred_scaled = model(torch.tensor(inp_pca, device=device)).numpy()

    pred_original = surface_scaler.inverse_transform(pred_scaled)[0]

    return {feature_cols[i]: float(pred_original[i]) for i in missing}


# Demo on last training row with synthetic missing cells
demo_row = {col: float(raw_data[-1, i]) for i, col in enumerate(feature_cols)}
# Simulate the test set example: specific cells missing
test_missing_cols = [
    "Tenor : 5; Maturity : 0.0833333333333333",
    "Tenor : 15; Maturity : 0.25",
    "Tenor : 10; Maturity : 0.5",
]
for col in test_missing_cols:
    demo_row[col] = np.nan

predictions = predict_missing(demo_row, model, surface_scaler, pca, pca_scaler,
                               feature_cols, DEVICE)
print(f"\nDemo prediction for test-style missing cells:")
for col, pred_val in predictions.items():
    short   = col.replace("Tenor : ", "T").replace("; Maturity : ", "/M")
    true_val = raw_data[-1, feature_cols.index(col)]
    print(f"  {short:<20} → predicted: {pred_val:.6f} | true: {true_val:.6f}")



Demo prediction for test-style missing cells:
  T5/M0.0833333333333333 → predicted: 0.042142 | true: 0.035875
  T15/M0.25            → predicted: 0.063201 | true: 0.058743
  T10/M0.5             → predicted: 0.098090 | true: 0.085806
