In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Level 1 vs Level 2 data comparison
**TL;DR:** L2 is L1 with 5 rows missing, so forecasting from L1 remains the core task. Once L1 forecasting works, adding data imputation for the rows missing from L2 completes the challenge.

In [None]:
# Load the Level 1 (future prediction) training split from the Hugging Face Hub.
# The default download mode is used so repeated runs reuse the local cache,
# consistent with the Level 2 load; pass download_mode="force_redownload"
# only when the cached copy is suspected to be stale.
ds_level1 = load_dataset(
    "Quandela/Challenge_Swaptions",
    data_files="level-1_Future_prediction/train.csv",
    split="train",
)

df = ds_level1.to_pandas()
# Convert price columns from string to float (every column except 'Date')
price_cols_all = [c for c in df.columns if c != 'Date']
df[price_cols_all] = df[price_cols_all].astype(float)
df.sample(5)

In [None]:
# Structural overview of the Level 1 frame
level1_shape = df.shape
print(f"Shape: {level1_shape}")
df.info()      # printed summary of the frame
df.describe()  # summary statistics, shown as the cell output

In [None]:
# Level 2 data
ds_level2 = load_dataset(
    "Quandela/Challenge_Swaptions",
    split="train",
    data_files="level-2_Missing_data_prediction/train_level2.csv",
)
df2 = ds_level2.to_pandas()

# Everything except 'Date' is a price column; cast those columns to float
price_cols_all = [name for name in df2.columns if name != 'Date']
df2 = df2.astype({name: float for name in price_cols_all})
df2.sample(5)

In [None]:
# Structural overview of the Level 2 frame
level2_shape = df2.shape
print(f"Shape: {level2_shape}")
df2.info()      # printed summary of the frame
df2.describe()  # summary statistics, shown as the cell output

In [None]:
# Find missing rows (in L1 but not L2) by Date
missing = df[~df['Date'].isin(df2['Date'])]
print(f"Missing {len(missing)} rows from L2:\n")
print(missing['Date'].tolist())

# Check if shared rows have identical values.
# The inner merge keeps only dates present in both frames; overlapping price
# columns receive the _L1 / _L2 suffixes.
merged = df.merge(df2, on='Date', suffixes=('_L1', '_L2'))
l1_values = merged[[f'{col}_L1' for col in price_cols_all]].to_numpy(dtype=float)
l2_values = merged[[f'{col}_L2' for col in price_cols_all]].to_numpy(dtype=float)

# NaN != NaN evaluates to True, so a cell that is missing in BOTH frames would
# be reported as a difference. Exclude those; a value present on one side only
# still counts as a difference.
both_missing = np.isnan(l1_values) & np.isnan(l2_values)
diffs = int(((l1_values != l2_values) & ~both_missing).sum())
print(f"\nValue differences across shared rows: {diffs}")

L1 has 494 records and L2 has 489: exactly 5 records are missing from L2, and the remaining rows are identical. The five missing dates (DD/MM/YYYY) are: `['01/03/2050', '14/04/2050', '28/05/2050', '17/08/2050', '20/10/2050']`


# EDA L1
## Data Structure

- **494 rows** = trading days (~2 years daily data)
- **224 price columns** + 1 `Date` column (wide format — each row is one day's full pricing surface)

### The Grid: 14 Tenors × 16 Maturities = 224 instruments

| Dimension | Values (years) |
|---|---|
| **Tenor** (how long the swap lasts) | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30 |
| **Maturity** (when the option expires) | 0.083, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30 |

E.g. `Tenor : 5; Maturity : 2` = price of an option expiring in 2 years to enter a 5-year swap.

**Task:** Given a sequence of 224-d daily surface snapshots, predict the next ~10 trading days (2 weeks).

In [None]:
# Parse dates and check frequency.
# The raw strings are day-first (e.g. '14/04/2050'). With format='mixed' alone,
# ambiguous values such as '01/03/2050' are read month-first while unambiguous
# ones are read day-first, which scrambles the chronological sort below.
# dayfirst=True resolves every ambiguous value as DD/MM.
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
df = df.sort_values('Date').reset_index(drop=True)

print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Total days: {df.shape[0]}")
# A gap of more than 3 calendar days is longer than an ordinary weekend.
print(f"Gaps > 3 calendar days: {(df['Date'].diff().dt.days > 3).sum()}")
# Select the value columns by name instead of assuming 'Date' is column 0.
print(f"Null values: {df.drop(columns='Date').isnull().sum().sum()}")

In [None]:
# Split each column label ('Tenor : <t>; Maturity : <m>') into numeric coordinates
def _label_value(field):
    """Return the number that follows the ':' in one 'Name : value' fragment."""
    return float(field.split(':')[1].strip())


price_cols = [name for name in df.columns if name != 'Date']
label_fields = [name.split(';') for name in price_cols]
tenors = [_label_value(fields[0]) for fields in label_fields]      # first fragment
maturities = [_label_value(fields[1]) for fields in label_fields]  # second fragment

# Distinct grid coordinates in ascending order
unique_tenors = sorted(set(tenors))
unique_maturities = sorted(set(maturities))
print(f"Tenors ({len(unique_tenors)}): {unique_tenors}")
print(f"Maturities ({len(unique_maturities)}): {unique_maturities}")

In [None]:
# Heatmap: swaption surface for a single day
day_idx = 0  # first day

# Place every instrument by the (maturity, tenor) parsed from its column label
# instead of reshaping the raw row. A bare reshape only draws the right surface
# if the CSV columns happen to be ordered maturity-major; the pivot is correct
# for any column order and raises if a (maturity, tenor) pair is duplicated.
surface = (
    pd.DataFrame({
        'maturity': maturities,
        'tenor': tenors,
        'value': df[price_cols].iloc[day_idx].to_numpy(dtype=float),
    })
    .pivot(index='maturity', columns='tenor', values='value')
    .reindex(index=unique_maturities, columns=unique_tenors)
    .to_numpy()
)

fig, ax = plt.subplots(figsize=(10, 7))
# origin='lower' puts the smallest maturity on the bottom row
im = ax.imshow(surface, aspect='auto', cmap='viridis', origin='lower')
ax.set_xticks(range(len(unique_tenors)))
ax.set_xticklabels([f"{t:.0f}" if t >= 1 else f"{t}" for t in unique_tenors], rotation=45)
ax.set_yticks(range(len(unique_maturities)))
ax.set_yticklabels([f"{m:.2f}" if m < 1 else f"{m:.0f}" for m in unique_maturities])
ax.set_xlabel('Tenor (years)')
ax.set_ylabel('Maturity (years)')
ax.set_title(f"Swaption Surface — {df['Date'].iloc[day_idx]}")
plt.colorbar(im, label='Price')
plt.tight_layout()
plt.show()

In [None]:
# Plot the history of a few hand-picked (tenor, maturity) instruments
samples = [
    'Tenor : 5; Maturity : 2',
    'Tenor : 10; Maturity : 5',
    'Tenor : 1; Maturity : 0.25',
    'Tenor : 30; Maturity : 30',
]

fig, ax = plt.subplots(figsize=(12, 5))
for instrument in samples:
    # One line per instrument, labelled with its column name
    ax.plot(df['Date'], df[instrument], label=instrument, alpha=0.8)
ax.set(
    xlabel='Date',
    ylabel='Price',
    title='Swaption Price Time Series (selected instruments)',
)
ax.legend(fontsize=8)
fig.tight_layout()
plt.show()

In [None]:
# Correlation structure — how related are the instruments?
corr = df[price_cols].corr()
# Derive the instrument count from the matrix itself so the title and the
# triangle indices stay correct if the number of price columns changes.
n_instruments = corr.shape[0]

fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr.values, cmap='RdBu_r', vmin=-1, vmax=1)
ax.set_title(f'Correlation Matrix ({n_instruments} instruments)')
plt.colorbar(im)
plt.tight_layout()
plt.show()

# Strict upper triangle (k=1): each unordered pair once, diagonal excluded.
pairwise = corr.values[np.triu_indices(n_instruments, k=1)]
print(f"Mean pairwise correlation: {pairwise.mean():.3f}")

In [None]:
# PCA — how many components capture the surface variance?
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every instrument column, then fit a PCA with all components kept
X = StandardScaler().fit_transform(df[price_cols].values)
pca = PCA().fit(X)
cumvar = np.cumsum(pca.explained_variance_ratio_)

# argmax on the boolean mask gives the first index reaching the threshold;
# +1 converts that zero-based index into a component count
n95 = np.argmax(cumvar >= 0.95) + 1
n99 = np.argmax(cumvar >= 0.99) + 1

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(range(1, len(cumvar)+1), cumvar, 'b-')
for level, colour, text in ((0.95, 'r', '95% variance'), (0.99, 'orange', '99% variance')):
    ax.axhline(level, color=colour, linestyle='--', label=text)
ax.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    title='PCA on Swaption Surface',
    xlim=(0, 30),
)
ax.legend()
fig.tight_layout()
plt.show()

print(f"Components for 95% variance: {n95}")
print(f"Components for 99% variance: {n99}")

## PCA Result

**3 components capture 99% of the variance** — the 224-dimensional surface can be described by just 3 numbers per day.

- **PC1** = overall level (all prices shift up/down together)
- **PC2** = slope (short vs long end move differently)
- **PC3** = curvature (middle vs extremes)

**Implication:** We compress 224 → 3 features via PCA, then predict a time-series of 3-d vectors. This fits easily within the 20-mode quantum circuit limit. After prediction, inverse-PCA reconstructs the full 224-price surface.

In [None]:
# Distribution of daily price changes (returns)
# First differences of each instrument; the first row is dropped because
# diff() leaves it as NaN.
returns = df[price_cols].diff().iloc[1:]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].hist(returns.values.flatten(), bins=100, edgecolor='none', alpha=0.7)
axes[0].set_title('Distribution of Daily Price Changes')
axes[0].set_xlabel('ΔPrice')

# Volatility across the surface (std of daily changes per instrument)
vol = returns.std()
# Place every instrument by the (maturity, tenor) parsed from its column label
# instead of reshaping the flat vector. A bare reshape only draws the right grid
# if the CSV columns happen to be ordered maturity-major; the pivot is correct
# for any column order.
vol_surface = (
    pd.DataFrame({
        'maturity': maturities,
        'tenor': tenors,
        'value': vol[price_cols].to_numpy(dtype=float),
    })
    .pivot(index='maturity', columns='tenor', values='value')
    .reindex(index=unique_maturities, columns=unique_tenors)
    .to_numpy()
)
# origin='lower' puts the smallest maturity on the bottom row
im = axes[1].imshow(vol_surface, aspect='auto', cmap='hot', origin='lower')
axes[1].set_xticks(range(len(unique_tenors)))
axes[1].set_xticklabels([f"{t:.0f}" if t >= 1 else f"{t}" for t in unique_tenors], rotation=45)
axes[1].set_yticks(range(len(unique_maturities)))
axes[1].set_yticklabels([f"{m:.2f}" if m < 1 else f"{m:.0f}" for m in unique_maturities])
axes[1].set_xlabel('Tenor')
axes[1].set_ylabel('Maturity')
axes[1].set_title('Volatility of Daily Changes')
plt.colorbar(im, ax=axes[1])
plt.tight_layout()
plt.show()

## Volatility Surface Observations

**Important:** The dataset values are **implied volatilities**, not raw prices. Predicting these IV values is the challenge objective.

Daily change volatility observations:
- **Lowest volatility:** 1-month maturity row (bottom) — short-dated options are most stable day-to-day
- **Highest volatility (~0.025):** tenors <7 with maturities 2–7 — the "belly" of the surface moves most
- **Long maturities (20+):** also elevated — far-future options are harder to predict
- **Overall range is narrow** — daily moves are small, so the model needs to be precise

In [None]:
# Autocorrelation — how predictable is tomorrow from today?
from statsmodels.tsa.stattools import acf

# Project onto the fitted PCA and keep only the three leading components
X_pca = pca.transform(X)[:, :3]
pc_labels = ['PC1 (level)', 'PC2 (slope)', 'PC3 (curvature)']

fig, axes = plt.subplots(1, 3, figsize=(14, 3))
for pc_idx, panel in enumerate(axes):
    # Autocorrelation at lags 0..20 for this component, one bar per lag
    lag_corr = acf(X_pca[:, pc_idx], nlags=20)
    panel.bar(range(len(lag_corr)), lag_corr, width=0.5)
    panel.set(
        title=pc_labels[pc_idx],
        xlabel='Lag (days)',
        ylabel='Autocorrelation',
    )
    panel.axhline(0, color='k', linewidth=0.5)
fig.suptitle('Autocorrelation of PCA Components', y=1.02)
fig.tight_layout()
plt.show()

In [None]:
# Stationarity check — are PCA components trending or mean-reverting?
from statsmodels.tsa.stattools import adfuller

for pc_idx, label in enumerate(pc_labels):
    # Element 1 of the adfuller result is the value reported below as the p-value
    p_value = adfuller(X_pca[:, pc_idx])[1]
    if p_value < 0.05:
        status = "Stationary"
    else:
        status = "Non-stationary (trending)"
    print(f"{label}: ADF p-value={p_value:.4f} → {status}")

## EDA Summary

| Finding | Implication |
|---|---|
| 224 IV values compress to **3 PCA components** (99% variance) | Model input is just 3 features per day |
| High autocorrelation → tomorrow looks like today | Time-series forecasting is feasible |
| If non-stationary → components trend over time | May need to predict **daily changes** (Δ) instead of levels |
| Narrow daily change range (~0.025 max) | Model must be precise; small errors matter |
| Values are **implied volatilities** | Predict IV directly — this is the challenge objective |

**Next step:** Build a classical baseline (MLP/RNN) on PCA-compressed IV time series.