In [1]:
"""
Level 2 — Missing Data EDA
Understand exactly what is missing, where, and what patterns exist
to inform the model architecture.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from datasets import load_dataset



In [2]:
# ─────────────────────────────────────────────
# 1. LOAD BOTH DATASETS
# ─────────────────────────────────────────────

ds1 = load_dataset("Quandela/Challenge_Swaptions",
                   data_files="level-1_Future_prediction/train.csv", split="train")
ds2 = load_dataset("Quandela/Challenge_Swaptions",
                   data_files="level-2_Missing_data_prediction/train_level2.csv", split="train")

df1 = ds1.to_pandas()
df2 = ds2.to_pandas()

df1["Date"] = pd.to_datetime(df1["Date"], dayfirst=True)
df2["Date"] = pd.to_datetime(df2["Date"], dayfirst=True)
df1 = df1.sort_values("Date").reset_index(drop=True)
df2 = df2.sort_values("Date").reset_index(drop=True)

feat_cols = [c for c in df2.columns if c != "Date"]
tenors     = sorted(set(int(c.split("Tenor : ")[1].split(";")[0]) for c in feat_cols))
maturities = sorted(set(float(c.split("Maturity : ")[1]) for c in feat_cols))

print("=" * 60)
print("LEVEL 2 — MISSING DATA ANALYSIS")
print("=" * 60)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


LEVEL 2 — MISSING DATA ANALYSIS


In [3]:
# ─────────────────────────────────────────────
# 2. MISSING VALUE STRUCTURE
# ─────────────────────────────────────────────

missing_per_col = df2[feat_cols].isnull().mean()
missing_per_row = df2[feat_cols].isnull().sum(axis=1)

print(f"\nTotal rows          : {len(df2)}")
print(f"Total cells         : {len(df2) * len(feat_cols)}")
print(f"Missing cells       : {df2[feat_cols].isnull().sum().sum()}")
print(f"Missing %           : {df2[feat_cols].isnull().mean().mean()*100:.1f}%")
print(f"\nRows with NO missing: {(missing_per_row == 0).sum()}")
print(f"Rows with missing   : {(missing_per_row > 0).sum()}")
print(f"Missing per row     : always {missing_per_row[missing_per_row > 0].unique()} cells")

# Which maturities are missing?
print("\nMissing rate per MATURITY:")
for mat in maturities:
    cols = [c for c in feat_cols if float(c.split("Maturity : ")[1]) == mat]
    rate = df2[cols].isnull().mean().mean()
    bar  = "█" * int(rate * 40)
    print(f"  Maturity {mat:5.2f}yr : {rate*100:5.1f}%  {bar}")

# Which tenors are missing?
print("\nMissing rate per TENOR:")
for ten in tenors:
    cols = [c for c in feat_cols if int(c.split("Tenor : ")[1].split(";")[0]) == ten]
    rate = df2[cols].isnull().mean().mean()
    bar  = "█" * int(rate * 40)
    print(f"  Tenor {ten:2d}yr : {rate*100:5.1f}%  {bar}")

# Exactly which cells are missing?
missing_cols = missing_per_col[missing_per_col > 0].index.tolist()
print(f"\nExactly {len(missing_cols)} columns ever have missing values:")
for c in missing_cols[:10]:
    print(f"  {c}  →  {missing_per_col[c]*100:.1f}% missing")
if len(missing_cols) > 10:
    print(f"  ... and {len(missing_cols)-10} more")

# Are missing patterns consistent across rows?
print("\nIs the missing pattern the same on every row?")
missing_mask = df2[feat_cols].isnull()
pattern_counts = missing_mask.apply(tuple, axis=1).value_counts()
print(f"  Distinct missing patterns: {len(pattern_counts)}")
print(f"  Most common pattern count: {pattern_counts.iloc[0]} rows")



Total rows          : 489
Total cells         : 109536
Missing cells       : 0
Missing %           : 0.0%

Rows with NO missing: 489
Rows with missing   : 0
Missing per row     : always [] cells

Missing rate per MATURITY:
  Maturity  0.08yr :   0.0%  
  Maturity  0.25yr :   0.0%  
  Maturity  0.50yr :   0.0%  
  Maturity  0.75yr :   0.0%  
  Maturity  1.00yr :   0.0%  
  Maturity  1.50yr :   0.0%  
  Maturity  2.00yr :   0.0%  
  Maturity  3.00yr :   0.0%  
  Maturity  4.00yr :   0.0%  
  Maturity  5.00yr :   0.0%  
  Maturity  7.00yr :   0.0%  
  Maturity 10.00yr :   0.0%  
  Maturity 15.00yr :   0.0%  
  Maturity 20.00yr :   0.0%  
  Maturity 25.00yr :   0.0%  
  Maturity 30.00yr :   0.0%  

Missing rate per TENOR:
  Tenor  1yr :   0.0%  
  Tenor  2yr :   0.0%  
  Tenor  3yr :   0.0%  
  Tenor  4yr :   0.0%  
  Tenor  5yr :   0.0%  
  Tenor  6yr :   0.0%  
  Tenor  7yr :   0.0%  
  Tenor  8yr :   0.0%  
  Tenor  9yr :   0.0%  
  Tenor 10yr :   0.0%  
  Tenor 15yr :   0.0%  
  Tenor

In [4]:
# ─────────────────────────────────────────────
# 3. CORRELATION: MISSING ROWS vs OBSERVED ROWS
# ─────────────────────────────────────────────

# Get the maturity values that are missing
missing_mats = sorted(set(
    float(c.split("Maturity : ")[1])
    for c in missing_cols
))
observed_mats = [m for m in maturities if m not in missing_mats]

print(f"\nMissing maturities : {missing_mats}")
print(f"Observed maturities: {len(observed_mats)} values")

# For rows that have complete data (no missing values), check correlation
# between missing-maturity rows and neighboring observed-maturity rows
complete_rows = df2[missing_per_row == 0]
print(f"\nComplete rows available for correlation analysis: {len(complete_rows)}")

if len(complete_rows) > 10:
    # Use Level 1 data as complete reference instead
    print("Using Level 1 data as complete reference for correlation analysis...")
    ref = df1

    for miss_mat in missing_mats:
        miss_cols = [c for c in feat_cols if float(c.split("Maturity : ")[1]) == miss_mat]
        # Nearest observed maturities
        dists = [(abs(m - miss_mat), m) for m in observed_mats]
        dists.sort()
        nearest = [d[1] for d in dists[:3]]
        print(f"\n  Maturity {miss_mat}yr ← correlates with:")
        for near_mat in nearest:
            near_cols = [c for c in feat_cols if float(c.split("Maturity : ")[1]) == near_mat]
            # Average correlation across tenors
            corrs = []
            for mc, nc in zip(miss_cols, near_cols):
                if mc in ref.columns and nc in ref.columns:
                    corrs.append(ref[mc].corr(ref[nc]))
            print(f"    Maturity {near_mat}yr  →  avg corr = {np.mean(corrs):.4f}")



Missing maturities : []
Observed maturities: 16 values

Complete rows available for correlation analysis: 489
Using Level 1 data as complete reference for correlation analysis...


In [5]:

obs_cols  = [c for c in feat_cols if c not in missing_cols]
pred_cols = missing_cols

print(f"\n{'='*60}")
print(f"MODEL TASK SUMMARY")
print(f"{'='*60}")
print(f"  Input  (observed): {len(obs_cols)} cells per row")
print(f"  Output (to predict): {len(pred_cols)} cells per row")
print(f"  Ratio: predict {len(pred_cols)/len(feat_cols)*100:.1f}% of surface from {len(obs_cols)/len(feat_cols)*100:.1f}%")



MODEL TASK SUMMARY
  Input  (observed): 224 cells per row
  Output (to predict): 0 cells per row
  Ratio: predict 0.0% of surface from 100.0%
