In [1]:
# -------------------------------------------
# Notebook: 03_split_data.ipynb
# Purpose: Create global train/test splits for all assets
# Project: Fall 2025 Erdős Institute Quant Finance Bootcamp
# -------------------------------------------

import sys,os
import pandas as pd



In [2]:
# -------------------------------------------
# 1. Make the project's packages importable
# -------------------------------------------

# The parent of the current working directory is treated as the project root.
PROJECT_ROOT = os.path.abspath(os.pardir)

# Register the root only once, so re-running this cell never adds duplicates.
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.append(PROJECT_ROOT)



In [3]:
# -------------------------------------------
# 2. Import the split function
# -------------------------------------------

from modules.data_utils import split_data_global, load_split_manifest, slice_shock_window

In [4]:
# -------------------------------------------
# 3. Split configuration: global cutoff + extra windows
# -------------------------------------------

# One cutoff shared by *all* assets (the pre-COVID train end date):
#   rows dated on or before TRAIN_END -> train
#   rows dated after TRAIN_END        -> test
TRAIN_END = "2019-06-05"

# Additional named training windows, one (label, start_date, end_date) tuple each.
EXTRA_WINDOWS = [("covid", "2017-06-30", "2020-04-15")]



In [5]:
# -------------------------------------------
# 4. Perform the global time-based split
# -------------------------------------------

# Run the split for every asset, including the extra training windows.
# The cutoff is taken from TRAIN_END (defined in section 3) instead of a
# repeated date literal, so editing that constant really changes the split.
manifest = split_data_global(
    data_dir="data",
    train_end=TRAIN_END,
    extra_train_windows=EXTRA_WINDOWS,
    save_dir="data/splits",
    # NOTE(review): 252 is presumably ~one year of trading days — confirm
    # against split_data_global's documentation.
    min_train_length=252,
    verbose=True,
)

# Reload the saved manifest one period at a time for easy access later.

# Pre-COVID
pre_manifest = load_split_manifest(period="pre_covid")
print(pre_manifest)

# COVID period
covid_manifest = load_split_manifest(period="covid")
print(covid_manifest)




FX_USDINR (covid): train=726 rows
FX_USDINR: train=1561, test=1667
BOND_BNDX (covid): train=702 rows
BOND_BNDX: train=1510, test=1610
COM_CRUDE (covid): train=702 rows
COM_CRUDE: train=1498, test=1613
[OK] Manifest saved: data/splits/manifest.csv

Saved manifest to data/splits/manifest.csv
[OK] Loaded manifest for pre_covid (3 assets).
       asset train_start  train_end test_start   test_end  n_train  n_test  \
0  FX_USDINR  2013-06-06 2019-06-05 2019-06-06 2025-10-29     1561    1667   
2  BOND_BNDX  2013-06-06 2019-06-05 2019-06-06 2025-10-29     1510    1610   
4  COM_CRUDE  2013-06-06 2019-06-05 2019-06-06 2025-10-29     1498    1613   

                        train_path                       test_path  
0  data/splits/FX_USDINR_train.csv  data/splits/FX_USDINR_test.csv  
2  data/splits/BOND_BNDX_train.csv  data/splits/BOND_BNDX_test.csv  
4  data/splits/COM_CRUDE_train.csv  data/splits/COM_CRUDE_test.csv  
[OK] Loaded manifest for covid (3 assets).
       asset train_start  trai

In [6]:


# Convert the manifest's date columns to datetime so they can be compared with
# the cutoff and formatted below. The conversion is written back onto
# `manifest` (as before) and is safe to re-run.
for col in ["train_start", "train_end", "test_start", "test_end"]:
    manifest[col] = pd.to_datetime(manifest[col])

# Derive the cutoff from the configured TRAIN_END rather than repeating the
# date literal, so the labels below cannot drift from the split itself.
pre_covid_cutoff = pd.to_datetime(TRAIN_END)

# Summarise every split row, one asset at a time. sort=False keeps assets in
# order of first appearance, and avoids rescanning the whole frame per asset.
for asset, asset_rows in manifest.groupby("asset", sort=False):
    for row in asset_rows.itertuples(index=False):
        # A window whose training data ends on or before the global cutoff is
        # the pre-COVID split; anything ending later is an extra window.
        if row.train_end <= pre_covid_cutoff:
            label = "Pre-COVID"
        else:
            label = "COVID/extra window"

        print(f"{asset} - {label}: train={row.n_train} rows | test={row.n_test} rows")
        print(f"  Train range: {row.train_start.date()} → {row.train_end.date()}")
        print(f"  Test range: {row.test_start.date()} → {row.test_end.date()}")


FX_USDINR - Pre-COVID: train=1561 rows | test=1667 rows
  Train range: 2013-06-06 → 2019-06-05
  Test range: 2019-06-06 → 2025-10-29
FX_USDINR - COVID/extra window: train=726 rows | test=1442 rows
  Train range: 2017-06-30 → 2020-04-15
  Test range: 2020-04-16 → 2025-10-29
BOND_BNDX - Pre-COVID: train=1510 rows | test=1610 rows
  Train range: 2013-06-06 → 2019-06-05
  Test range: 2019-06-06 → 2025-10-29
BOND_BNDX - COVID/extra window: train=702 rows | test=1393 rows
  Train range: 2017-06-30 → 2020-04-15
  Test range: 2020-04-16 → 2025-10-29
COM_CRUDE - Pre-COVID: train=1498 rows | test=1613 rows
  Train range: 2013-06-06 → 2019-06-05
  Test range: 2019-06-06 → 2025-10-29
COM_CRUDE - COVID/extra window: train=702 rows | test=1396 rows
  Train range: 2017-06-30 → 2020-04-15
  Test range: 2020-04-16 → 2025-10-29
