# 01: Data Loading and Cleaning

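This notebook loads the VARBX return series and its two benchmarks (S&P 500 and HFRI ED), cleans each series, trims the HFRI ED history to the VARBX/S&P 500 start date, merges everything into a single returns table, and saves the result to the interim data directory as `returns_merged.csv`.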


In [1]:
import sys
from pathlib import Path

# Make the project's `src` package importable from this notebook.
# Jupyter may start the kernel in notebooks/ or at the project root, so the
# root is located relative to the current working directory.


def find_project_root(start, marker="config.yml"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory (``pathlib.Path``) to begin the upward search from.
        marker: File name whose presence identifies the project root.

    Returns:
        The first matching directory as a ``Path``, or ``None`` when neither
        ``start`` nor any of its ancestors contains ``marker``.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


current_dir = Path.cwd()

if current_dir.name == 'notebooks':
    # Kernel started inside notebooks/: the project root is one level up.
    project_root = current_dir.parent
else:
    project_root = find_project_root(current_dir)
    if project_root is None:
        # No config.yml anywhere up the tree. Fall back to the working
        # directory and say so, rather than silently ending up with the
        # filesystem root on sys.path.
        print(
            f"Warning: config.yml not found at or above {current_dir}; "
            "using the current directory as project root",
            file=sys.stderr,
        )
        project_root = current_dir

# Keep the project root as the first sys.path entry so `src` resolves to this
# project, and drop any earlier copies so re-running the cell does not pile up
# duplicate entries.
_root_entry = str(project_root)
sys.path[:] = [_root_entry] + [entry for entry in sys.path if entry != _root_entry]

import pandas as pd
import numpy as np
from src.data.loaders import load_varbx_data, load_sp500_data, load_hfri_ed_data
from src.data.preprocess import align_timeframes, merge_returns_dataframes, clean_returns_dataframe
from src.utils.paths import get_data_interim_path, ensure_dir

# Sanity check of the resolved project root: print it together with whether
# config.yml is present there (safe to delete once the setup is confirmed).
config_file = project_root / 'config.yml'
print(f"Project root: {project_root}")
print(f"Config exists: {config_file.exists()}")


Project root: /Users/paulgarofalo/Desktop/varbxaltinvfinalproj
Config exists: True


## Load Data


In [2]:
# Read the VARBX return series through the project loader
varbx_df = load_varbx_data()

# Report the row count, then preview the first rows
print(f"VARBX data: {len(varbx_df)} observations")
varbx_preview = varbx_df.head()
print(varbx_preview)


VARBX data: 70 observations
        date    return
0 2020-02-29 -0.002849
1 2020-03-31 -0.020952
2 2020-04-30  0.009728
3 2020-05-31  0.000963
4 2020-06-30  0.000962


In [3]:
# Read both benchmark series (S&P 500 first, then HFRI ED) through the project loaders
sp500_df, hfri_ed_df = load_sp500_data(), load_hfri_ed_data()

# Report how many rows each benchmark contributes
print(f"S&P 500 data: {len(sp500_df)} observations")
print(f"HFRI ED (Merger Arbitrage Index) data: {len(hfri_ed_df)} observations")


S&P 500 data: 70 observations
HFRI ED (Merger Arbitrage Index) data: 429 observations


## Clean and Align Data


In [4]:
# Clean each dataset with the shared cleaning routine; all three use the same
# "date" / "return" column names.
varbx_clean = clean_returns_dataframe(varbx_df, date_column="date", return_column="return")
sp500_clean = clean_returns_dataframe(sp500_df, date_column="date", return_column="return")
hfri_ed_clean = clean_returns_dataframe(hfri_ed_df, date_column="date", return_column="return")

# Trim HFRI ED so it does not start before the VARBX / S&P 500 data.
# A start date is None when the corresponding cleaned frame has no rows.
varbx_start = varbx_clean["date"].min() if len(varbx_clean) > 0 else None
sp500_start = sp500_clean["date"].min() if len(sp500_clean) > 0 else None

if varbx_start is not None and sp500_start is not None:
    # Use the earlier of the two start dates as the cut-off.
    # NOTE(review): min() keeps HFRI ED rows back to the earlier start; if the
    # three series are meant to overlap fully, max() would be needed - confirm intent.
    common_start = min(varbx_start, sp500_start)
    # Keep only HFRI ED rows on or after the cut-off and renumber the index
    hfri_ed_clean = hfri_ed_clean[hfri_ed_clean["date"] >= common_start].copy().reset_index(drop=True)
    print(f"Pruned HFRI ED data to start at {common_start} (same as VARBX/SP500)")
else:
    # Make the skipped pruning visible instead of passing the full HFRI ED
    # history downstream without any indication.
    print("Skipped HFRI ED pruning: VARBX and/or S&P 500 has no rows after cleaning")


Pruned HFRI ED data to start at 2020-02-29 00:00:00 (same as VARBX/SP500)


In [5]:
# Combine the three cleaned series into one DataFrame; each suffix labels the
# series passed in the same position.
returns_frames = (varbx_clean, sp500_clean, hfri_ed_clean)
returns_df = merge_returns_dataframes(
    *returns_frames,
    date_column="date",
    suffixes=["varbx", "sp500", "hfri_ed"],
)

# Summarise the merged table: row count, covered date span, first rows
merged_dates = returns_df['date']
print(f"Merged data: {len(returns_df)} observations")
print(f"Date range: {merged_dates.min()} to {merged_dates.max()}")
print(returns_df.head())


Merged data: 70 observations
Date range: 2020-02-29 00:00:00 to 2025-11-30 00:00:00
        date  return_varbx  return_sp500  return_hfri_ed
0 2020-02-29     -0.002849     -0.079166       -0.013640
1 2020-03-31     -0.020952     -0.124871       -0.095794
2 2020-04-30      0.009728      0.126984        0.048416
3 2020-05-31      0.000963      0.047645       -0.007537
4 2020-06-30      0.000962      0.017734        0.013172


## Save Cleaned Data


In [6]:
# Write the merged returns to the interim data directory, without the index column
interim_path = get_data_interim_path()
interim_path = ensure_dir(interim_path)
merged_csv_path = interim_path / "returns_merged.csv"
returns_df.to_csv(merged_csv_path, index=False)
print(f"Saved cleaned data to {merged_csv_path}")


Saved cleaned data to /Users/paulgarofalo/Desktop/varbxaltinvfinalproj/data/interim/returns_merged.csv
