## 4) Compute & Cache DTW Distances

In [1]:
import sys
from pathlib import Path
import pandas as pd
from IPython.display import display

# 4.1) Ensure src/ is importable
# The parent of the current working directory is treated as the project root
# and placed at the front of sys.path so the `src` package can be imported.
project_root = Path.cwd().parent
# Only insert when the root is not already the first entry: re-running this
# cell then leaves sys.path unchanged instead of stacking duplicate entries.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# 4.2) Import the build_cache function
from src.dtw.compute_dtw import build_cache

# 4.3) Input / output locations — every file lives under <project_root>/data
data_dir     = project_root.joinpath("data")
PAIRS_PATH   = data_dir / "pairs_meta.parquet"
CATALOG_PATH = data_dir / "biosecurid_catalog.parquet"
CACHE_PATH   = data_dir / "dtw_cache.parquet"

# Report whether the two input files are present before the build step runs
print("Pairs:", PAIRS_PATH.exists(), "Catalog:", CATALOG_PATH.exists())

# %%  
# 4.4) Run DTW cache build (this will skip already-computed pair_id rows)
# Tunable options are gathered in one mapping so they can be reviewed at a glance.
dtw_cache_options = {
    "chunk_size": 5000,   # adjust based on memory / speed
    "backend": None,      # None -> pick best available automatically
    "window": 10,         # Sakoe-Chiba window size for bounded DTW
}

build_cache(pairs_path=PAIRS_PATH, cache_path=CACHE_PATH, **dtw_cache_options)

# %%  
# 4.5) Quick sanity check: peek at the first few rows
# Read the parquet cache back from disk and report how many pairs it holds.
df_cache = pd.read_parquet(CACHE_PATH)
print(f"Total cached pairs: {len(df_cache):,}")

print("\n--- Cache preview ---")
display(df_cache.head())

print("\n--- Cache info ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_cache.info()

Pairs: True Catalog: True
Total cached pairs: 38,400

--- Cache preview ---


Unnamed: 0,pair_id,label,d_raw,d_bound,path_len,len_ref,len_qry
0,0,0,90.088621,91.125942,1205,493,1124
1,1,1,33.710096,33.722913,522,469,476
2,2,0,99.443359,99.443359,1616,596,1586
3,3,0,85.101735,85.101735,1189,495,1160
4,4,0,67.822628,67.822628,661,226,652



--- Cache info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38400 entries, 0 to 38399
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pair_id   38400 non-null  int64  
 1   label     38400 non-null  int64  
 2   d_raw     38400 non-null  float64
 3   d_bound   38400 non-null  float64
 4   path_len  38400 non-null  int64  
 5   len_ref   38400 non-null  int64  
 6   len_qry   38400 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 2.1 MB
None
