point your kernel at the project’s src/ folder

In [None]:
import sys
from pathlib import Path

# Assume this notebook is in DTW-project/notebooks/, so the project root is the
# parent of the kernel's working directory.
project_root = Path.cwd().parent

# Later cells import through the `src.` package prefix
# (e.g. `from src.io.load_biosecurid import build_catalog`), which only resolves
# when the project root itself is importable. `src/` is still added, and is
# inserted last so that it keeps its original position at the front of sys.path.
# The membership test stops repeated runs of this cell from stacking duplicates.
for _entry in (project_root, project_root / "src"):
    if str(_entry) not in sys.path:
        sys.path.insert(0, str(_entry))

Core Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# Report the version of each core library, one per line.
for _name, _lib in (("NumPy:", np), ("Pandas:", pd), ("TensorFlow:", tf)):
    print(_name, _lib.__version__)


NumPy: 1.26.4
Pandas: 2.3.1
TensorFlow: 2.18.1


define your data tree

In [None]:
# Raw and processed data directories, both located under <project_root>/data.
RAW_ROOT = project_root.joinpath("data", "raw")
PROC_ROOT = project_root.joinpath("data", "processed")

Data Ingestion & Catalogue

In [None]:
# 1.1) Build the catalogue only when it is not already on disk.
# The existence check has to look at the same file that build_catalog is asked
# to write, data/biosecurid_catalog.parquet (the path the other cells in this
# notebook read). Checking under data/processed never finds it, so the ingest
# would be repeated on every run.
catalog_path = project_root / "data" / "biosecurid_catalog.parquet"
if catalog_path.exists():
    print("Catalog found—skipping ingest.")
else:
    from src.io.load_biosecurid import build_catalog

    # NOTE(review): global features are read from PROC_ROOT but local functions
    # from RAW_ROOT — confirm this asymmetry is intentional.
    df_catalog = build_catalog(
        global_dir=PROC_ROOT / "GlobalFeatures",
        local_dir=RAW_ROOT / "LocalFunctions",
        output_catalog=catalog_path,
    )
    print("Wrote", len(df_catalog), "entries")


Organise the dataset
In a first notebook cell, use the copy_subset or main() function from src/data/importData.py to copy the raw BiosecurID files into a structured data/processed/uXXXX/… hierarchy. This only needs to be run once.

In [None]:
# ─── 2) Check for existing processed data ───
# The import is skipped only when both feature folders already hold at least
# one u*.mat file.
processed = project_root / "data" / "processed"
has_global = any((processed / "GlobalFeatures").glob("u*.mat"))
has_local = any((processed / "LocalFunctions").glob("u*.mat"))
if has_global and has_local:
    print("✅ Processed data already present; skipping import.")
else:
    print("⏳ No processed files found; running importData.main() …")
    from src.data.importData import main as import_main
    import_main()
    print("✅ Data import complete.")

# ─── 3) Quick sanity-check of the directory tree ───
# NOTE(review): the check above expects processed/GlobalFeatures/u*.mat, while
# this listing expects processed/uXXXX/GlobalFeatures/*.mat — confirm which
# layout importData actually produces.
# Only directories are listed: "u*" also matches plain files, which would
# otherwise be reported as user folders containing 0 files.
for user_dir in sorted(p for p in processed.glob("u*") if p.is_dir()):
    n_global = sum(1 for _ in (user_dir / "GlobalFeatures").glob("*.mat"))
    n_local = sum(1 for _ in (user_dir / "LocalFunctions").glob("*.mat"))
    print(f"\n{user_dir.relative_to(project_root)}")
    print("  GlobalFeatures:", n_global, "files")
    print("  LocalFunctions:", n_local, "files")


Exploratory Data Analysis

In [None]:
import seaborn as sns

# 2.1) load catalog
cat = pd.read_parquet(project_root / "data" / "biosecurid_catalog.parquet")

# 2.2) global-feature EDA
# NOTE(review): np.loadtxt assumes every global_path is a plain-text numeric
# file holding 40 values; confirm, or switch to load_global() if the files
# are stored in another format.
gf = pd.DataFrame(
    [np.loadtxt(p) for p in cat.global_path],  # or use load_global()
    columns=[f"f{i+1}" for i in range(40)],
)

# Each plot gets its own figure and axes. Without this, both seaborn calls draw
# on the current axes, so the heatmap lands on top of the histogram and the
# second title replaces the first.
fig_hist, ax_hist = plt.subplots()
sns.histplot(data=gf, kde=True, ax=ax_hist)
ax_hist.set_title("All 40 Global Features")

# 2.3) correlation heatmap
corr = gf.corr()
fig_corr, ax_corr = plt.subplots()
sns.heatmap(corr, vmax=1, vmin=-1, center=0, ax=ax_corr)
ax_corr.set_title("Pearson Correlation of Global Features")


Pair-Generation

In [None]:
from src.pairing.make_pairs import generate_pairs as make_pairs
# Reload the catalogue and hand it to make_pairs (generate_pairs), then show
# how many pairs came back and preview the first rows.
# NOTE(review): the original note says make_pairs builds genuine vs forgery
# pairs, subsamples them and writes pairs_meta.parquet — confirm against
# src/pairing/make_pairs.py, since only the catalogue DataFrame is passed here.
cat = pd.read_parquet(
    project_root.joinpath("data", "biosecurid_catalog.parquet")
)
pairs_meta = make_pairs(df=cat)
print("Generated", len(pairs_meta), "pairs")
pairs_meta.head()


DTW Cache

In [None]:
from src.dtw.compute_dtw import build_cache

# build_cache receives the pair list, the catalogue and the target cache file.
# NOTE(review): it presumably computes and normalises the DTW distances and
# writes dtw_cache.parquet — confirm against src/dtw/compute_dtw.py.
dtw_cache = build_cache(
    pairs_path=project_root.joinpath("data", "pairs_meta.parquet"),
    catalog_path=project_root.joinpath("data", "biosecurid_catalog.parquet"),
    cache_path=project_root.joinpath("data", "dtw_cache.parquet"),
)

# build_cache may return nothing, so report the outcome for either case.
if dtw_cache is None:
    print("DTW cache built and saved, but no object returned.")
else:
    print("Cached DTW for", len(dtw_cache), "pairs")


Baseline Evaluation

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np

# Join the pair metadata with the cached DTW values on their shared pair_id.
pairs = pd.read_parquet(project_root.joinpath("data", "pairs_meta.parquet"))
cache = pd.read_parquet(project_root.joinpath("data", "dtw_cache.parquet"))
df = pd.merge(pairs, cache, on="pair_id")

# ROC curve and AUC, using the d_norm1 column as the score.
# NOTE(review): d_norm1 looks like a distance; confirm that larger values
# correspond to the positive label, otherwise the curve is inverted.
fpr, tpr, thr = roc_curve(df["label"], df["d_norm1"])
auc = roc_auc_score(df["label"], df["d_norm1"])

# Classifier curve plus the chance diagonal for reference.
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot((0, 1), (0, 1), "k--")
plt.title("Baseline ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
