## 3) Generate Pair Metadata

In [1]:
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-folder of the project — confirm before running it from elsewhere.
project_root = Path.cwd().parent

# Put the project root at the front of sys.path so the `src` package can be
# imported. Any existing copy of the entry is dropped first, so re-running
# this cell does not stack duplicate entries while the root still ends up at
# index 0 exactly as before.
_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

# 3.1) Core pairing functions
from src.pairing.make_pairs import generate_pairs, balance_subsample

# 3.2) Paths (input catalog and output pair metadata, both under <root>/data)
CATALOG_PATH = project_root / "data" / "biosecurid_catalog.parquet"
PAIRS_PATH   = project_root / "data" / "pairs_meta.parquet"

In [2]:
# Tunables for the balancing/subsampling step, named so they are easy to find
# and change instead of being buried in the call below.
MAX_PAIRS = 200_000
RANDOM_STATE = 42

# 3.3) Load the catalog
catalog = pd.read_parquet(CATALOG_PATH)

# 3.4) Select only the local-function entries and build full paths
loc = catalog[catalog["feature"] == "localfunctions"].copy()
if loc.empty:
    # Fail fast: otherwise the steps below would run on an empty selection
    # and could overwrite PAIRS_PATH with a result built from no rows.
    raise ValueError(f"No 'localfunctions' rows found in {CATALOG_PATH}")

# Hoisted out of the lambda: the base directory is the same for every row.
processed_dir = project_root / "data" / "processed"
loc["path"] = loc["file_path"].apply(lambda p: str(processed_dir / p))

# 3.5) Generate all labelled pairs (genuine=1, forgery=0)
# The pairing logic itself lives in src.pairing.make_pairs.
pairs = generate_pairs(loc, path_col="path", weak_forgery=False)

# 3.6) Balance & subsample to at most MAX_PAIRS pairs
# `pairs` is deliberately rebound so the full, un-sampled frame can be freed.
pairs = balance_subsample(pairs, max_pairs=MAX_PAIRS, random_state=RANDOM_STATE)

# 3.7) Persist for downstream steps
pairs.to_parquet(PAIRS_PATH, index=False)
print(f"✅ Wrote {len(pairs):,} pairs → {PAIRS_PATH}")

# 3.8) Sanity check
print("\n--- Pairs preview ---")
display(pairs.head())

print("\n--- Pair counts by label ---")
print(pairs["label"].value_counts())

✅ Wrote 38,400 pairs → c:\Users\mattt\Skripsie\Projects\DTW-project\data\pairs_meta.parquet

--- Pairs preview ---


Unnamed: 0,userA,sessionA,sampleA,pathA,userB,sessionB,sampleB,pathB,label
0,1116,1,6,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1116,1,4,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,0
1,1147,3,6,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1147,3,7,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1
2,1336,3,2,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1336,3,4,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,0
3,1284,4,1,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1284,4,5,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,0
4,1127,3,6,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,1127,3,3,c:\Users\mattt\Skripsie\Projects\DTW-project\d...,0



--- Pair counts by label ---
label
0    19200
1    19200
Name: count, dtype: int64
