## 1) Organise the dataset

In [1]:
# 1.1) Ensure src/ is importable and define roots
import sys
from pathlib import Path
import pandas as pd

# The project root is taken to be the parent of the current working directory
# (assumes the kernel runs from a direct sub-folder of the project — TODO confirm).
project_root = Path.cwd().parent

# Put the project root at the front of sys.path so the `src` package import
# below resolves. Existing copies of the entry are dropped first, so
# re-running this cell does not keep growing sys.path with duplicates.
_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

from src.data.importData import main as import_main

# Dataset directories, both located under <project_root>/data:
# RAW_ROOT and PROC_ROOT are the two roots handed to import_main().
RAW_ROOT = project_root.joinpath("data", "raw")
PROC_ROOT = project_root.joinpath("data", "processed")

In [2]:
# 1.2) Skip copy if already organised
def _find_user_dirs(root):
    """Return the directories directly under *root* whose names start with 'u'."""
    return [entry for entry in root.glob("u*") if entry.is_dir()]


user_dirs = _find_user_dirs(PROC_ROOT)
if not user_dirs:
    # Nothing organised yet: run the import, then look again to report the result.
    print("⏳ No per-user folders found under processed → running import_main()")
    import_main(raw_root=RAW_ROOT, proc_root=PROC_ROOT)
    user_dirs = _find_user_dirs(PROC_ROOT)
    print(f"✅ Created {len(user_dirs)} user folders under {PROC_ROOT}")
else:
    print(f"✅ Found {len(user_dirs)} user folders under {PROC_ROOT}. Skipping import.")

# 1.3) Sanity-check layout for first few users
# Only the first three folders (in sorted order) are inspected; for each one,
# count the .mat files sitting directly in its two feature sub-folders.
for user_dir in sorted(user_dirs)[:3]:
    global_count = sum(1 for _ in (user_dir / "GlobalFeatures").glob("*.mat"))
    local_count = sum(1 for _ in (user_dir / "LocalFunctions").glob("*.mat"))
    print(f"{user_dir.name}: GlobalFeatures={global_count} files, LocalFunctions={local_count} files")

✅ Found 400 user folders under c:\Users\mattt\Skripsie\Projects\DTW-project\data\processed. Skipping import.
u1001: GlobalFeatures=28 files, LocalFunctions=28 files
u1002: GlobalFeatures=28 files, LocalFunctions=28 files
u1003: GlobalFeatures=28 files, LocalFunctions=28 files


## 2) Build the catalog index

In [3]:
from src.io.load_biosecurid import build_catalog
import pandas as pd

CATALOG_PATH = project_root.joinpath("data", "biosecurid_catalog.parquet")

# Load the catalog from disk when a non-empty one exists; otherwise (file
# missing, or present but empty) rebuild it with build_catalog().
# NOTE(review): build_catalog presumably also writes CATALOG_PATH — confirm.
if not CATALOG_PATH.exists():
    print("⏳ Building new catalog…")
    df_catalog = build_catalog(PROC_ROOT, CATALOG_PATH)
else:
    df_catalog = pd.read_parquet(CATALOG_PATH)
    if not df_catalog.empty:
        print("✅ Loaded existing catalog.")
    else:
        print("⚠️ Catalog is empty — rebuilding…")
        df_catalog = build_catalog(PROC_ROOT, CATALOG_PATH)

# Now inspect: show the first rows, then the column/dtype summary.
print("\n--- Catalog preview ---")
display(df_catalog.head())
print("\n--- Catalog info ---")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line after the summary.
df_catalog.info()

⏳ Building new catalog…

--- Catalog preview ---


Unnamed: 0,file_path,user_id,session_id,sample_id,feature,label
0,u1001\GlobalFeatures\u1001s0001_sg0001.mat,1001,1,1,globalfeatures,genuine
1,u1001\GlobalFeatures\u1001s0001_sg0002.mat,1001,1,2,globalfeatures,genuine
2,u1001\GlobalFeatures\u1001s0001_sg0003.mat,1001,1,3,globalfeatures,forgery
3,u1001\GlobalFeatures\u1001s0001_sg0004.mat,1001,1,4,globalfeatures,forgery
4,u1001\GlobalFeatures\u1001s0001_sg0005.mat,1001,1,5,globalfeatures,forgery



--- Catalog info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22400 entries, 0 to 22399
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_path   22400 non-null  object
 1   user_id     22400 non-null  int64 
 2   session_id  22400 non-null  int64 
 3   sample_id   22400 non-null  int64 
 4   feature     22400 non-null  object
 5   label       22400 non-null  object
dtypes: int64(3), object(3)
memory usage: 1.0+ MB
None
