# 04. 100k 전체 파이프라인 (GPU 가속)

이 노트북은 100k 합성 데이터셋에 대한 전체 ML 파이프라인을 실행합니다.

**전제**:
- `data/synthetic/ml_dataset/`에 100k 데이터와 `manifest.json`이 생성되어 있음
- GPU 환경이 구성되어 있음 (`00_setup_gpu_env.ipynb` 검증 완료)

In [None]:
import sys
from pathlib import Path

# Resolve the repository root relative to the current working directory and
# put the in-repo "src" directory first on the import path.
repo = Path("../").resolve()
src_dir = str(repo / "src")
if src_dir in sys.path:
    # Re-running this cell must not stack duplicate entries; drop the old one
    # so the re-insert below still leaves "src" at the front.
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

BASE = repo / "data" / "synthetic" / "ml_dataset"
manifest = BASE / "manifest.json"
print(f"Manifest exists: {manifest.exists()}")
if manifest.exists():
    import json

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    m = json.loads(manifest.read_text(encoding="utf-8"))
    print(f"Total count: {m.get('total_count', 'N/A')}")
    print(f"Splits: {m.get('splits', {})}")

## 1. 특징 추출 (병렬)

In [None]:
from motionanalyzer.auto_optimize import (
    FeatureExtractionConfig,
    prepare_training_data,
    normalize_features,
)
import json

# Load the dataset manifest. Decoding explicitly as UTF-8 keeps parsing
# independent of the platform's default locale encoding.
manifest = json.loads((BASE / "manifest.json").read_text(encoding="utf-8"))
entries = manifest["entries"]


def _select_paths(goal, split):
    """Return BASE-joined paths of manifest entries whose goal and split match."""
    return [BASE / e["path"] for e in entries if e["goal"] == goal and e["split"] == split]


# "normal" entries feed the normal side and "goal1" entries the crack side of
# each split.
normal_train = _select_paths("normal", "train")
normal_val = _select_paths("normal", "val")
normal_test = _select_paths("normal", "test")
crack_train = _select_paths("goal1", "train")
crack_val = _select_paths("goal1", "val")
crack_test = _select_paths("goal1", "test")

# Feature groups requested from the extractor; per-point and crack-risk
# features are switched off.
cfg = FeatureExtractionConfig(
    include_per_frame=True,
    include_per_point=False,
    include_global_stats=True,
    include_crack_risk_features=False,
    include_advanced_stats=True,
    include_frequency_domain=True,
)

# Each call returns a (features, labels) pair for one split.
print("Extracting train features...")
feat_train, lab_train = prepare_training_data(normal_train, crack_train, feature_config=cfg)
print(f"Train: {len(feat_train)} samples")

print("Extracting val features...")
feat_val, lab_val = prepare_training_data(normal_val, crack_val, feature_config=cfg)
print(f"Val: {len(feat_val)} samples")

print("Extracting test features...")
feat_test, lab_test = prepare_training_data(normal_test, crack_test, feature_config=cfg)
print(f"Test: {len(feat_test)} samples")

## 2. 정규화 및 DREAM/PatchCore 학습

In [None]:
import numpy as np
import pandas as pd

# Columns kept out of normalization and out of the model input matrix.
exclude = ["label", "dataset_path", "frame", "index", "x", "y"]

# Compute the numeric-column index once instead of re-running select_dtypes
# for every candidate column inside the comprehension.
numeric_cols = feat_train.select_dtypes(include=["number"]).columns
feature_cols = [
    c
    for c in feat_train.columns
    if c not in exclude and "crack_risk" not in c.lower() and c in numeric_cols
]

# Rows of the training split whose label is 0 are passed as the reference
# frame (fit_df) for all three splits, so it is materialized only once.
# NOTE(review): assumes normalize_features does not modify fit_df in place - confirm.
normal_mask = np.asarray(lab_train, dtype=int) == 0
normal_ref = feat_train.loc[normal_mask]
norm_train = normalize_features(feat_train, exclude_cols=exclude, fit_df=normal_ref)
norm_val = normalize_features(feat_val, exclude_cols=exclude, fit_df=normal_ref)
norm_test = normalize_features(feat_test, exclude_cols=exclude, fit_df=normal_ref)


def _to_matrix(frame):
    """Select feature_cols, replace missing values with 0, and return a float32 array."""
    return frame[feature_cols].fillna(0).to_numpy(dtype=np.float32)


X_train = _to_matrix(norm_train)
y_train = np.asarray(lab_train, dtype=int)
X_val = _to_matrix(norm_val)
y_val = np.asarray(lab_val, dtype=int)
X_test = _to_matrix(norm_test)
y_test = np.asarray(lab_test, dtype=int)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

In [None]:
from motionanalyzer.gui.runners import _run_dream, _run_patchcore

def log(msg):
    """Write msg to standard output; used as the log callback for the runners."""
    print(msg)
def progress():
    """No-op progress callback handed to the runners."""
    return None

# Run the DREAM runner on the normalized training matrix.
print("Training DREAM (GPU if available)...")
dream_kwargs = {"epochs": 15, "batch_size": 128, "weight_decay": 1e-5}
dream_frame = pd.DataFrame(X_train, columns=feature_cols)
res_dream = _run_dream(dream_frame, y_train, log=log, progress=progress, **dream_kwargs)
print(f"DREAM success: {res_dream.get('success')}")

# Run the PatchCore runner on its own DataFrame built from the same matrix.
print("Training PatchCore...")
patch_frame = pd.DataFrame(X_train, columns=feature_cols)
res_patch = _run_patchcore(patch_frame, y_train, log=log, progress=progress)
print(f"PatchCore success: {res_patch.get('success')}")

## 3. 검증 및 결과

In [None]:
# precision_recall_curve is imported up front: both evaluation branches use
# it, so it must not depend on the DREAM branch having executed.
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score


def _evaluate_scores(y_val_true, val_scores, y_test_true, test_scores):
    """Pick the F1-maximizing threshold on validation scores and score the test split.

    Returns a dict holding the test confusion matrix at that threshold ("cm")
    and the test ROC AUC computed from the raw scores ("roc").
    """
    prec, rec, thresh = precision_recall_curve(y_val_true, val_scores)
    # The final precision/recall point has no matching threshold, so drop it;
    # the epsilon avoids 0/0 where precision and recall are both zero.
    f1 = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
    thresh_best = float(thresh[np.argmax(f1)])
    pred = (test_scores >= thresh_best).astype(int)
    return {
        "cm": confusion_matrix(y_test_true, pred),
        "roc": roc_auc_score(y_test_true, test_scores),
    }


results = {}

if res_dream.get("success"):
    # Imported lazily so the model module is only needed when training succeeded.
    from motionanalyzer.ml_models.dream import DREAMPyTorch

    model = DREAMPyTorch(input_dim=len(feature_cols))
    model.load(res_dream["model_path"])
    scores_val = model.predict(X_val)
    scores_test = model.predict(X_test)
    results["DREAM"] = _evaluate_scores(y_val, scores_val, y_test, scores_test)
    print("DREAM:", results["DREAM"]["cm"], "ROC:", results["DREAM"]["roc"])

if res_patch.get("success"):
    from motionanalyzer.ml_models.patchcore import PatchCoreScikitLearn

    model = PatchCoreScikitLearn(feature_dim=len(feature_cols))
    model.load(res_patch["model_path"])
    scores_val = model.predict(pd.DataFrame(X_val, columns=feature_cols))
    scores_test = model.predict(pd.DataFrame(X_test, columns=feature_cols))
    results["PatchCore"] = _evaluate_scores(y_val, scores_val, y_test, scores_test)
    print("PatchCore:", results["PatchCore"]["cm"], "ROC:", results["PatchCore"]["roc"])