In [None]:
# OpenLHC-Anomaly Toolkit — Quickstart (CMS minimal)

This notebook runs an end-to-end anomaly-detection baseline (autoencoder or PCA) on a toy-sized dataset.

- By default, it uses a synthetic dataset so it works out-of-the-box.
- To use real data, either convert a ROOT file to Parquet or point `configs/cms_config.yaml` at a local or remote ROOT/Parquet file.


In [None]:
# Locate the repository root and put it on sys.path (nothing is installed here)
import sys, os, subprocess


def find_repo_root(start, markers=('backend', 'scripts')):
    """Return the nearest directory at or above `start` that contains every marker.

    Later cells import `scripts.prepare_data` and `backend.*`, so the repository
    root is taken to be the closest ancestor holding both of those directories.
    This makes the notebook work whether the kernel was started in the repo
    root, in a notebooks folder, or deeper.

    Args:
        start: Directory to begin the upward search from.
        markers: Directory names that must all exist inside the repo root.

    Returns:
        Absolute path of the detected root. If no ancestor matches, falls back
        to the parent of `start`, which is what this cell always used before.
    """
    start = os.path.abspath(start)
    current = start
    while True:
        if all(os.path.isdir(os.path.join(current, name)) for name in markers):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root without a match
            break
        current = parent
    return os.path.abspath(os.path.join(start, '..'))


REPO_ROOT = find_repo_root(os.getcwd())
if REPO_ROOT not in sys.path:
    # Prepend so the repo's own packages win over any same-named installed ones.
    sys.path.insert(0, REPO_ROOT)

print('Repo root:', REPO_ROOT)


In [None]:
# Write default YAML configs for the synthetic dataset.
# Files that already exist are left untouched, so local edits survive a re-run.
from pathlib import Path
import yaml

repo = Path(REPO_ROOT)
configs = repo / 'configs'
configs.mkdir(parents=True, exist_ok=True)

# Dataset defaults: no ROOT inputs, five features f1..f5, a 70/15/15 split,
# and a Parquet cache path under data/processed/.
cms_defaults = {
    'dataset_name': 'cms_open_data_minimal',
    'root_files': [],
    'features': ['f1', 'f2', 'f3', 'f4', 'f5'],
    'split': {'train': 0.7, 'val': 0.15, 'test': 0.15},
    'cache_parquet': str(repo / 'data/processed/synth.parquet'),
    'max_events': 10000,
}

# Model defaults: a small autoencoder run for 3 epochs on CPU with seed 42.
ae_defaults = {
    'hidden_dims': [64, 32, 16, 32, 64],
    'activation': 'relu',
    'dropout': 0.0,
    'lr': 1e-3,
    'batch_size': 256,
    'epochs': 3,
    'device': 'cpu',
}
model_defaults = {
    'model': 'autoencoder',
    'seed': 42,
    'ae': ae_defaults,
}

cms_cfg = configs / 'cms_config.yaml'
if not cms_cfg.exists():
    cms_cfg.write_text(yaml.safe_dump(cms_defaults))

model_cfg = configs / 'model_config.yaml'
if not model_cfg.exists():
    model_cfg.write_text(yaml.safe_dump(model_defaults))

print('Configs ready:', cms_cfg, model_cfg)


In [None]:
# Generate the synthetic Parquet file if it is missing
from scripts.prepare_data import generate_synthetic_parquet

parquet_path = Path(REPO_ROOT) / 'data/processed/synth.parquet'
if not parquet_path.exists():
    # On a fresh checkout data/processed/ may not exist yet. Creating it here is
    # idempotent and harmless if the generator also creates it.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    generate_synthetic_parquet(parquet_path)
parquet_path


In [None]:
# Train the baseline using the two config files and store the run under results/runs/
from backend.train import run_training

data_cfg_path = Path(REPO_ROOT) / 'configs/cms_config.yaml'
model_cfg_path = Path(REPO_ROOT) / 'configs/model_config.yaml'
train_run_dir = Path(REPO_ROOT) / 'results/runs/quickstart_ae'

# Positional order: dataset config, model config, run directory.
run_info = run_training(data_cfg_path, model_cfg_path, train_run_dir)
run_info


In [None]:
# Evaluate the trained run; plot and leaderboard paths are passed alongside the run directory
from backend.evaluate import run_evaluation

eval_run_dir = Path(REPO_ROOT) / 'results/runs/quickstart_ae'
eval_plots_dir = Path(REPO_ROOT) / 'results/plots/quickstart_ae'
leaderboard_path = Path(REPO_ROOT) / 'results/leaderboard.json'

# Positional order: run directory, plots directory, leaderboard file.
metrics = run_evaluation(eval_run_dir, eval_plots_dir, leaderboard_path)
metrics
