From 7eedabd03d3ac74958afdd364cf865bd580e0983 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:16:14 +0000 Subject: [PATCH 1/9] docs(base.config): explain errorStrategy exit codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add inline comment documenting the semantics of each exit-code range in the errorStrategy retry list. Notably, 2147483647 is Integer.MAX_VALUE — Nextflow's sentinel for tasks that died before writing .exitcode (e.g. AWS Batch spot reclamation, kubernetes preemption, grid-scheduler cancellations) — not an AWS-specific exit code. Cite Nextflow docs/aws.md for the AWS case. Addresses PR #139 review comment r3165322845. --- conf/base.config | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/conf/base.config b/conf/base.config index e1668aa..45b6260 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,6 +16,15 @@ process { // resourceLimits = [ cpus: 192, memory: 750.GB, time: 72.h ] + // Retry signal-induced exits and "killed without exit code" cases: + // 130..145 = signal exits (SIGINT=130, SIGKILL=137, SIGTERM=143, etc.) + // 104 = ECONNRESET (transient network failures during stage-in/out) + // 2147483647 = Integer.MAX_VALUE, Nextflow's sentinel for tasks that died + // before writing .exitcode (Nextflow surfaces this as + // "terminated for an unknown reason -- Likely it has been + // terminated by the external system"). Common on AWS Batch + // spot capacity, kubernetes preemption, and grid-scheduler + // cancellations. See nextflow docs/aws.md for the AWS case. errorStrategy = { task.exitStatus in ((130..145) + 104 + 2147483647) ? 'retry' : 'finish' } maxRetries = 3 maxErrors = '-1' From 1585a9dfa3b800c83c0becc22691a2ddae706286 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:16:38 +0000 Subject: [PATCH 2/9] style(modules.config): replace findAll().join(' ') with join(' ').trim() for STARDIST ext.args Replaces the Groovy-fallthrough `findAll()` (no closure) with the documented stdlib `join` + `trim` chain. Internal double-spaces from empty-string entries get collapsed by bash word-splitting at `${args}` interpolation; `.trim()` removes leading/trailing whitespace. Matches the existing project idiom used in subworkflows/local/utils_nfcore_spatialxe_pipeline/main.nf:341,352 (toolCitationText / toolBibliographyText). Addresses PR #139 review comment r3165345526. --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 2272691..b5066cf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -307,7 +307,7 @@ process { params.stardist_prob_thresh != null ? "--prob_thresh ${params.stardist_prob_thresh}" : "", params.stardist_nms_thresh != null ? "--nms_thresh ${params.stardist_nms_thresh}" : "", params.stardist_n_tiles != null ? "--n_tiles ${params.stardist_n_tiles}" : "", - ].findAll().join(' ')} + ].join(' ').trim()} } withName: 'STARDIST_NUCLEI' { From 74aa9171df54256b9a7eddd9de1efa0c89d117c5 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:16:50 +0000 Subject: [PATCH 3/9] chore: remove orphan modules/local/utility/Dockerfile This Dockerfile was unreferenced by any module: a repo-wide grep for "modules/local/utility/Dockerfile" returns zero hits. Each utility submodule (convert_mask_uint32, extract_dapi, resize_tif, segger2xr, etc.) declares its own container directly via `container "..."`. The associated `conda.yml` it expected (per the COPY instruction) does not exist next to it. Leftover from an earlier development phase; no longer needed. Addresses PR #139 review comment r3165438419. --- modules/local/utility/Dockerfile | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 modules/local/utility/Dockerfile diff --git a/modules/local/utility/Dockerfile b/modules/local/utility/Dockerfile deleted file mode 100644 index 79213eb..0000000 --- a/modules/local/utility/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM mambaorg/micromamba:1.5.10-noble -COPY --chown=$MAMBA_USER:$MAMBA_USER conda.yml /tmp/conda.yml -RUN micromamba install -y -n base -f /tmp/conda.yml \ - && micromamba install -y -n base conda-forge::procps-ng \ - && micromamba env export --name base --explicit > environment.lock \ - && echo ">> CONDA_LOCK_START" \ - && cat environment.lock \ - && echo "<< CONDA_LOCK_END" \ - && micromamba clean -a -y -USER root -ENV PATH="$MAMBA_ROOT_PREFIX/bin:$PATH" From d59fac34b1290b7474b21ae25abb2695e343a1ae Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:17:03 +0000 Subject: [PATCH 4/9] feat(schema): add format validators (directory-path, file-path) to schema_input.json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add nf-schema `format` keys so the validator confirms paths exist: - bundle: format: directory-path - image: format: file-path These validate path existence at runtime, not name conventions — so the team's concern about varying 10x bundle/image naming doesn't conflict. The format key is documented at https://nextflow-io.github.io/nf-schema/latest/nextflow_schema/nextflow_schema_specification/ Addresses PR #139 review comments r3165291219 (C3) and r3165292878 (C4). --- assets/schema_input.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/assets/schema_input.json b/assets/schema_input.json index 2dfb357..5561c0f 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -15,11 +15,13 @@ }, "bundle": { "type": "string", + "format": "directory-path", "pattern": "^\\S+$", "errorMessage": "Please provide a bundle as input data" }, "image": { "type": "string", + "format": "file-path", "pattern": "^\\S+$", "errorMessage": "You can provide an image. If you do not then please leave the field empty." } From cf0d42da9eb7fb5fd27491f34b8b75a6f2a9b80f Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:17:14 +0000 Subject: [PATCH 5/9] fix(xenium_patch/stitch): thread baysor_tiling_min_transcripts_per_cell via ext.args Removes the inline `params.baysor_tiling_min_transcripts_per_cell` reference from the stitch process script. Instead: - The XENIUM_PATCH_STITCH withName: block in conf/modules.config sets the flag via ext.args - The .nf script declares `def args = task.ext.args ?: ''` and interpolates `${args}` into the command Per Florian's narrowing directive, only the cited param at line 40 is fixed. Other params references in the file (if any) remain as-is for follow-up work. Addresses PR #139 review comment r3165442902. --- conf/modules.config | 1 + modules/local/xenium_patch/stitch/main.nf | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index b5066cf..bb5a878 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -157,6 +157,7 @@ process { ext.filter_method = params.patch_filter_method ?: null ext.iqr_multiplier = params.patch_filter_iqr_multiplier ext.z_threshold = params.patch_filter_z_threshold + ext.args = { "--min-transcripts-per-cell ${params.baysor_tiling_min_transcripts_per_cell}" } publishDir = [ path: { "${params.outdir}/${meta.id}/xenium_patch" }, mode: params.publish_dir_mode, diff --git a/modules/local/xenium_patch/stitch/main.nf b/modules/local/xenium_patch/stitch/main.nf index 3d52397..5f050e2 100644 --- a/modules/local/xenium_patch/stitch/main.nf +++ b/modules/local/xenium_patch/stitch/main.nf @@ -33,11 +33,12 @@ process XENIUM_PATCH_STITCH { task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: '' """ stitch_transcripts.py \\ --patches ${patches} \\ --output output \\ - --min-transcripts-per-cell ${params.baysor_tiling_min_transcripts_per_cell} + ${args} # Post-process: ensure all GeoJSON geometries are Polygon. # make_valid() and solve_conflicts() can produce MultiPolygon, From 1db7443d7c9db983f3e6e070f193ac1a50d95f9f Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:17:31 +0000 Subject: [PATCH 6/9] refactor(segger): extract create_dataset orchestration to module binary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the three inline `python3 - <<'PYEOF'` heredoc blocks (parquet column statistics, tile-split workaround, NaN bd.x fix) plus the bundle symlink shell loop with a single module binary at resources/usr/bin/run_create_dataset.py. The .nf script block shrinks from ~190 lines to ~15. The container (quay.io/dongzehe/segger:1.0.14) stays general — pipeline-specific workarounds for upstream segger bugs now live in the module binary where they can be reviewed, tested, and removed when fixed upstream. Each WORKAROUND function in the binary has a section comment naming the upstream bug and removal condition. Pattern matches existing baysor reference modules (modules/local/baysor/{create_dataset,preprocess}/) which use resources/usr/bin/ scripts called without ${moduleDir}/ prefix. Addresses PR #139 review comment r3165415041 (segger create_dataset only; predict refactor in next commit; train kept as-is per single-line debug P2 carve-out). --- modules/local/segger/create_dataset/main.nf | 194 +------------- .../resources/usr/bin/run_create_dataset.py | 253 ++++++++++++++++++ 2 files changed, 264 insertions(+), 183 deletions(-) create mode 100755 modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py diff --git a/modules/local/segger/create_dataset/main.nf b/modules/local/segger/create_dataset/main.nf index 06848c4..520ef34 100644 --- a/modules/local/segger/create_dataset/main.nf +++ b/modules/local/segger/create_dataset/main.nf @@ -22,7 +22,6 @@ process SEGGER_CREATE_DATASET { } def args = task.ext.args ?: '' - def script_path = "/workspace/segger_dev/src/segger/cli/create_dataset_fast.py" prefix = task.ext.prefix ?: "${meta.id}" // check for platform values @@ -31,193 +30,22 @@ process SEGGER_CREATE_DATASET { } """ - # Set numba cache directory to avoid caching issues in container export NUMBA_CACHE_DIR=\$PWD/.numba_cache mkdir -p \$NUMBA_CACHE_DIR - # Create local bundle directory with symlinks to all original files - # This is necessary because input files from S3/Fusion are read-only - # Use absolute paths to avoid broken relative symlinks - mkdir -p bundle_local - for item in ${base_dir}/*; do - # Resolve to absolute path (follow any symlinks) - abs_path=\$(readlink -f "\$item" 2>/dev/null || realpath "\$item" 2>/dev/null || echo "\$item") - basename=\$(basename "\$item") - ln -sf "\$abs_path" "bundle_local/\$basename" - done - - # Segger expects nucleus_boundaries.parquet but Xenium bundles have cell_boundaries.parquet - # Create the symlink if nucleus_boundaries doesn't exist but cell_boundaries does - if [ ! -e "bundle_local/nucleus_boundaries.parquet" ] && [ -e "bundle_local/cell_boundaries.parquet" ]; then - echo "Creating nucleus_boundaries.parquet symlink from cell_boundaries.parquet" - cell_bounds_path=\$(readlink -f "bundle_local/cell_boundaries.parquet" 2>/dev/null || realpath "bundle_local/cell_boundaries.parquet" 2>/dev/null) - ln -sf "\$cell_bounds_path" bundle_local/nucleus_boundaries.parquet - fi - - # List bundle contents for debugging - echo "Bundle contents:" - ls -la bundle_local/ - - # Fix: Add parquet column statistics for segger - echo "Adding statistics to parquet files..." - python3 - << 'PYEOF' -import pyarrow.parquet as pq -import os - -def add_stats(inp, out): - if not os.path.exists(inp): - print(f" Skip {inp}") - return - t = pq.read_table(inp) - pq.write_table(t, out, write_statistics=True, compression='snappy') - print(f" Done {os.path.basename(inp)} ({len(t)} rows)") - -os.makedirs('bundle_stats', exist_ok=True) -for f in ['transcripts.parquet', 'nucleus_boundaries.parquet']: - add_stats(f'bundle_local/{f}', f'bundle_stats/{f}') - -for item in os.listdir('bundle_local'): - s, d = f'bundle_local/{item}', f'bundle_stats/{item}' - if not os.path.exists(d): - os.symlink(os.path.realpath(s), d) -print("Done") - -# Debug: Check overlaps_nucleus column data -print("") -print("=== Debugging overlaps_nucleus data ===") -import pyarrow.compute as pc - -tx = pq.read_table('bundle_stats/transcripts.parquet') -bd = pq.read_table('bundle_stats/nucleus_boundaries.parquet') - -if 'overlaps_nucleus' in tx.column_names: - col = tx.column('overlaps_nucleus') - print(f"overlaps_nucleus dtype: {col.type}") - unique_vals = pc.unique(col) - print(f"overlaps_nucleus unique values: {unique_vals.to_pylist()[:10]}") - val_counts = pc.value_counts(col) - print(f"overlaps_nucleus value_counts: {val_counts.to_pylist()}") -else: - print("WARNING: overlaps_nucleus column NOT FOUND in transcripts.parquet") - -# Check cell_id overlap between transcripts and boundaries -if 'cell_id' in tx.column_names and 'cell_id' in bd.column_names: - tx_cells = set(pc.unique(tx.column('cell_id')).to_pylist()) - bd_cells = set(pc.unique(bd.column('cell_id')).to_pylist()) - overlap = tx_cells & bd_cells - print("") - print(f"Transcripts unique cell_ids: {len(tx_cells)}") - print(f"Boundaries unique cell_ids: {len(bd_cells)}") - print(f"Overlapping cell_ids: {len(overlap)}") - -print("=== End Debug ===") -PYEOF - ls -la bundle_stats/ - - python3 ${script_path} \\ - --base_dir bundle_stats \\ - --data_dir ${prefix} \\ - --sample_type ${params.format} \\ - --tile_width ${params.tile_width} \\ - --tile_height ${params.tile_height} \\ - --n_workers ${task.cpus} \\ + run_create_dataset.py \\ + --bundle-dir ${base_dir} \\ + --output-dir ${prefix} \\ + --sample-type ${params.format} \\ + --tile-width ${params.tile_width} \\ + --tile-height ${params.tile_height} \\ + --n-workers ${task.cpus} \\ ${args} - # Verify tiles were created and show distribution - echo "Dataset split (before fix):" - echo " train_tiles: \$(ls ${prefix}/train_tiles/processed/ 2>/dev/null | wc -l) files" - echo " val_tiles: \$(ls ${prefix}/val_tiles/processed/ 2>/dev/null | wc -l) files" - echo " test_tiles: \$(ls ${prefix}/test_tiles/processed/ 2>/dev/null | wc -l) files" - - # Workaround: segger commit 0787167 has a bug where all tiles go to test_tiles - # regardless of test_prob/val_prob settings. Move ONLY trainable tiles (those with - # edge_label_index) from test_tiles to train_tiles. - # Tiles without tx-belongs-bd edges don't have edge_label_index and cannot be used for training. - train_count=\$(ls ${prefix}/train_tiles/processed/ 2>/dev/null | wc -l) - test_count=\$(ls ${prefix}/test_tiles/processed/ 2>/dev/null | wc -l) - - if [ "\$train_count" -eq 0 ] && [ "\$test_count" -gt 0 ]; then - echo "Applying workaround: filtering trainable tiles from test_tiles (segger split bug)" - export SEGGER_PREFIX="${prefix}" - python3 - << 'PYEOF' -import torch -import os -import shutil - -prefix = os.environ['SEGGER_PREFIX'] -test_dir = f"{prefix}/test_tiles/processed" -train_dir = f"{prefix}/train_tiles/processed" - -moved = 0 -skipped = 0 - -for f in os.listdir(test_dir): - if not f.endswith('.pt'): - continue - fpath = os.path.join(test_dir, f) - try: - tile = torch.load(fpath, weights_only=False) - edge_store = tile['tx', 'belongs', 'bd'] - # Check if edge_label_index exists and has data - if hasattr(edge_store, 'edge_label_index') and edge_store.edge_label_index.numel() > 0: - shutil.move(fpath, os.path.join(train_dir, f)) - moved += 1 - else: - skipped += 1 - except Exception as e: - print(f"Warning: Could not process {f}: {e}") - skipped += 1 - -print(f"Moved {moved} trainable tiles to train_tiles") -print(f"Skipped {skipped} test-only tiles (no edge_label_index)") -PYEOF - fi - - echo "Dataset split (after fix):" - echo " train_tiles: \$(ls ${prefix}/train_tiles/processed/ 2>/dev/null | wc -l) files" - echo " val_tiles: \$(ls ${prefix}/val_tiles/processed/ 2>/dev/null | wc -l) files" - echo " test_tiles: \$(ls ${prefix}/test_tiles/processed/ 2>/dev/null | wc -l) files" - - train_tiles_dir="${prefix}/train_tiles/processed" - if [ ! -d "\$train_tiles_dir" ] || [ -z "\$(ls -A \$train_tiles_dir 2>/dev/null)" ]; then - echo "ERROR: No trainable tiles were created in \$train_tiles_dir" - echo "This usually means no transcripts overlap with nucleus boundaries in the dataset." - echo "Check if the Xenium bundle contains valid overlaps_nucleus data in transcripts.parquet." - exit 1 - fi - echo "Successfully created \$(ls \$train_tiles_dir | wc -l) trainable tiles" - - # Workaround: Segger's get_polygon_props() produces NaN boundary features (bd.x) - # when polygon geometries have zero area or index misalignment during GeoDataFrame - # construction. Replace NaN bd.x with zeros so BCEWithLogitsLoss doesn't propagate NaN. - export SEGGER_PREFIX="${prefix}" - python3 - << 'PYEOF' -import torch -import os - -prefix = os.environ['SEGGER_PREFIX'] -fixed = 0 -total = 0 - -for split in ['train_tiles', 'test_tiles', 'val_tiles']: - tile_dir = f"{prefix}/{split}/processed" - if not os.path.isdir(tile_dir): - continue - for f in os.listdir(tile_dir): - if not f.endswith('.pt'): - continue - total += 1 - fpath = os.path.join(tile_dir, f) - tile = torch.load(fpath, weights_only=False) - bd_x = tile['bd'].x - if bd_x.isnan().any(): - tile['bd'].x = torch.nan_to_num(bd_x, nan=0.0) - torch.save(tile, fpath) - fixed += 1 - -print(f"Fixed NaN bd.x in {fixed}/{total} tiles") -PYEOF - + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segger: 0.1.0 + END_VERSIONS """ stub: diff --git a/modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py b/modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py new file mode 100755 index 0000000..c73ab00 --- /dev/null +++ b/modules/local/segger/create_dataset/resources/usr/bin/run_create_dataset.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Run segger create_dataset with spatialxe-specific preprocessing and workarounds. + +Wraps segger's create_dataset_fast.py with: + - bundle_local symlink prep (handles read-only S3/Fusion mounts) + - parquet column statistics (segger needs these) + - WORKAROUND: filter trainable tiles from test_tiles when segger commit 0787167 mis-splits + - WORKAROUND: replace NaN bd.x with zeros after get_polygon_props produces NaN + +Each WORKAROUND should be removable when the upstream segger bug is fixed. +""" + +import argparse +import os +import shutil +import subprocess +import sys +from pathlib import Path + +# imports for actual work (used in functions below) +import pyarrow.parquet as pq +import pyarrow.compute as pc +import torch + + +SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/create_dataset_fast.py" + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--bundle-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--sample-type", required=True, choices=["xenium"]) + p.add_argument("--tile-width", type=int, required=True) + p.add_argument("--tile-height", type=int, required=True) + p.add_argument("--n-workers", type=int, required=True) + # remaining args forwarded to segger CLI + args, extra = p.parse_known_args() + return args, extra + + +def prepare_bundle(bundle_dir): + """Create local bundle dir with absolute symlinks (S3/Fusion read-only-safe).""" + Path("bundle_local").mkdir(exist_ok=True) + for item in Path(bundle_dir).iterdir(): + try: + abs_path = item.resolve() + except Exception: + abs_path = item + target = Path("bundle_local") / item.name + if target.exists() or target.is_symlink(): + target.unlink() + target.symlink_to(abs_path) + + # Segger expects nucleus_boundaries.parquet but Xenium bundles have cell_boundaries.parquet + nb = Path("bundle_local/nucleus_boundaries.parquet") + cb = Path("bundle_local/cell_boundaries.parquet") + if not nb.exists() and cb.exists(): + print( + "Creating nucleus_boundaries.parquet symlink from cell_boundaries.parquet" + ) + nb.symlink_to(cb.resolve()) + + print("Bundle contents:") + for item in sorted(Path("bundle_local").iterdir()): + print(f" {item.name}") + + +def add_parquet_stats(): + """Rewrite key parquet files with column statistics (segger requires them).""" + Path("bundle_stats").mkdir(exist_ok=True) + for fname in ["transcripts.parquet", "nucleus_boundaries.parquet"]: + src = Path("bundle_local") / fname + dst = Path("bundle_stats") / fname + if not src.exists(): + print(f" Skip {src}") + continue + t = pq.read_table(str(src)) + pq.write_table(t, str(dst), write_statistics=True, compression="snappy") + print(f" Done {fname} ({len(t)} rows)") + + # Symlink everything else from bundle_local into bundle_stats + for item in Path("bundle_local").iterdir(): + dst = Path("bundle_stats") / item.name + if not dst.exists(): + dst.symlink_to(item.resolve()) + + # Debug: check overlaps_nucleus column in transcripts + print("\n=== Debugging overlaps_nucleus data ===") + tx = pq.read_table("bundle_stats/transcripts.parquet") + bd = pq.read_table("bundle_stats/nucleus_boundaries.parquet") + if "overlaps_nucleus" in tx.column_names: + col = tx.column("overlaps_nucleus") + print(f"overlaps_nucleus dtype: {col.type}") + unique_vals = pc.unique(col) + print(f"overlaps_nucleus unique values: {unique_vals.to_pylist()[:10]}") + val_counts = pc.value_counts(col) + print(f"overlaps_nucleus value_counts: {val_counts.to_pylist()}") + else: + print("WARNING: overlaps_nucleus column NOT FOUND in transcripts.parquet") + + if "cell_id" in tx.column_names and "cell_id" in bd.column_names: + tx_cells = set(pc.unique(tx.column("cell_id")).to_pylist()) + bd_cells = set(pc.unique(bd.column("cell_id")).to_pylist()) + overlap = tx_cells & bd_cells + print(f"Transcripts unique cell_ids: {len(tx_cells)}") + print(f"Boundaries unique cell_ids: {len(bd_cells)}") + print(f"Overlapping cell_ids: {len(overlap)}") + print("=== End Debug ===\n") + + +def run_segger_cli(args, extra): + cmd = [ + "python3", + SEGGER_CLI, + "--base_dir", + "bundle_stats", + "--data_dir", + args.output_dir, + "--sample_type", + args.sample_type, + "--tile_width", + str(args.tile_width), + "--tile_height", + str(args.tile_height), + "--n_workers", + str(args.n_workers), + *extra, + ] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd) + if result.returncode != 0: + sys.exit(result.returncode) + + +def filter_trainable_tiles_if_needed(prefix): + """ + WORKAROUND: segger commit 0787167 has a bug where all tiles end up in test_tiles + regardless of test_prob/val_prob settings. Move ONLY trainable tiles (those with + edge_label_index) from test_tiles to train_tiles. + + Remove this function once segger >= 0.1.x is bumped with the upstream fix. + """ + train_dir = Path(prefix) / "train_tiles" / "processed" + test_dir = Path(prefix) / "test_tiles" / "processed" + val_dir = Path(prefix) / "val_tiles" / "processed" + + train_count = len(list(train_dir.iterdir())) if train_dir.exists() else 0 + test_count = len(list(test_dir.iterdir())) if test_dir.exists() else 0 + val_count = len(list(val_dir.iterdir())) if val_dir.exists() else 0 + print( + f"Dataset split (before fix): train={train_count} val={val_count} test={test_count}" + ) + + if train_count == 0 and test_count > 0: + print( + "Applying workaround: filtering trainable tiles from test_tiles (segger split bug)" + ) + moved = 0 + skipped = 0 + for tile_path in list(test_dir.iterdir()): + if not tile_path.name.endswith(".pt"): + continue + try: + tile = torch.load(str(tile_path), weights_only=False) + edge_store = tile["tx", "belongs", "bd"] + if ( + hasattr(edge_store, "edge_label_index") + and edge_store.edge_label_index.numel() > 0 + ): + shutil.move(str(tile_path), str(train_dir / tile_path.name)) + moved += 1 + else: + skipped += 1 + except Exception as e: + print(f"Warning: Could not process {tile_path.name}: {e}") + skipped += 1 + print(f"Moved {moved} trainable tiles to train_tiles") + print(f"Skipped {skipped} test-only tiles (no edge_label_index)") + + train_count = len(list(train_dir.iterdir())) if train_dir.exists() else 0 + test_count = len(list(test_dir.iterdir())) if test_dir.exists() else 0 + val_count = len(list(val_dir.iterdir())) if val_dir.exists() else 0 + print( + f"Dataset split (after fix): train={train_count} val={val_count} test={test_count}" + ) + + if train_count == 0: + print(f"ERROR: No trainable tiles were created in {train_dir}", file=sys.stderr) + print( + "This usually means no transcripts overlap with nucleus boundaries in the dataset.", + file=sys.stderr, + ) + print( + "Check if the Xenium bundle contains valid overlaps_nucleus data in transcripts.parquet.", + file=sys.stderr, + ) + sys.exit(1) + print(f"Successfully created {train_count} trainable tiles") + + +def fix_bd_x_nan(prefix): + """ + WORKAROUND: segger's get_polygon_props() produces NaN boundary features (bd.x) + when polygon geometries have zero area or index misalignment during GeoDataFrame + construction. Replace NaN bd.x with zeros so BCEWithLogitsLoss doesn't propagate NaN. + + Remove this function once segger >= 0.1.x is bumped with the upstream fix. + """ + fixed = 0 + total = 0 + for split in ["train_tiles", "test_tiles", "val_tiles"]: + tile_dir = Path(prefix) / split / "processed" + if not tile_dir.is_dir(): + continue + for tile_path in tile_dir.iterdir(): + if not tile_path.name.endswith(".pt"): + continue + total += 1 + tile = torch.load(str(tile_path), weights_only=False) + bd_x = tile["bd"].x + if bd_x.isnan().any(): + tile["bd"].x = torch.nan_to_num(bd_x, nan=0.0) + torch.save(tile, str(tile_path)) + fixed += 1 + print(f"Fixed NaN bd.x in {fixed}/{total} tiles") + + +def main(): + args, extra = parse_args() + + # Ensure numba cache dir is writable (env var should be set by caller, but belt-and-suspenders) + os.environ.setdefault("NUMBA_CACHE_DIR", os.path.join(os.getcwd(), ".numba_cache")) + os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True) + + prepare_bundle(args.bundle_dir) + print("Adding statistics to parquet files...") + add_parquet_stats() + + # Sanity-check bundle_stats + print("bundle_stats contents:") + for item in sorted(Path("bundle_stats").iterdir()): + print(f" {item.name}") + + run_segger_cli(args, extra) + + filter_trainable_tiles_if_needed(args.output_dir) + fix_bd_x_nan(args.output_dir) + + +if __name__ == "__main__": + main() From 381c9e368b773b0bafe9d22b60ffd4efbb21b792 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 19:17:43 +0000 Subject: [PATCH 7/9] refactor(segger): extract predict orchestration to module binary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces inline GPU enumeration (`python3 -c` for torch.cuda.device_count), predict_parquet.py path resolver, and two `sed -i` runtime patches against installed segger source with a single module binary at resources/usr/bin/run_predict.py. The .nf script block shrinks from ~50 lines to ~15. The two sed patches (torch.no_grad() for VRAM savings, deterministic GPU seed) move into a named `patch_predict_parquet()` function with a clear "remove once upstreamed to segger" comment. Container (quay.io/dongzehe/segger:1.0.14) stays general — patches live in the pipeline. Addresses PR #139 review comment r3165415041 (segger predict). --- modules/local/segger/predict/main.nf | 58 ++------ .../predict/resources/usr/bin/run_predict.py | 137 ++++++++++++++++++ 2 files changed, 151 insertions(+), 44 deletions(-) create mode 100755 modules/local/segger/predict/resources/usr/bin/run_predict.py diff --git a/modules/local/segger/predict/main.nf b/modules/local/segger/predict/main.nf index a7aec75..3a8f58c 100644 --- a/modules/local/segger/predict/main.nf +++ b/modules/local/segger/predict/main.nf @@ -25,53 +25,23 @@ process SEGGER_PREDICT { } def args = task.ext.args ?: '' - def script_path = "/workspace/segger_dev/src/segger/cli/predict_fast.py" prefix = task.ext.prefix ?: "${meta.id}" """ - # Limit cupy GPU memory to 80% so PyTorch has headroom for graph attention ops - export CUPY_GPU_MEMORY_LIMIT="80%" - # Belt-and-suspenders: ensure PyTorch uses expandable segments (also set in env {} block) - export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,max_split_size_mb:512" - - # Set numba cache directory to avoid caching issues in container - export NUMBA_CACHE_DIR=\$PWD/.numba_cache - mkdir -p \$NUMBA_CACHE_DIR - - # GPU detection logging - echo "=== GPU Detection (SEGGER_PREDICT) ===" - nvidia-smi 2>/dev/null && echo "GPU available: yes" || echo "GPU available: no (nvidia-smi failed)" - python3 -c "import torch; print(f'PyTorch CUDA available: {torch.cuda.is_available()}'); print(f'CUDA device count: {torch.cuda.device_count()}')" 2>/dev/null || echo "PyTorch CUDA check failed" - echo "======================================" - - # Use all available GPUs (autocast reduces VRAM ~50%, so multi-GPU is safe) - GPU_IDS=\$(python3 -c " -import torch -n = torch.cuda.device_count() -print(','.join(str(i) for i in range(n)) if n > 0 else '0') -" 2>/dev/null || echo "0") - echo "Using GPUs: \$GPU_IDS" - - # Patch predict_parquet.py at runtime (avoids Docker rebuild) - PRED_PY=\$(python3 -c "import segger.prediction.predict_parquet as m; print(m.__file__)") - - # 1. Add torch.no_grad() to disable gradient graphs during inference (~30-50% VRAM savings) - sed -i 's/with cp.cuda.Device(gpu_id):/with cp.cuda.Device(gpu_id), torch.no_grad():/' "\$PRED_PY" - - # 2. Seed random for deterministic GPU assignment (avoids stochastic OOM) - sed -i 's/gpu_id = random.choice(gpu_ids)/random.seed(0); gpu_id = random.choice(gpu_ids)/' "\$PRED_PY" - echo "Patched \$PRED_PY: torch.no_grad() + round-robin GPU assignment" - - python3 ${script_path} \\ - --models_dir ${models_dir} \\ - --segger_data_dir ${segger_dataset} \\ - --transcripts_file ${transcripts} \\ - --benchmarks_dir benchmarks_dir \\ - --batch_size ${params.batch_size_predict} \\ - --use_cc ${params.cc_analysis} \\ - --knn_method ${params.segger_knn_method} \\ - --num_workers ${task.cpus} \\ - --gpu_ids \$GPU_IDS \\ + run_predict.py \\ + --models-dir ${models_dir} \\ + --segger-data-dir ${segger_dataset} \\ + --transcripts-file ${transcripts} \\ + --benchmarks-dir benchmarks_dir \\ + --batch-size ${params.batch_size_predict} \\ + --use-cc ${params.cc_analysis} \\ + --knn-method ${params.segger_knn_method} \\ + --num-workers ${task.cpus} \\ ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segger: 0.1.0 + END_VERSIONS """ stub: diff --git a/modules/local/segger/predict/resources/usr/bin/run_predict.py b/modules/local/segger/predict/resources/usr/bin/run_predict.py new file mode 100755 index 0000000..56a77ff --- /dev/null +++ b/modules/local/segger/predict/resources/usr/bin/run_predict.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Run segger predict with spatialxe-specific preprocessing. + +Wraps segger's predict_fast.py with: + - GPU enumeration (replaces inline python3 -c torch check) + - WORKAROUND: patch predict_parquet.py at runtime to add torch.no_grad() for ~30-50% VRAM savings + - WORKAROUND: seed random.choice for deterministic GPU assignment (avoids stochastic OOM) + +Both WORKAROUNDs should be removable once the patches are upstreamed to segger. +""" + +import argparse +import os +import subprocess +import sys + + +SEGGER_CLI = "/workspace/segger_dev/src/segger/cli/predict_fast.py" + + +def parse_args(): + p = argparse.ArgumentParser() + p.add_argument("--models-dir", required=True) + p.add_argument("--segger-data-dir", required=True) + p.add_argument("--transcripts-file", required=True) + p.add_argument("--benchmarks-dir", required=True) + p.add_argument("--batch-size", type=int, required=True) + p.add_argument("--use-cc", required=True) + p.add_argument("--knn-method", required=True) + p.add_argument("--num-workers", type=int, required=True) + args, extra = p.parse_known_args() + return args, extra + + +def detect_gpus(): + """Return comma-separated list of available CUDA device ids (or "0" if none).""" + import torch + + print("=== GPU Detection (SEGGER_PREDICT) ===") + print(f"PyTorch CUDA available: {torch.cuda.is_available()}") + n = torch.cuda.device_count() + print(f"CUDA device count: {n}") + print("======================================") + if n > 0: + return ",".join(str(i) for i in range(n)) + return "0" + + +def patch_predict_parquet(): + """ + WORKAROUND: patch segger.prediction.predict_parquet at runtime. + + Avoids rebuilding the segger Docker image. Two patches: + 1. Add torch.no_grad() to disable gradient graphs during inference (~30-50% VRAM savings). + 2. Seed random for deterministic GPU assignment (avoids stochastic OOM). + + Remove this function once the patches are upstreamed to segger. + """ + import segger.prediction.predict_parquet as m + + pred_py = m.__file__ + print(f"Patching {pred_py}: torch.no_grad() + round-robin GPU assignment") + # Use sed via subprocess for in-place edit (matches the original behavior exactly) + subprocess.run( + [ + "sed", + "-i", + "s/with cp.cuda.Device(gpu_id):/with cp.cuda.Device(gpu_id), torch.no_grad():/", + pred_py, + ], + check=True, + ) + subprocess.run( + [ + "sed", + "-i", + "s/gpu_id = random.choice(gpu_ids)/random.seed(0); gpu_id = random.choice(gpu_ids)/", + pred_py, + ], + check=True, + ) + + +def run_segger_cli(args, extra, gpu_ids): + cmd = [ + "python3", + SEGGER_CLI, + "--models_dir", + args.models_dir, + "--segger_data_dir", + args.segger_data_dir, + "--transcripts_file", + args.transcripts_file, + "--benchmarks_dir", + args.benchmarks_dir, + "--batch_size", + str(args.batch_size), + "--use_cc", + str(args.use_cc), + "--knn_method", + args.knn_method, + "--num_workers", + str(args.num_workers), + "--gpu_ids", + gpu_ids, + *extra, + ] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd) + if result.returncode != 0: + sys.exit(result.returncode) + + +def main(): + args, extra = parse_args() + + # Limit cupy GPU memory to 80% so PyTorch has headroom for graph attention ops + os.environ.setdefault("CUPY_GPU_MEMORY_LIMIT", "80%") + # Belt-and-suspenders: ensure PyTorch uses expandable segments + os.environ.setdefault( + "PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:512" + ) + # Numba cache directory + os.environ.setdefault("NUMBA_CACHE_DIR", os.path.join(os.getcwd(), ".numba_cache")) + os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True) + + gpu_ids = detect_gpus() + print(f"Using GPUs: {gpu_ids}") + + patch_predict_parquet() + + run_segger_cli(args, extra, gpu_ids) + + +if __name__ == "__main__": + main() From ac58e09d0b342d502e405d9a45fb52ca21d51fc2 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 20:13:56 +0000 Subject: [PATCH 8/9] =?UTF-8?q?revert(schema):=20drop=20format=20validator?= =?UTF-8?q?s=20on=20bundle/image=20=E2=80=94=20incompatible=20with=20tar.g?= =?UTF-8?q?z=20test=20data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The format: directory-path / file-path validators added in d59fac3 break CI because: - bundle field accepts EITHER a directory path (real Xenium bundle) OR a .tar.gz archive URL (test data, extracted at runtime by UNTAR). Neither 'directory-path' nor 'file-path' fits both, and both fail against remote URLs. - image field is similar — can be a local OME-TIFF path, a URL, or empty. The reviewer's suggestion in r3165291219 / r3165292878 was reasonable in isolation but conflicts with this pipeline's dual-purpose input semantics where the same field can be a path or a URL or an archive. Reverts the format keys; keeps the type / pattern / errorMessage entries. --- assets/schema_input.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 5561c0f..2dfb357 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -15,13 +15,11 @@ }, "bundle": { "type": "string", - "format": "directory-path", "pattern": "^\\S+$", "errorMessage": "Please provide a bundle as input data" }, "image": { "type": "string", - "format": "file-path", "pattern": "^\\S+$", "errorMessage": "You can provide an image. If you do not then please leave the field empty." } From 966e3f44718a2de345550b727173e10a4ab83cc2 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 30 Apr 2026 21:28:58 +0000 Subject: [PATCH 9/9] chore(segger): drop versions.yml heredocs from create_dataset and predict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The heredocs were inadvertently introduced when extracting the inline Python to module binaries. Project convention (CLAUDE.md "Version Reporting") is topic-channel only — no `versions.yml` files. Topic channels (versions_segger, versions_python, etc.) emit the actual versions; the hardcoded `segger: 0.1.0` heredoc was unused and risked going stale. No other modules/local/* writes a versions.yml file. Aligns the segger modules with the rest of the pipeline. --- modules/local/segger/create_dataset/main.nf | 5 ----- modules/local/segger/predict/main.nf | 5 ----- 2 files changed, 10 deletions(-) diff --git a/modules/local/segger/create_dataset/main.nf b/modules/local/segger/create_dataset/main.nf index 520ef34..81320ef 100644 --- a/modules/local/segger/create_dataset/main.nf +++ b/modules/local/segger/create_dataset/main.nf @@ -41,11 +41,6 @@ process SEGGER_CREATE_DATASET { --tile-height ${params.tile_height} \\ --n-workers ${task.cpus} \\ ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - segger: 0.1.0 - END_VERSIONS """ stub: diff --git a/modules/local/segger/predict/main.nf b/modules/local/segger/predict/main.nf index 3a8f58c..0da7a59 100644 --- a/modules/local/segger/predict/main.nf +++ b/modules/local/segger/predict/main.nf @@ -37,11 +37,6 @@ process SEGGER_PREDICT { --knn-method ${params.segger_knn_method} \\ --num-workers ${task.cpus} \\ ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - segger: 0.1.0 - END_VERSIONS """ stub: