From 2678ae420e06a045c7943d2591c91f75d2f0affd Mon Sep 17 00:00:00 2001 From: bschilder Date: Thu, 21 May 2026 15:10:50 -0400 Subject: [PATCH] fix: ndim guard on geno_offsets in choose_exonic_variants second loop The first prange loop in `choose_exonic_variants` and the sibling `filter_af` kernel both branch on `geno_offsets.ndim == 1` before indexing, but the second prange loop (lines ~471-473) unconditionally used scalar arithmetic: `o_s, o_e = geno_offsets[o_idx], geno_offsets[ o_idx + 1]; qh_genos = geno_v_idxs[o_s:o_e]`. When the caller passes a 2-D `geno_offsets` (shape `(total_variants, 2)`, an internal storage variant of the canonical 1-D layout), `geno_offsets[o_idx]` returns a length-2 row instead of a scalar. `geno_v_idxs[:]` then tries `slice(array(int64, 1d, C), array(int64, 1d, C))` and numba bails at JIT-compile time: TypingError: Failed in nopython mode pipeline (step: nopython frontend) No implementation of function Function() found for signature: >>> slice(array(int64, 1d, C), array(int64, 1d, C)) Repro: any `Dataset.open(..., splice_info=..., var_filter="exonic")` on a `.gvl/.svar` pair whose `geno_offsets` happens to materialize as 2-D. We hit it on chr2 of 1KG NYGC 30x (3,202 samples, MANE Select panel); chr1 and chr3-22 materialise 1-D `geno_offsets` and didn't. Fix mirrors the existing first-loop guard. Added regression test in `tests/dataset/genotypes/test_choose_exonic_variants.py` exercising both 1-D and 2-D layouts; both produce the same logical keep mask. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/genvarloader/_dataset/_genotypes.py | 12 +++- .../genotypes/test_choose_exonic_variants.py | 55 +++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 tests/dataset/genotypes/test_choose_exonic_variants.py diff --git a/python/genvarloader/_dataset/_genotypes.py b/python/genvarloader/_dataset/_genotypes.py index 2cb7d82f..dee627ba 100644 --- a/python/genvarloader/_dataset/_genotypes.py +++ b/python/genvarloader/_dataset/_genotypes.py @@ -469,7 +469,17 @@ def choose_exonic_variants( ref_end: int = ends[query] for hap in nb.prange(ploidy): o_idx = geno_offset_idxs[query, hap] - o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1] + # Mirror the ndim guard from the first loop (lines ~455-458) + # and from the sibling `filter_af` kernel (lines ~549-552). + # Without the guard, a 2-D `geno_offsets` makes + # `geno_offsets[o_idx]` return a length-2 array, which then + # makes the slice below `geno_v_idxs[array:array]` — numba + # raises `TypingError: slice(array(int64, 1d, C), + # array(int64, 1d, C))` at JIT compile time. + if geno_offsets.ndim == 1: + o_s, o_e = geno_offsets[o_idx], geno_offsets[o_idx + 1] + else: + o_s, o_e = geno_offsets[o_idx] qh_genos = geno_v_idxs[o_s:o_e] k_idx = query * ploidy + hap diff --git a/tests/dataset/genotypes/test_choose_exonic_variants.py b/tests/dataset/genotypes/test_choose_exonic_variants.py new file mode 100644 index 00000000..667f43e0 --- /dev/null +++ b/tests/dataset/genotypes/test_choose_exonic_variants.py @@ -0,0 +1,55 @@ +"""Regression test for the choose_exonic_variants 2-D geno_offsets bug. + +The function used to JIT-fail with +``TypingError: slice(array(int64, 1d, C), array(int64, 1d, C))`` when +``geno_offsets`` was 2-D, because the second prange loop indexed +``geno_offsets[o_idx]`` (returning a length-2 row, not scalars) and +then sliced ``geno_v_idxs[o_s:o_e]`` with those rows. + +Mirror the fix in the first loop + the sibling ``filter_af`` kernel +which both branch on ``geno_offsets.ndim == 1``. +""" + +from __future__ import annotations + +import numpy as np + +from genvarloader._dataset._genotypes import choose_exonic_variants + + +def _common_inputs() -> dict[str, np.ndarray]: + """Tiny shared fixture: 1 region, ploidy 2, 2 variants, both exonic.""" + return { + "starts": np.asarray([0], dtype=np.int32), + "ends": np.asarray([100], dtype=np.int32), + "geno_offset_idxs": np.asarray([[0, 1]], dtype=np.intp), + # Two variants, indices 0 and 1, both inside [0, 100): + "geno_v_idxs": np.asarray([0, 1], dtype=np.int32), + "v_starts": np.asarray([10, 50], dtype=np.int32), + "ilens": np.asarray([0, 0], dtype=np.int32), # SNVs, length-0 + } + + +def test_choose_exonic_variants_1d_geno_offsets() -> None: + """1-D geno_offsets always worked; lock the baseline output.""" + inputs = _common_inputs() + # Shape (total_variants + 1,) -- the canonical 1-D layout. + inputs["geno_offsets"] = np.asarray([0, 1, 2], dtype=np.int64) + keep, keep_offsets = choose_exonic_variants(**inputs) + assert keep.dtype == np.bool_ + assert keep_offsets.shape == (3,) # n_regions * ploidy + 1 = 1 * 2 + 1 + # Both variants are inside [0, 100) so both kept. + assert keep.tolist() == [True, True] + + +def test_choose_exonic_variants_2d_geno_offsets() -> None: + """2-D geno_offsets used to JIT-fail at numba compile time.""" + inputs = _common_inputs() + # Shape (total_variants, 2) -- each row is [o_s, o_e] for one variant. + # Equivalent logical content to the 1-D fixture above. + inputs["geno_offsets"] = np.asarray([[0, 1], [1, 2]], dtype=np.int64) + keep, keep_offsets = choose_exonic_variants(**inputs) + # Same output as the 1-D path -- the 2-D layout is just a different + # internal storage shape; logical content is identical. + assert keep_offsets.shape == (3,) + assert keep.tolist() == [True, True]