In [1]:
import genvarloader as gvl
import polars as pl
import pooch
from pathlib import Path
from tempfile import TemporaryDirectory

In [2]:
# GRCh38 chromosome 22 sequence
reference = pooch.retrieve(
    url="https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz",
    known_hash="sha256:974f97ac8ef7ffae971b63b47608feda327403be40c27e391ee4a1a78b800df5",
    progressbar=True,
)
if not Path(f"{reference[:-3]}.bgz").exists():
    !gzip -dc {reference} | bgzip > {reference[:-3]}.bgz
reference = reference[:-3] + ".bgz"

# PLINK 2 files
variants = pooch.retrieve(
    url="doi:10.5281/zenodo.13656224/1kGP.chr22.pgen",
    known_hash="md5:31aba970e35f816701b2b99118dfc2aa",
    progressbar=True,
    fname="1kGP.chr22.pgen",
)
pooch.retrieve(
    url="doi:10.5281/zenodo.13656224/1kGP.chr22.psam",
    known_hash="md5:eefa7aad5acffe62bf41df0a4600129c",
    progressbar=True,
    fname="1kGP.chr22.psam",
)
pooch.retrieve(
    url="doi:10.5281/zenodo.13656224/1kGP.chr22.pvar",
    known_hash="md5:5f922af91c1a2f6822e2f1bb4469d12b",
    progressbar=True,
    fname="1kGP.chr22.pvar",
)

# GRCh38 Gencode GFF3
gff = pooch.retrieve(
    url="https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.annotation.gff3.gz",
    known_hash="md5:918948a059c72088e44861835ade8a89",
)

In [3]:
gff_df = pl.read_csv(
    gff,
    separator="\t",
    comment_prefix="#",
    has_header=False,
    new_columns=[
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ],
)
gff_df.head()

seqname,source,feature,start,end,score,strand,frame,attribute
str,str,str,i64,i64,str,str,str,str
"""chr1""","""HAVANA""","""gene""",11121,24894,""".""","""+""",""".""","""ID=ENSG00000290825.2;gene_id=E…"
"""chr1""","""HAVANA""","""transcript""",11121,14413,""".""","""+""",""".""","""ID=ENST00000832824.1;Parent=EN…"
"""chr1""","""HAVANA""","""exon""",11121,11211,""".""","""+""",""".""","""ID=exon:ENST00000832824.1:1;Pa…"
"""chr1""","""HAVANA""","""exon""",12010,12227,""".""","""+""",""".""","""ID=exon:ENST00000832824.1:2;Pa…"
"""chr1""","""HAVANA""","""exon""",12613,12721,""".""","""+""",""".""","""ID=exon:ENST00000832824.1:3;Pa…"


In [4]:
chr22_genes = (
    gff_df.filter(pl.col("seqname") == "chr22")
    .filter(pl.col("feature") == "exon")
    .with_columns(
        gene=pl.col("attribute").str.extract(r"gene_id=(.*?);"),
        exon_num=pl.col("attribute").str.extract(r"exon_number=(.*?);"),
    )
    .rename({"seqname": "chrom", "start": "chromStart", "end": "chromEnd"})
)
chr22_genes.head()

chrom,source,feature,chromStart,chromEnd,score,strand,frame,attribute,gene,exon_num
str,str,str,i64,i64,str,str,str,str,str,str
"""chr22""","""HAVANA""","""exon""",10529036,10529164,""".""","""-""",""".""","""ID=exon:ENST00000724296.1:1;Pa…","""ENSG00000294541.1""","""1"""
"""chr22""","""HAVANA""","""exon""",10527853,10528040,""".""","""-""",""".""","""ID=exon:ENST00000724296.1:2;Pa…","""ENSG00000294541.1""","""2"""
"""chr22""","""HAVANA""","""exon""",10524345,10524446,""".""","""-""",""".""","""ID=exon:ENST00000724296.1:3;Pa…","""ENSG00000294541.1""","""3"""
"""chr22""","""ENSEMBL""","""exon""",10736171,10736283,""".""","""-""",""".""","""ID=exon:ENST00000615943.1:1;Pa…","""ENSG00000277248.1""","""1"""
"""chr22""","""HAVANA""","""exon""",10742050,10742191,""".""","""+""",""".""","""ID=exon:ENST00000779064.1:1;Pa…","""ENSG00000301473.1""","""1"""


In [5]:
ds_path = TemporaryDirectory(suffix=".gvl").name
gvl.write(ds_path, chr22_genes[:100], variants)

  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
ds = gvl.Dataset.open(ds_path, reference, splice_info=("gene", "exon_num"))

In [15]:
ds._getitem_spliced([1])[0].view("S1")

IndexError: cannot slice RegularArray (of length 1) with [3]: index out of range while attempting to get index 3 (in compiled code: https://github.com/scikit-hep/awkward/blob/awkward-cpp-44/awkward-cpp/src/cpu-kernels/awkward_RegularArray_getitem_next_array_regularize.cpp#L19)