In [1]:
import genvarloader as gvl
import pooch
from pathlib import Path
import pyranges as pr
import polars as pl
from tempfile import NamedTemporaryFile

In [2]:
reference = pooch.retrieve(
    url="https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz",
    known_hash="sha256:974f97ac8ef7ffae971b63b47608feda327403be40c27e391ee4a1a78b800df5",
    progressbar=True,
)
if not Path(f"{reference[:-3]}.bgz").exists():
    !gzip -dc {reference} | bgzip > {reference[:-3]}.bgz
reference = f"{reference[:-3]}.bgz"

clinvar_vcf = pooch.retrieve(
    url="https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz",
    known_hash="50f599dc8618a22722ae46dd0bd0514441aba5a82cac6b8df467a55c95dde667",
)
if not Path(f"{clinvar_vcf[:-7]}.bcf").exists():
    with NamedTemporaryFile("w+") as renamer:
        contigs = [str(i) for i in range(1, 23)] + ["X", "Y", "MT"]
        for c in contigs:
            renamer.write(f"chr{c} {c}\n")
        renamer.flush()
        !bcftools norm -f {reference} -r 22 -a --atom-overlaps . -m - -O b -W -o {clinvar_vcf[:-7]}.bcf {clinvar_vcf}
clinvar_vcf = f"{clinvar_vcf[:-7]}.bcf"

In [3]:
# Wrap the normalized ClinVar file path in a GenVarLoader sites-only VCF object.
clinvar_path = Path(clinvar_vcf)
clinvar = gvl.SitesOnlyVCF(clinvar_path)

In [None]:
# Two fixed-width query windows on chromosome 22, given by their start coordinates.
WINDOW_SIZE = 10_000
region_starts = [15528170, 50777975]

regions = pl.DataFrame(
    {
        "Chromosome": ["22"] * len(region_starts),
        "Start": region_starts,
        "End": [start + WINDOW_SIZE for start in region_starts],
    }
).with_row_index("ds_row")  # "ds_row" tags each window with its row number

# PyRanges is built from a pandas frame, so convert before wrapping.
bed = regions.to_pandas()
ds_pyr = pr.PyRanges(bed)

# ClinVar sites as ranges, with the site's row number in "site_row".
sites_pyr = clinvar.get_pyranges("site_row")

# Join each window with the sites; clashing site columns get the "_site" suffix.
ds_pyr.join(sites_pyr, suffix="_site")

Unnamed: 0,ds_row,Chromosome,Start,End,site_row,Start_site,End_site
0,0,22,15528170,15538170,0,15528170,15528171
1,0,22,15528170,15538170,1,15528189,15528190
2,0,22,15528170,15538170,2,15528193,15528194
3,0,22,15528170,15538170,3,15528194,15528195
4,0,22,15528170,15538170,4,15528206,15528207
...,...,...,...,...,...,...,...
60,0,22,15528170,15538170,60,15529118,15529119
61,1,22,50777975,50787975,74395,50777975,50777976
62,1,22,50777975,50787975,74396,50782203,50782204
63,1,22,50777975,50787975,74397,50782242,50782243
