In [1]:
import requests
import polars as pl
from liftover import get_lifter

In [2]:
def parse_locstring(locstring: str):
    """Split a ``chrom:start-end:strand`` string into its components.

    Args:
        locstring: Text with exactly three colon-separated fields, the middle
            one being two integers joined by a single dash.

    Returns:
        A dict with keys ``chr`` (str), ``start`` (int), ``end`` (int) and
        ``strand`` (str).

    Raises:
        ValueError: If the field counts are wrong or a coordinate is not an
            integer.
    """
    fields = locstring.split(":")
    chrom, span, strand = fields
    lower, upper = span.split("-")
    return dict(chr=chrom, start=int(lower), end=int(upper), strand=strand)

In [3]:
# Largest difference between consecutive sorted "start" (or "end") values that
# still keeps two rows in the same start/end group; a larger gap opens a new
# group in the grouping cell below.
max_shift = 3

In [4]:
# Lazily read the header-less, tab-separated intersection table and split the
# "chrom:start-end:strand" locstring column into typed columns.
df = pl.scan_csv(
    "../chapters/4_results_and_discussion/figures/dea/letrozole_gene_intersection.tsv",
    separator="\t",
    has_header=False,
    new_columns=["locstring", "type", "gene", "database"],
)

# Full dtype of the dict returned by parse_locstring. Declaring the fields
# (instead of a bare pl.Struct) lets the lazy schema know their names and types,
# which the native struct accessor below relies on.
location_dtype = pl.Struct(
    {"chr": pl.String, "start": pl.Int64, "end": pl.Int64, "strand": pl.String}
)

# parse_locstring is plain Python, so this one step still needs map_elements.
df = df.with_columns(
    location=pl.col("locstring").map_elements(parse_locstring, return_dtype=location_dtype)
)

# Extract the struct fields with native expressions rather than one Python
# lambda call per row and per field.
df = df.with_columns(
    chr=pl.col("location").struct.field("chr"),
    start=pl.col("location").struct.field("start"),
    end=pl.col("location").struct.field("end"),
    strand=pl.col("location").struct.field("strand"),
)

df = df.select("chr", "start", "end", "strand", "type", "gene", "database")

In [5]:
# Merge rows whose start AND end coordinates lie close together on the same
# chromosome into a single row.


def _boundary_group(column: str) -> pl.Expr:
    """Run id that increases whenever consecutive values of `column` differ by more than max_shift.

    The expression is evaluated per chromosome (`over("chr")`) and expects the
    frame to be sorted by `column` already. Without the window, a row from a
    different chromosome sitting between two same-chromosome rows could chain
    them into one group even though they are more than max_shift apart.
    """
    return pl.col(column).diff().fill_null(0).gt(max_shift).cum_sum().over("chr")


def _join_sorted(values) -> str:
    """Join one group's values into a sorted, comma-separated string."""
    return ",".join(sorted(values))


df = df.sort("end").with_columns(end_group=_boundary_group("end"))
df = df.sort("start").with_columns(start_group=_boundary_group("start"))

# One output row per (chromosome, start group, end group): widest span, plus
# the de-duplicated gene / database / type labels of all merged rows.
df = df.group_by(["chr", "start_group", "end_group"]).agg(
    start=pl.col("start").min(),
    end=pl.col("end").max(),
    gene=pl.col("gene").str.split(",").flatten().unique().map_elements(_join_sorted, return_dtype=str),
    database=pl.col("database").str.split(",").flatten().unique().map_elements(_join_sorted, return_dtype=str),
    type=pl.col("type").unique().map_elements(_join_sorted, return_dtype=str),
)
df = df.select("chr", "start", "end", "gene", "database", "type")

# Bundle the coordinates into one struct column so each row can be handed to
# the per-location lookup as a single value.
df = df.with_columns(location=pl.struct(pl.col("chr"), pl.col("start"), pl.col("end")))

In [None]:
# Build the coordinate converter from the mm39 assembly to the hg38 assembly.
# The bare `lifter` on the last line only displays the object as cell output.
lifter = get_lifter("mm39", "hg38")
lifter

In [7]:
# Hand-written sample location with the "chr"/"start"/"end" keys that
# liftover() reads. Nothing else in the visible notebook references it —
# presumably kept for manual spot checks; confirm before removing.
test = {
    "chr": "chr5",
    "start": 87817372,
    "end": 87821140
}

In [8]:
def liftover(location: dict, converter=None):
    """Convert a location's start and end coordinates to the target assembly.

    Both endpoints are converted independently on ``location["chr"]``.

    Args:
        location: Mapping with the keys ``"chr"``, ``"start"`` and ``"end"``.
        converter: Object exposing ``convert_coordinate(chrom, pos)`` that
            returns a list of ``(chrom, pos, strand)`` tuples. Defaults to the
            notebook-level ``lifter``.

    Returns:
        ``{"chr": ..., "start": ..., "end": ...}`` in target coordinates with
        ``start <= end``, or ``None`` (after printing a notice) when either
        endpoint has no mapping.

    Raises:
        ValueError: If an endpoint maps to more than one position, or the two
            endpoints land on different chromosomes.
    """
    if converter is None:
        # Resolved at call time so the notebook-level converter stays the default.
        converter = lifter

    res_start = converter.convert_coordinate(location["chr"], location["start"])
    res_end = converter.convert_coordinate(location["chr"], location["end"])

    if len(res_start) == 0 or len(res_end) == 0:
        print("No results for " + str(location))
        return None

    if len(res_start) > 1 or len(res_end) > 1:
        raise ValueError("Multiple results")

    start_chr, start, _ = res_start[0]
    end_chr, end, _ = res_end[0]

    if start_chr != end_chr:
        raise ValueError("Chromosomes do not match")

    # Endpoints are lifted independently, so a region that maps in reverse
    # orientation comes back with start > end; normalise so the result is
    # always a valid "start-end" range for downstream queries.
    if start > end:
        start, end = end, start

    return {"chr": start_chr, "start": start, "end": end}

In [9]:
def fetch_snps(
    location: dict,
    species: str = "hsapiens_grch38",
    limit: int = 1000,
    timeout: float = 60.0,
):
    """Query the EVA REST API for variants in the lifted-over region.

    The location is first converted with ``liftover``; the resulting segment
    is then requested from the EVA ``segments/.../variants`` endpoint.

    Args:
        location: Mapping with the keys ``"chr"``, ``"start"`` and ``"end"``
            in source-assembly coordinates.
        species: Value sent as the ``species`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The first element of the ``"response"`` list in the JSON reply, or
        ``{"numResults": 0}`` when the location could not be lifted over.
    """
    lifted = liftover(location)
    if lifted is None:
        return {"numResults": 0}

    # The segment path uses the bare chromosome name without the "chr" prefix.
    chrom = lifted["chr"].removeprefix("chr")
    # Pull the coordinates into locals: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    start, end = lifted["start"], lifted["end"]
    url = f"https://www.ebi.ac.uk/eva/webservices/rest/v1/segments/{chrom}:{start}-{end}/variants"

    response = requests.get(
        url,
        params={"species": species, "limit": limit},
        timeout=timeout,
    )
    return response.json()['response'][0]

In [None]:
# Materialise the lazy query as a pandas frame, look up the variants for every
# location (one fetch_snps call per row), and record how many results each
# lookup reported.
df_locations = df.collect().to_pandas().assign(
    resp=lambda frame: frame["location"].map(fetch_snps),
    n_results=lambda frame: frame["resp"].map(lambda payload: payload["numResults"]),
)
df_locations