In [68]:
import polars as pl
import os

In [69]:
# Target locus used by the filtering cell below to select matching rows.
# NOTE: `chr` shadows the Python builtin of the same name for the rest of
# the notebook session.
chr = "chr5"
start = 87925915
end = 87926842
# Largest absolute difference allowed between a row's start/end coordinate
# and the target start/end for the row to be kept.
max_shift = 5

In [70]:
def parse_locstring(locstring: str) -> dict:
    """Split a ``chrom:start-end:strand`` location string into its fields.

    Args:
        locstring: Location in the form ``chrom:start-end:strand``,
            e.g. ``"chr5:87925915-87926842:-"``.

    Returns:
        A dict with keys ``"chr"`` (str), ``"start"`` (int), ``"end"`` (int)
        and ``"strand"`` (str).

    Raises:
        ValueError: If the string has fewer than three ``:``-separated
            fields, the coordinate field is not exactly ``start-end``, or a
            coordinate is not an integer.
    """
    # Split from the right so a chromosome/contig name that itself contains
    # ':' stays intact instead of breaking the three-way unpacking.
    chrom, coords, strand = locstring.rsplit(":", 2)
    start, end = coords.split("-")
    return {"chr": chrom, "start": int(start), "end": int(end), "strand": strand}

In [73]:
# For every file in "detection/", lazily build a per-tool summary: the number
# of distinct samples that have a non-null value for each id.
dfs = []
for tool_csv in os.listdir("detection"):
    # The tool name is the file name up to its first dot.
    tool = tool_csv.split(".")[0]
    df = (
        pl.scan_csv(f"detection/{tool_csv}", separator="\t")
        # Wide (one column per sample) -> long (one row per id/sample pair).
        .unpivot(index="id", variable_name="sample", value_name="count")
        .filter(pl.col("count").is_not_null())
        .group_by("id")
        .agg(pl.col("sample").n_unique().alias("n_samples"))
        .with_columns(tool=pl.lit(tool))
    )
    dfs.append(df)
df = pl.concat(dfs)

# Parse each id once into a struct with a declared schema, then expand the
# struct into plain columns with `unnest`. This avoids running a separate
# Python lambda over every row for each of the four fields.
df = df.with_columns(
    location=pl.col("id").map_elements(
        parse_locstring,
        return_dtype=pl.Struct(
            {
                "chr": pl.String,
                "start": pl.Int64,
                "end": pl.Int64,
                "strand": pl.String,
            }
        ),
    )
).unnest("location")
df = df.select("tool", "chr", "start", "end", "strand", "n_samples")

# Keep rows on the target chromosome whose start and end are each within
# `max_shift` of the target coordinates.
df = df.filter(
    (pl.col("chr") == chr)
    & ((pl.col("start") - start).abs() <= max_shift)
    & ((pl.col("end") - end).abs() <= max_shift)
)

df.collect()

tool,chr,start,end,strand,n_samples
str,str,i64,i64,str,u32
"""segemehl""","""chr5""",87925916,87926842,"""-""",1
"""segemehl""","""chr5""",87925915,87926842,"""-""",10
"""dcc""","""chr5""",87925916,87926842,"""-""",28
"""circexplorer2""","""chr5""",87925915,87926842,"""+""",28
"""ciri2""","""chr5""",87925916,87926842,"""+""",29
