In [1]:
import polars as pl
import matplotlib.pyplot as plt
from scipy import signal
# Default size for every figure created after this point: [16, 5] (width, height),
# a wide aspect for the per-chromosome line plots drawn later in the notebook.
plt.rcParams['figure.figsize'] = [16, 5]


In [16]:
# Load the headerless, tab-separated per-site table and assign column names by
# position. "7", "8" and "9" are throwaway names for three columns that are
# dropped right away; rows whose "cov" is 0 are then filtered out.
# NOTE(review): the displayed schema shows "perc_mod" parsed as str, not a
# number — cast it before doing arithmetic on it.
df = (
    pl.read_csv(
        "../../test/cdr/input/HG00731_intersect_nowindow.bed",
        has_header=False,
        separator="\t",
        new_columns=[
            "chr", "st", "end", "mod", "score", "strand",
            "7", "8", "9",
            "cov", "perc_mod",
            "n_canon", "n_mod", "n_filt", "n_nocall", "n_altmod",
        ],
    )
    .drop("7", "8", "9")
    .filter(pl.col("cov").ne(0))
)

In [17]:
# Bare last expression: renders the table loaded above (polars truncates the
# middle rows in the display).
df

chr,st,end,mod,score,strand,cov,perc_mod,n_canon,n_mod,n_filt,n_nocall,n_altmod
str,i64,i64,str,i64,str,i64,str,i64,i64,i64,i64,i64
"""haplotype1-000…",130641453,130641454,"""5mC""",941,"""+""",17,"""100.00""",0,16,1,0,0
"""haplotype1-000…",130641764,130641765,"""5mC""",470,"""+""",17,"""12.50""",7,1,0,9,0
"""haplotype1-000…",130642064,130642065,"""5mC""",1000,"""+""",17,"""100.00""",0,17,0,0,0
"""haplotype1-000…",130642209,130642210,"""5mC""",705,"""+""",17,"""41.67""",7,5,4,1,0
"""haplotype1-000…",130642842,130642843,"""5mC""",1000,"""+""",17,"""100.00""",0,17,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""haplotype2-000…",49770961,49770962,"""5mC""",578,"""+""",19,"""9.09""",10,1,2,5,0
"""haplotype2-000…",49771079,49771080,"""5mC""",473,"""+""",19,"""55.56""",4,5,0,9,0
"""haplotype2-000…",49772122,49772123,"""5mC""",631,"""+""",19,"""100.00""",0,12,0,5,0
"""haplotype2-000…",49772342,49772343,"""5mC""",578,"""+""",19,"""90.91""",1,10,0,7,0


In [24]:
# Load the headerless, tab-separated windowed table: one row per
# (chr, st, end) interval carrying an "avg" value and an "idx" column.
# NOTE: this rebinds `df`, so the table loaded from the *_nowindow.bed file
# earlier in the notebook is no longer reachable under that name.
df = pl.read_csv(
    "../../test/cdr/input/HG00731_intersect.bed",
    has_header=False,
    separator="\t",
    new_columns=["chr", "st", "end", "avg", "idx"],
)
# Bare last expression so the notebook renders the frame.
df

chr,st,end,avg,idx
str,i64,i64,f64,i64
"""haplotype1-000…",615000,620000,40.32,123
"""haplotype1-000…",620000,625000,37.49,124
"""haplotype1-000…",625000,630000,36.82,125
"""haplotype1-000…",630000,635000,36.85,126
"""haplotype1-000…",635000,640000,37.68,127
…,…,…,…,…
"""haplotype2-000…",4690000,4695000,36.35,19288
"""haplotype2-000…",4695000,4700000,33.48,19289
"""haplotype2-000…",4700000,4705000,33.77,19290
"""haplotype2-000…",4705000,4710000,28.82,19291


In [25]:
# Tunables for the valley search below.
SAVGOL_WINDOW = 5            # Savitzky-Golay window length, in rows
SAVGOL_POLYORDER = 4         # polynomial order; must stay below SAVGOL_WINDOW
VALLEY_PROM_FRACTION = 0.33  # required prominence, as a fraction of the smoothed mean

# For every "chr" group: smooth the "avg" column, find valleys (local minima) in
# the smoothed signal, and save a plot of the raw signal with each valley's
# width shaded to "<chr>.png" in the current working directory.
for group_key, df_chr_methyl in df.group_by(["chr"]):
    # Grouping by a list yields a tuple key; take its only element.
    # Named `chrom` (not `chr`) so the builtin chr() is not shadowed in the kernel.
    chrom = group_key[0]
    methyl_signal = df_chr_methyl["avg"].to_numpy()

    # savgol_filter (default mode="interp") raises ValueError when the input is
    # shorter than the window; skip such groups instead of aborting the whole loop.
    if methyl_signal.size < SAVGOL_WINDOW:
        print(
            f"Skipping {chrom}: {methyl_signal.size} rows, "
            f"need at least {SAVGOL_WINDOW} for smoothing"
        )
        continue

    smoothed_methyl_signal = signal.savgol_filter(
        methyl_signal, SAVGOL_WINDOW, SAVGOL_POLYORDER
    )
    # smoothed_methyl_signal = signal.medfilt(methyl_signal, 5)
    valley_prom = smoothed_methyl_signal.mean() * VALLEY_PROM_FRACTION

    # Negating the signal turns valleys into peaks. Passing `width` makes
    # find_peaks report the interpolated left/right edges ("left_ips"/"right_ips")
    # of each peak, in units of row index. The peak positions are not needed.
    _valley_idx, valley_info = signal.find_peaks(
        -smoothed_methyl_signal,
        width=1,
        prominence=valley_prom,
    )

    fig, ax = plt.subplots()
    # The raw (unsmoothed) signal is plotted against row index, which is the
    # same coordinate system the valley edges are expressed in.
    ax.plot(methyl_signal)
    for left_edge, right_edge in zip(valley_info["left_ips"], valley_info["right_ips"]):
        ax.axvspan(left_edge, right_edge, color="red", alpha=0.5)
    ax.set(title=str(chrom), xlabel="row index within chr", ylabel="avg")
    fig.savefig(f"{chrom}.png")
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)