In [1]:
import polars as pl

In [25]:
def split_coord_ctg_name(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.with_columns(split_contig=pl.col("contig").str.split_exact(":", n=1))
        .unnest("split_contig")
        .rename({"field_0": "ctg_name", "field_1":"ctg_coord"})
        .with_columns(pl.col("ctg_coord").str.split_exact("-", n=1))
        .unnest("ctg_coord")
        .rename({"field_0": "ctg_start", "field_1":"ctg_end"})
        .cast({"ctg_start": pl.Int64, "ctg_end": pl.Int64})
    )

In [26]:
cols = ["idx", "div", "deldiv", "insdiv", "contig", "start", "end", "left", "C", "type", "rClass", "right", "x", "y", "z", "other"]

In [27]:
df_new = split_coord_ctg_name(pl.read_csv("data/annotations/chm13_chm1_cens_v21.trimmed.fa.noheader_copy.out", separator="\t", new_columns=cols, truncate_ragged_lines=True))
df_old = split_coord_ctg_name(pl.read_csv("data/annotations/chm13_chm1_cens_v21.trimmed.fa.noheader.out", separator="\t", new_columns=cols, truncate_ragged_lines=True))

In [34]:
df_new

idx,div,deldiv,insdiv,contig,start,end,left,C,type,rClass,right,x,y,z,other,ctg_name,ctg_start,ctg_end
i64,f64,f64,f64,str,i64,i64,str,str,str,str,str,i64,str,i64,str,str,i64,i64
2126,23.6,2.6,2.2,"""chm1_cen10v8:38450001-42600000""",304,809,"""(4149191)""","""C""","""MER21C""","""LTR/ERVL""","""(404)""",534,"""27""",2,,"""chm1_cen10v8""",38450001,42600000
1088,20.4,0.3,13.3,"""chm1_cen10v8:38450001-42600000""",811,1099,"""(4148901)""","""C""","""AluJb""","""SINE/Alu""","""(13)""",299,"""44""",3,,"""chm1_cen10v8""",38450001,42600000
978,23.6,0.5,3.2,"""chm1_cen10v8:38450001-42600000""",1137,1359,"""(4148641)""","""C""","""AluJb""","""SINE/Alu""","""(94)""",218,"""2""",4,,"""chm1_cen10v8""",38450001,42600000
1491,19.0,2.5,8.0,"""chm1_cen10v8:38450001-42600000""",1360,1713,"""(4148287)""","""+""","""MLT1D""","""LTR/ERVL-MaLR""","""162""",497,"""(8)""",5,,"""chm1_cen10v8""",38450001,42600000
1647,18.8,2.3,1.3,"""chm1_cen10v8:38450001-42600000""",1724,1856,"""(4148144)""","""+""","""AluJb""","""SINE/Alu""","""2""",135,"""(177)""",6,,"""chm1_cen10v8""",38450001,42600000
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2710,1.3,0.0,0.0,"""chm13_chrX:56950001-61750000""",4793599,4793908,"""(6092)""","""C""","""L1PA2""","""LINE/L1""","""(4)""",6151,"""5842""",109281,,"""chm13_chrX""",56950001,61750000
9867,1.7,0.1,0.0,"""chm13_chrX:56950001-61750000""",4793909,4795709,"""(4291)""","""+""","""L1HS""","""LINE/L1""","""1058""",2861,"""(3285)""",109282,,"""chm13_chrX""",56950001,61750000
719,8.0,0.0,9.7,"""chm13_chrX:56950001-61750000""",4795709,4795833,"""(4167)""","""+""","""L1PA12""","""LINE/L1""","""4581""",4694,"""(1452)""",109280,"""*""","""chm13_chrX""",56950001,61750000
5160,10.3,4.5,0.2,"""chm13_chrX:56950001-61750000""",4795871,4797284,"""(2716)""","""+""","""L1PA12""","""LINE/L1""","""4694""",6168,"""(0)""",109280,,"""chm13_chrX""",56950001,61750000


In [29]:
dfs = []
for c, df_c_new in df_new.filter(pl.col("contig").str.starts_with("chm13")).group_by(["contig"]):
    c = c[0]
    df_c_old = df_old.filter(pl.col("contig") == c)

    min_old_start = df_c_old["start"].min()
    max_old_stop = df_c_old["end"].max()

    min_new_start = df_c_new["start"].min()
    max_new_stop = df_c_new["end"].max()

    print(c)
    if min_old_start != min_new_start:
        add_to_ctg_start = min_new_start - min_old_start
        df_c_new = df_c_new.with_columns(
            pl.col("start") - min_new_start,
            pl.col("end") - min_new_start,
            pl.col("ctg_start") + add_to_ctg_start
        )
    if max_old_stop != max_new_stop:
        sub_to_ctg_end = max_old_stop - max_new_stop
        df_c_new = df_c_new.with_columns(pl.col("ctg_end") - sub_to_ctg_end)
    
    dfs.append(df_c_new)

chm13_chr20:25800001-32500000
chm13_chrX:56950001-61750000
chm13_chr5:45650001-51600000
chm13_chr15:13500001-18250000
chm13_chr16:32400001-38950000
chm13_chr2:91800001-95600000
chm13_chr22:8000001-17400000
chm13_chr18:15050001-21650000
chm13_chr13:10650001-18100000
chm13_chr21:7700001-11850000
chm13_chr7:57650001-64700000
chm13_chr10:38500001-42550000
chm13_chr3:89850001-97000000
chm13_chr11:48300001-55700000
chm13_chr14:5600001-13300000
chm13_chr6:57750001-63100000
chm13_chr1:121100001-127300000
chm13_chr19:23850001-30750000
chm13_chr12:33800001-38500000
chm13_chr9:44200001-48100000
chm13_chr8:43350001-47450000
chm13_chr4:49200001-55800000
chm13_chr17:22850001-28650000


In [37]:
df_chm13_new = pl.concat(dfs).with_columns(contig=pl.col("ctg_name") + ":" + pl.col("ctg_start").cast(pl.String) + "-" + pl.col("ctg_end").cast(pl.String)).drop("ctg_name", "ctg_start", "ctg_end")
df_chm13_new

idx,div,deldiv,insdiv,contig,start,end,left,C,type,rClass,right,x,y,z,other
i64,f64,f64,f64,str,i64,i64,str,str,str,str,str,i64,str,i64,str
661,18.3,4.8,16.5,"""chm13_chr20:25800001-30050963""",1141,1409,"""(6698591)""","""C""","""L1MA9""","""LINE/L1""","""(1)""",6311,"""6070""",82316,
1287,21.5,3.9,0.3,"""chm13_chr20:25800001-30050963""",1427,1706,"""(6698294)""","""C""","""AluJb""","""SINE/Alu""","""(11)""",301,"""12""",82317,
3549,19.7,8.0,2.8,"""chm13_chr20:25800001-30050963""",1722,2873,"""(6697127)""","""C""","""L1MA9""","""LINE/L1""","""(533)""",5779,"""4582""",82316,
2086,10.7,0.0,3.7,"""chm13_chr20:25800001-30050963""",2874,3183,"""(6696817)""","""C""","""AluSx""","""SINE/Alu""","""(11)""",301,"""3""",82318,
3549,20.9,5.9,3.9,"""chm13_chr20:25800001-30050963""",3184,3495,"""(6696505)""","""C""","""L1MA9""","""LINE/L1""","""(1559)""",4581,"""4249""",82316,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
278,29.4,2.8,11.1,"""chm13_chr17:22850001-28101270""",5248173,5248386,"""(551614)""","""+""","""SST1""","""Satellite/centr""","""770""",967,"""(596)""",77081,
30,15.5,4.5,1.5,"""chm13_chr17:22850001-28101270""",5248669,5248734,"""(551266)""","""+""","""(CA)n""","""Simple_repeat""","""1""",68,"""(0)""",77082,
722,27.9,6.8,6.8,"""chm13_chr17:22850001-28101270""",5248735,5249161,"""(550839)""","""C""","""SST1""","""Satellite/centr""","""(11)""",1552,"""1126""",77083,
1149,28.3,4.2,3.7,"""chm13_chr17:22850001-28101270""",5249572,5250188,"""(549812)""","""C""","""SST1""","""Satellite/centr""","""(904)""",659,"""40""",77083,


In [41]:
pl.concat([
    df_new.filter(~pl.col("contig").str.starts_with("chm13")).drop("ctg_name", "ctg_start", "ctg_end"),
    df_chm13_new
]).write_csv("data/annotations/chm13_chm1_cens_v21.trimmed.fa.noheader.refmt.out", separator="\t", include_header=False)