In [2]:
import polars as pl
from intervaltree import IntervalTree, Interval

In [3]:
# Read the headerless, tab-separated p/q-arm BED file and name its four columns.
df_ld_regions = pl.read_csv(
    "data/pq_arm.bed",
    has_header=False,
    separator="\t",
    new_columns=["chr", "start", "end", "length"],
    # Fields holding the literal text "N/A" are loaded as nulls.
    null_values=["N/A"],
    # NOTE(review): presumably here to tolerate rows with extra trailing
    # fields — confirm against the polars `read_csv` documentation.
    truncate_ragged_lines=True,
)
df_ld_regions

chr,start,end,length
str,i64,i64,i64
"""chr1""",118753176,147302491,28549315
"""chr2""",88012476,98516864,10504388
"""chr3""",89151152,97946851,8795699
"""chr4""",47517515,56878107,9360592
"""chr5""",45120933,52498253,7377320
…,…,…,…
"""chr19""",23789807,30598756,6808949
"""chr20""",25039616,34291110,9251494
"""chr21""",,12752875,12752875
"""chr22""",,18280857,18280857


In [4]:
# Column names, in file order, assigned to the headerless segdup BED when it
# is read with `new_columns=SEGDUP_COLS`. The spellings are kept exactly as-is
# because other cells may select columns by these names.
SEGDUP_COLS = """
    chrom chromStart chromEnd name score strand thickStart thickEnd color
    chrom2 Start2 End2 score2 strand2
    maxlen alnlen indela indelb
    alnB matchB mismatchB transitionsB transvertionsB
    fracMatch fracMatchIndel jck k2K
    alngaps uppercaseA uppercaseB uppercaseMatches
    alnMatches alnMismatches alnGapBases
    filterScore satBases uniqueID original
    telo peri acro telo2 peri2 acro2
""".split()

In [5]:
# Read the headerless, tab-separated segdup BED file; its columns are named
# positionally from SEGDUP_COLS.
df_segdups = pl.read_csv(
    "data/sedefSegDups.bed",
    has_header=False,
    separator="\t",
    new_columns=SEGDUP_COLS,
)
df_segdups

chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,color,chrom2,Start2,End2,score2,strand2,maxlen,alnlen,indela,indelb,alnB,matchB,mismatchB,transitionsB,transvertionsB,fracMatch,fracMatchIndel,jck,k2K,alngaps,uppercaseA,uppercaseB,uppercaseMatches,alnMatches,alnMismatches,alnGapBases,filterScore,satBases,uniqueID,original,telo,peri,acro,telo2,peri2,acro2
str,i64,i64,str,i64,str,i64,i64,str,str,i64,i64,f64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,bool,i64,i64,i64,i64,i64,i64
"""chr1""",1,13731,"""chr9:150595369…",0,"""+""",1,13731,"""154,154,154""","""chr9""",150595369,150617116,44.8,"""-""",21747,22497,8767,750,12980,12422,558,240,318,0.957011,0.552163,0.0442705,0.044285,66,3670,4662,3432,12422,558,9517,0.952169,1693,4,true,1,0,0,1,0,0
"""chr1""",3,140066,"""chr6:171986258…",0,"""+""",3,140066,"""255,103,0""","""chr6""",171986258,172126591,11.6,"""-""",140333,148402,8339,8069,131994,131242,752,438,314,0.994303,0.884368,0.005719,0.005721,126,56586,61380,54101,131242,752,16408,0.993355,1693,5,true,1,0,0,1,0,0
"""chr1""",16,13731,"""chr20:599-2174…",0,"""+""",16,13731,"""167,167,167""","""chr20""",599,21749,47.1,"""+""",21150,22350,8635,1200,12515,11832,683,274,409,0.945425,0.529396,0.056662,0.0566739,49,3670,4825,3511,11832,683,9835,0.941738,1693,14,true,1,0,0,1,0,0
"""chr1""",16,13731,"""chr7:156-22557…",0,"""+""",16,13731,"""147,147,147""","""chr7""",156,22557,43.6,"""+""",22401,22787,9072,386,13329,12841,488,226,262,0.963388,0.563523,0.0375357,0.0375544,62,3670,4961,3508,12841,488,9458,0.958928,1693,15,true,1,0,0,1,0,0
"""chr1""",16,198196,"""chr5:181848663…",0,"""+""",16,198196,"""255,103,0""","""chr5""",181848663,182045430,11.1,"""-""",198180,208191,10011,11424,186756,185082,1674,979,695,0.991036,0.889001,0.009018,0.009021,191,84255,83569,80570,185082,1674,21435,0.990024,1693,16,true,1,0,0,1,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",62445192,62453929,"""chr2:114018268…",0,"""+""",62445192,62453929,"""204,204,0""","""chr2""",114018268,114027657,22.1,"""+""",9389,10120,731,1383,8006,7887,119,74,45,0.985136,0.779348,0.0150131,0.0150274,23,7777,6533,6412,7887,119,2114,0.982314,672,86514,false,1,0,0,0,0,0
"""chrY""",62445192,62454747,"""chr20:70763-80…",0,"""+""",62445192,62454747,"""133,133,133""","""chr20""",70763,80691,23.6,"""-""",9928,10924,996,1369,8559,8349,210,131,79,0.975464,0.76428,0.0249459,0.0249864,23,7783,6533,6361,8349,210,2365,0.97285,1151,89953,false,1,0,0,1,0,0
"""chrY""",62445192,62456950,"""chr20:66197305…",0,"""+""",62445192,62456950,"""204,204,0""","""chr20""",66197305,66210255,19.3,"""+""",12950,13584,634,1826,11124,10958,166,80,86,0.985077,0.806684,0.0150731,0.015077,72,7909,6533,6448,10958,166,2460,0.978742,1451,92524,false,1,0,0,1,0,0
"""chrY""",62445192,62457871,"""chr3:201091920…",0,"""+""",62445192,62457871,"""140,140,140""","""chr3""",201091920,201105948,35.2,"""+""",14028,16004,1976,3325,10703,10373,330,162,168,0.969168,0.64815,0.0314842,0.0315034,75,7708,6533,6361,10373,330,5301,0.962423,1125,105009,false,1,0,0,1,0,0


In [6]:
# Column names, in file order, assigned to the headerless censat BED when it
# is read with `new_columns=CENSAT_COLS`.
CENSAT_COLS = [
    "chrom", "chromStart", "chromEnd",
    "name", "score", "strand",
    "thickStart", "thickEnd",
    "reserved", "component",
]

In [7]:
# Read the headerless, tab-separated censat BED file; its columns are named
# positionally from CENSAT_COLS.
df_censat = pl.read_csv(
    "data/censat.bed",
    has_header=False,
    separator="\t",
    new_columns=CENSAT_COLS,
)
df_censat

chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,component
str,i64,i64,str,i64,str,i64,i64,str,str
"""chr1""",116796047,121405145,"""ct_1_1(p_arm)""",100,""".""",116796047,121405145,"""224,224,224""","""ct_1_1(p_arm)"""
"""chr1""",121405145,121406286,"""censat_1_1""",100,""".""",121405145,121406286,"""0,204,204""","""censat_1_1(rnd…"
"""chr1""",121406286,121619169,"""ct_1_2""",100,""".""",121406286,121619169,"""224,224,224""","""ct_1_2"""
"""chr1""",121619169,121625213,"""hor_1_1(S3C1H2…",100,""".""",121619169,121625213,"""255,146,0""","""hor_1_1(S3C1H2…"
"""chr1""",121625213,121667941,"""hor_1_2(S3C1H2…",100,""".""",121625213,121667941,"""255,146,0""","""hor_1_2(S3C1H2…"
…,…,…,…,…,…,…,…,…,…
"""chrY""",62025776,62037362,"""censat_Y_97""",100,""".""",62025776,62037362,"""0,204,204""","""censat_Y_97(CO…"
"""chrY""",62052955,62059882,"""censat_Y_98""",100,""".""",62052955,62059882,"""0,204,204""","""censat_Y_98(CO…"
"""chrY""",62061431,62072743,"""hsat3_Y_50(A4)…",100,""".""",62061431,62072743,"""0,0,250""","""hsat3_Y_50(A4)…"
"""chrY""",62072743,62087298,"""bsat_Y_34""",100,""".""",62072743,62087298,"""250,153,255""","""bsat_Y_34"""


In [17]:
# Keep chr1 rows whose `name` contains neither "hor" nor "hsat".
df_censat.filter(
    (pl.col("chrom") == "chr1")
    & ~(
        pl.col("name").str.contains("hor")
        | pl.col("name").str.contains("hsat")
    )
)

chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,component
str,i64,i64,str,i64,str,i64,i64,str,str
"""chr1""",116796047,121405145,"""ct_1_1(p_arm)""",100,""".""",116796047,121405145,"""224,224,224""","""ct_1_1(p_arm)"""
"""chr1""",121405145,121406286,"""censat_1_1""",100,""".""",121405145,121406286,"""0,204,204""","""censat_1_1(rnd…"
"""chr1""",121406286,121619169,"""ct_1_2""",100,""".""",121406286,121619169,"""224,224,224""","""ct_1_2"""
"""chr1""",121788213,121790362,"""ct_1_3""",100,""".""",121788213,121790362,"""224,224,224""","""ct_1_3"""
"""chr1""",126824128,126828704,"""censat_1_2""",100,""".""",126824128,126828704,"""0,204,204""","""censat_1_2(SST…"
…,…,…,…,…,…,…,…,…,…
"""chr1""",143424993,143742419,"""ct_1_28""",100,""".""",143424993,143742419,"""224,224,224""","""ct_1_28"""
"""chr1""",143742419,143743494,"""censat_1_24""",100,""".""",143742419,143743494,"""0,204,204""","""censat_1_24(rn…"
"""chr1""",143743494,144007117,"""ct_1_29""",100,""".""",143743494,144007117,"""224,224,224""","""ct_1_29"""
"""chr1""",144007117,144008994,"""censat_1_25""",100,""".""",144007117,144008994,"""0,204,204""","""censat_1_25(rn…"
