In [1]:
import polars as pl
import seaborn as sns
from src.utils import read_gtf

# Should incomplete-splice-match (ISM) isoforms be kept?

## Load datasets

In [2]:
# Per-isoform classification table (parquet) from the V47 pipeline results.
classification = pl.read_parquet(
    "nextflow_results/V47/final_classification.parquet"
)

In [3]:
# Peak intervals from a headerless, tab-separated BED file; the six columns
# are named explicitly so later cells can refer to them by name.
reftss = pl.read_csv(
    "data/liftovered_mm39_to_hg38_peaks_overlapped_reftss_hg38_500bp.bed",
    separator="\t",
    has_header=False,
    new_columns=["seqname", "start", "end", "name", "score", "strand"],
)

In [4]:
# Final transcript models, parsed with the project's GTF reader.
gtf = read_gtf(
    "nextflow_results/V47/final_transcripts.gtf"
)

## Calculate CAGE peak overlaps

In [5]:
# Half-width (bp) of the window around a transcript's 5' end within which a
# peak interval counts as supporting that transcript.
CAGE_WINDOW_BP = 100

# One row per transcript: chromosome plus the strand-aware 5' coordinate
# (smallest exon start on "+", largest exon end otherwise).
transcript_five_prime = (
    gtf
    .filter(pl.col("feature") == "exon")
    .group_by("transcript_id")
    .agg(
        # Native `first()` instead of a per-group Python lambda via
        # `map_elements`: same value, no Python round-trip per group.
        pl.col("seqname").first(),
        pl.col("strand").first(),
        pl.col("start").min(),
        pl.col("end").max(),
    )
    .select(
        pl.col("seqname"),
        pl.col("transcript_id"),
        pos=pl.when(pl.col("strand") == "+")
            .then(pl.col("start"))
            .otherwise(pl.col("end")),
    )
)

# Transcript ids whose 5' end lies within CAGE_WINDOW_BP of at least one peak
# on the same chromosome. Kept as a one-column DataFrame ("transcript_id").
#
# NOTE(review): peaks are matched on chromosome only; the peak `strand` column
# is not compared with the transcript strand -- confirm this is intended.
# NOTE(review): GTF and BED files conventionally use different coordinate
# conventions (1-based inclusive vs 0-based half-open); no adjustment is made
# here -- presumably negligible against the window size, but verify.
# NOTE(review): the join pairs every transcript with every peak on its
# chromosome before filtering, which can be memory-hungry on large inputs.
validated_pbids = (
    transcript_five_prime
    .join(reftss, on="seqname", how="inner")
    .filter(
        (pl.col("start") <= pl.col("pos") + CAGE_WINDOW_BP)
        & (pl.col("end") >= pl.col("pos") - CAGE_WINDOW_BP)
    )
    .unique("transcript_id")
    .select("transcript_id")
)

In [19]:
# Percentage of isoforms with peak support at their 5' end, per category.
#
# `structural_category2` keeps the four main structural categories, collapses
# everything else into "Other", and then overrides the label with
# "3prime_fragment" / "5prime_fragment" wherever the subcategory says so.
(
    classification
    .with_columns(
        # Pass the id column itself: `is_in` is documented for an expression,
        # a collection or a Series, not for a whole DataFrame.
        CAGE=pl.col("isoform").is_in(validated_pbids.get_column("transcript_id")),
        structural_category2=pl.when(
            pl.col("structural_category").is_in([
                "full-splice_match",
                "incomplete-splice_match",
                "novel_in_catalog",
                "novel_not_in_catalog",
            ])
        )
        .then(pl.col("structural_category"))
        .otherwise(pl.lit("Other")),
    )
    .with_columns(
        structural_category2=pl.when(pl.col("subcategory") == "3prime_fragment")
        .then(pl.lit("3prime_fragment"))
        .when(pl.col("subcategory") == "5prime_fragment")
        .then(pl.lit("5prime_fragment"))
        .otherwise(pl.col("structural_category2")),
    )
    .group_by("structural_category2")
    .agg(
        # Summing the boolean column counts the supported isoforms directly,
        # so no intermediate per-(category, CAGE) count table is needed.
        pl.col("CAGE").sum().alias("true_len"),
        pl.len().alias("total_len"),
    )
    .with_columns(
        (pl.col("true_len") / pl.col("total_len") * 100).alias("pct_true")
    )
)

structural_category2,true_len,total_len,pct_true
str,u32,u32,f64
"""Other""",2873,5562,51.654081
"""novel_in_catalog""",50590,62190,81.347484
"""incomplete-splice_match""",2672,4304,62.081784
"""novel_not_in_catalog""",30882,41763,73.945837
"""3prime_fragment""",942,4757,19.802396
"""5prime_fragment""",9712,11079,87.661341
"""full-splice_match""",52711,68432,77.02683


In [20]:
# One example row for each subcategory found among the
# incomplete-splice_match isoforms.
(
    classification
    .filter(pl.col("structural_category") == "incomplete-splice_match")
    .unique("subcategory")
)

isoform,chrom,strand,length,exons,structural_category,associated_gene,associated_transcript,ref_length,ref_exons,diff_to_TSS,diff_to_TTS,diff_to_gene_TSS,diff_to_gene_TTS,subcategory,RTS_stage,all_canonical,min_sample_cov,min_cov,min_cov_pos,sd_cov,FL,n_indels,n_indels_junc,bite,iso_exp,gene_exp,ratio_exp,FSM_class,coding,ORF_length,CDS_length,CDS_start,CDS_end,CDS_genomic_start,CDS_genomic_end,predicted_NMD,perc_A_downstream_TTS,seq_A_downstream_TTS,dist_to_CAGE_peak,within_CAGE_peak,dist_to_polyA_site,within_polyA_site,polyA_motif,polyA_dist,polyA_motif_found,ORF_seq,ratio_TSS,fl_assoc,cell_barcodes,containing_novel_spl
str,str,str,i32,i32,str,str,str,i32,i32,i32,i32,i32,i32,str,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,bool
"""PB.23.140""","""chr1""","""+""",3818,14,"""incomplete-splice_match""","""AGRN""","""ENST00000652369.2""",7411,35,13761,8,392,-1,"""intron_retention""","""FALSE""","""canonical""",,,,,,,,False,,,,"""C""","""non_coding""",,,,,,,,25.0,"""TAAAAGCATTGCTTTTGTCC""",,,,,,,,,,,,False
"""PB.6.53""","""chr1""","""-""",5260,9,"""incomplete-splice_match""","""WASH7P""","""ENST00000831505.1""",1705,11,7237,5,2715,0,"""3prime_fragment""","""FALSE""","""canonical""",,,,,,,,False,,,,"""C""","""non_coding""",,,,,,,,35.0,"""AGAAACCAACAGTGTGCTTT""",,,,,,,,,,,,False
"""PB.6.1011""","""chr1""","""-""",5551,2,"""incomplete-splice_match""","""ENSG00000292994""","""ENST00000634344.2""",1554,3,44,4236,44,255,"""5prime_fragment""","""FALSE""","""canonical""",,,,,,,,False,,,,"""B""","""non_coding""",,,,,,,,50.0,"""TAAAAAAACTGTGTTTTAAA""",,,,,,,,,,,,False
"""PB.419.241""","""chr1""","""+""",5924,37,"""incomplete-splice_match""","""VPS13D""","""ENST00000011700.10""",10969,52,44797,2719,10392,479,"""internal_fragment""","""FALSE""","""canonical""",,,,,,,,False,,,,"""C""","""non_coding""",,,,,,,,20.0,"""TAACGTTCCTAGAGGGCTGG""",,,,,,,,,,,,False


In [23]:
# Isoform ids whose subcategory is "intron_retention", kept as a one-column
# DataFrame ("isoform").
IR_pbids = (
    classification
    .filter(pl.col("subcategory") == "intron_retention")
    .select("isoform")
)

In [22]:
# Protein-level classification table (tab-separated, with a header row).
protein_classification = pl.read_csv(
    "export/SFARI.protein_classification.tsv",
    separator="\t",
)

In [24]:
# Protein classification rows for the intron-retention isoforms.
# The id column is passed as a Series: `is_in` is documented for an
# expression, a collection or a Series, not for a whole DataFrame.
(
    protein_classification
    .filter(pl.col("pb").is_in(IR_pbids.get_column("isoform")))
)

pb,tx_cat,pr_splice_cat,tx_subcat,pr_splice_subcat,tx_tss_diff,tx_tts_diff,tx_tss_gene_diff,tx_tts_gene_diff,pr_nterm_diff,pr_cterm_diff,pr_nterm_gene_diff,pr_cterm_gene_diff,tx_transcripts,pr_transcripts,tx_gene,pr_gene,tx_num_exons,pr_num_exons,is_nmd,num_junc_after_stop_codon,num_nt_after_stop_codon,tx_5hang,tx_3hang,pr_nhang,pr_chang,utr_exon_status,utr_cat,protein_classification,protein_classification_base,protein_classification_subset,base_isoform
str,str,str,str,str,f64,f64,i64,i64,f64,f64,f64,f64,str,str,str,str,i64,i64,bool,i64,i64,f64,f64,f64,f64,str,str,str,str,str,str
"""PB.107407.382""","""novel_not_in_catalog""","""full-splice_match""","""intron_retention""","""multi-exon""",,,-1,5012,0.0,0.0,0.0,0.0,"""novel""","""ENST00000381657.8""","""ENSG00000182378.15""","""ENSG00000182378.15""",9,6,true,2,2146,,,0.0,0.0,"""multiexonic""","""unique""","""pFSM,known_nterm_known_splice_…","""pFSM""","""known_nterm_known_splice_known…","""PB.107407.38"""
"""PB.107442.404""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""3prime_fragment""",,,-56,0,4592.0,0.0,4592.0,0.0,"""novel""","""ENST00000381401.11""","""ENSG00000169100.14""","""ENSG00000169100.14""",4,2,false,0,1773,,,-2.0,0.0,"""multiexonic""","""unique""","""pNNC,novel_nterm_known_splice_…","""pNNC""","""novel_nterm_known_splice_known…","""PB.107442.402"""
"""PB.107667.109""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""5prime_fragment""",,,-65,0,0.0,1705.0,0.0,-1705.0,"""novel""","""ENST00000452575.1""","""ENSG00000146950.13""","""ENSG00000146950.13""",5,4,false,0,5258,,,0.0,90.0,"""multiexonic""","""unique""","""pNNC,known_nterm_known_splice_…","""pNNC""","""known_nterm_known_splice_novel…","""PB.107667.109"""
"""PB.107667.528""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""3prime_fragment""",,,-3,4,13237.0,0.0,13237.0,0.0,"""novel""","""ENST00000674669.1""","""ENSG00000073464.13""","""ENSG00000073464.13""",11,5,false,0,3533,,,-141.0,0.0,"""multiexonic""","""unique""","""pNNC,novel_nterm_known_splice_…","""pNNC""","""novel_nterm_known_splice_known…","""PB.107667.528"""
"""PB.107687.14""","""novel_in_catalog""","""full-splice_match""","""intron_retention""","""multi-exon""",,,-4,7,0.0,0.0,0.0,0.0,"""novel""","""ENST00000321143.8""","""ENSG00000004961.15""","""ENSG00000004961.15""",6,6,false,0,2047,,,0.0,0.0,"""monoexonic""","""subset""","""pFSM,known_nterm_known_splice_…","""pFSM""","""known_nterm_known_splice_known…","""PB.107687.33"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""PB.107399.98""","""novel_in_catalog""","""full-splice_match""","""intron_retention""","""multi-exon""",,,7,0,0.0,0.0,0.0,0.0,"""novel""","""ENST00000425098.5""","""ENSG00000079974.19""","""ENSG00000079974.19""",9,5,true,2,1903,,,0.0,0.0,"""multiexonic""","""unique""","""pFSM,known_nterm_known_splice_…","""pFSM""","""known_nterm_known_splice_known…","""PB.107399.123"""
"""PB.107399.143""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""5prime_fragment""",,,0,0,0.0,1100.0,0.0,-332.0,"""novel""","""ENST00000354869.8""","""ENSG00000079974.19""","""ENSG00000079974.19""",7,5,true,1,2421,,,0.0,26.0,"""multiexonic""","""unique""","""pNNC,known_nterm_known_splice_…","""pNNC""","""known_nterm_known_splice_novel…","""PB.107399.47"""
"""PB.107399.164""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""5prime_fragment""",,,7,-1,0.0,1100.0,0.0,-332.0,"""novel""","""ENST00000354869.8""","""ENSG00000079974.19""","""ENSG00000079974.19""",7,5,true,2,3228,,,0.0,26.0,"""monoexonic""","""subset""","""pNNC,known_nterm_known_splice_…","""pNNC""","""known_nterm_known_splice_novel…","""PB.107399.47"""
"""PB.107399.21""","""novel_in_catalog""","""incomplete-splice_match""","""intron_retention""","""5prime_fragment""",,,2,0,0.0,1100.0,0.0,-332.0,"""novel""","""ENST00000354869.8""","""ENSG00000079974.19""","""ENSG00000079974.19""",8,5,true,2,2555,,,0.0,26.0,"""multiexonic""","""unique""","""pNNC,known_nterm_known_splice_…","""pNNC""","""known_nterm_known_splice_novel…","""PB.107399.47"""
