In [1]:
import pandas as pd
import numpy as np
import gffutils
from pybedtools import BedTool
from collections import defaultdict
import warnings

In [2]:
f = "test_data_m6anet_mod_sites.csv"
gtf_file = "gencode.v32.annotation.gtf"

In [48]:
print("Loading data...")

# Load modification sites
modification_sites = pd.read_csv(f)
modification_sites

Loading data...


Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id
0,ENST00000506640,ENSG00000228327,6083,29,0.98085,GAACT,0.655172,s1,caud
1,ENST00000506640,ENSG00000228327,6171,23,0.965541,AGACT,0.913043,s1,caud
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud
5,ENST00000660930,ENSG00000078808,1502,45,0.96971,GGACT,0.777778,s2,caud
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud
8,ENST00000494748,ENSG00000078808,2904,81,0.94142,GGACT,0.765432,s2,caud
9,ENST00000494748,ENSG00000078808,3337,111,0.903208,AAACT,0.558559,s2,caud


In [4]:
db_file = gtf_file + ".db"
try:
    db = gffutils.FeatureDB(db_file, keep_order=True)
except:
    print("Creating GTF database...")
    db = gffutils.create_db(gtf_file, dbfn=db_file, force=True, keep_order=True, disable_infer_genes=True, disable_infer_transcripts=True)
    db = gffutils.FeatureDB(db_file, keep_order=True)
db

<gffutils.interface.FeatureDB at 0x7f0f441a3a40>

In [21]:
# Extract the unique transcript IDs from the modification sites
transcript_ids_to_keep = set(modification_sites["transcript_id"])

# Filter features from the GTF database to include only relevant transcript IDs
filtered_features = []
transcript_data = []
for feature in db.all_features():  # Iterate over all features in the database
    if "transcript_id" in feature.attributes:
        transcript_id = feature.attributes["transcript_id"][0].split(".")[0]  # Remove version
        if transcript_id in transcript_ids_to_keep:
            filtered_features.append(feature)  # Keep only relevant features
            if feature.featuretype == "transcript":
                transcript_data.append({
                "transcript_id": feature.id.split(".")[0],  # Remove version
                "gene_name": feature.attributes.get("gene_name", [None])[0],
                "transcript_type": feature.attributes.get("transcript_type", [None])[0],
            })
filtered_features, transcript_data

([<Feature transcript (chr1:634376-634922[+]) at 0x7f0f43327950>,
  <Feature exon (chr1:634376-634922[+]) at 0x7f0f43327e90>,
  <Feature transcript (chr1:725885-778626[-]) at 0x7f0f4334cb90>,
  <Feature exon (chr1:778284-778626[-]) at 0x7f0f4334ce30>,
  <Feature exon (chr1:774171-774280[-]) at 0x7f0f4334c7d0>,
  <Feature exon (chr1:772976-773107[-]) at 0x7f0f4334cd40>,
  <Feature exon (chr1:769497-769712[-]) at 0x7f0f4334d0d0>,
  <Feature exon (chr1:768548-768613[-]) at 0x7f0f4334d310>,
  <Feature exon (chr1:766329-766387[-]) at 0x7f0f4334d520>,
  <Feature exon (chr1:764723-765247[-]) at 0x7f0f4334d730>,
  <Feature exon (chr1:758967-759123[-]) at 0x7f0f4334d940>,
  <Feature exon (chr1:756077-756141[-]) at 0x7f0f4334db50>,
  <Feature exon (chr1:746695-746818[-]) at 0x7f0f4334dd60>,
  <Feature exon (chr1:744728-744825[-]) at 0x7f0f4334df70>,
  <Feature exon (chr1:744195-744391[-]) at 0x7f0f4334e1b0>,
  <Feature exon (chr1:743180-743350[-]) at 0x7f0f4334e3f0>,
  <Feature exon (chr1:735423

In [22]:
transcript_data = pd.DataFrame(transcript_data).drop_duplicates("transcript_id")
transcript_data

Unnamed: 0,transcript_id,gene_name,transcript_type
0,ENST00000416718,MTCO3P12,unprocessed_pseudogene
1,ENST00000506640,AL669831.1,lncRNA
2,ENST00000360001,SDF4,protein_coding
3,ENST00000660930,SDF4,protein_coding
4,ENST00000465727,SDF4,nonsense_mediated_decay
5,ENST00000263741,SDF4,protein_coding
6,ENST00000494748,SDF4,retained_intron
7,ENST00000478938,SDF4,retained_intron
8,ENST00000379198,B3GALT6,protein_coding
9,ENST00000492936,ACAP3,retained_intron


In [23]:
# Group filtered features by transcript_id
grouped_features = defaultdict(list)
for feature in filtered_features:
    transcript_id = feature.attributes["transcript_id"][0].split(".")[0]  # Remove version
    grouped_features[transcript_id].append(feature)
grouped_features

defaultdict(list,
            {'ENST00000416718': [<Feature transcript (chr1:634376-634922[+]) at 0x7f0f43327950>,
              <Feature exon (chr1:634376-634922[+]) at 0x7f0f43327e90>],
             'ENST00000506640': [<Feature transcript (chr1:725885-778626[-]) at 0x7f0f4334cb90>,
              <Feature exon (chr1:778284-778626[-]) at 0x7f0f4334ce30>,
              <Feature exon (chr1:774171-774280[-]) at 0x7f0f4334c7d0>,
              <Feature exon (chr1:772976-773107[-]) at 0x7f0f4334cd40>,
              <Feature exon (chr1:769497-769712[-]) at 0x7f0f4334d0d0>,
              <Feature exon (chr1:768548-768613[-]) at 0x7f0f4334d310>,
              <Feature exon (chr1:766329-766387[-]) at 0x7f0f4334d520>,
              <Feature exon (chr1:764723-765247[-]) at 0x7f0f4334d730>,
              <Feature exon (chr1:758967-759123[-]) at 0x7f0f4334d940>,
              <Feature exon (chr1:756077-756141[-]) at 0x7f0f4334db50>,
              <Feature exon (chr1:746695-746818[-]) at 0x7f0f4334dd

In [25]:
def calculate_transcript_features(grouped_features):
    """
    Compute transcript features for grouped transcript data.
    For each transcript:
    - Sort CDS features by start and end positions to find start_codon and stop_codon positions.
    - Calculate CDS length by summing all CDS feature lengths.
    - Determine UTR5 and UTR3 lengths based on UTR positions relative to start_codon and stop_codon, considering the strand.
    - If no CDS features exist, set CDS_len, utr5_len, and utr3_len to 0.
    """

    transcript_features = []  # Store the results for all transcripts

    for tx_id, features in grouped_features.items():
        cds_length = 0
        utr5_length = 0
        utr3_length = 0
        tx_length = 0
        start_codon_pos = None
        stop_codon_pos = None

        # Separate features by type
        cds_features = [f for f in features if f.featuretype == "CDS"]
        utr_features = [f for f in features if f.featuretype == "UTR"]
        exon_features = [f for f in features if f.featuretype == "exon"]
        transcript_feature = next((f for f in features if f.featuretype == "transcript"), None)


        # If no CDS features exist, set all lengths to 0
        if not cds_features:
            cds_length = 0
            utr5_length = 0
            utr3_length = 0
            for exon in exon_features:
                tx_length += exon.end - exon.start + 1

        else:
            # Sort CDS features by start and end positions
            sorted_cds = sorted(cds_features, key=lambda x: (x.start, x.end))
            if utr_features[0].strand == "+":  # Positive strand
                start_codon_pos = sorted_cds[0].start  # First CDS start
                stop_codon_pos = sorted_cds[-1].end  # Last CDS end
            elif utr_features[0].strand == "-":  # Negative strand
                start_codon_pos = sorted_cds[-1].end  # Last CDS end
                stop_codon_pos = sorted_cds[0].start  # First CDS start

            # Calculate CDS length by summing the lengths of all CDS features
            for cds in sorted_cds:
                cds_length += cds.end - cds.start + 1

            # Handle UTRs and classify them as UTR5 or UTR3
            if utr_features:
                strand = utr_features[0].strand  # Assume all UTRs share the same strand
                for utr in utr_features:
                    utr_length = utr.end - utr.start + 1
                    if strand == "+":
                        if utr.end < start_codon_pos:  # 5' UTR is before start_codon
                            utr5_length += utr_length
                        elif utr.start > stop_codon_pos:  # 3' UTR is after stop_codon
                            utr3_length += utr_length
                    elif strand == "-":
                        if utr.start > start_codon_pos:  # 5' UTR is after start_codon on negative strand
                            utr5_length += utr_length
                        elif utr.end < stop_codon_pos:  # 3' UTR is before stop_codon on negative strand
                            utr3_length += utr_length
            tx_length = cds_length + utr3_length + utr5_length
       
        # Add the results for this transcript
        transcript_features.append({
            "transcript_id": tx_id,
            "tx_length": tx_length,
            "cds_length": cds_length,
            "utr5_length": utr5_length,
            "utr3_length": utr3_length
        })

    return pd.DataFrame(transcript_features)
# Compute transcript features for the filtered transcripts
tx_features = calculate_transcript_features(grouped_features)
tx_features

Unnamed: 0,transcript_id,tx_length,cds_length,utr5_length,utr3_length
0,ENST00000416718,547,0,0,0
1,ENST00000506640,6432,0,0,0
2,ENST00000360001,1956,1065,284,607
3,ENST00000660930,2137,1044,329,764
4,ENST00000465727,2095,762,293,1040
5,ENST00000263741,2079,1023,314,742
6,ENST00000494748,3516,0,0,0
7,ENST00000478938,2591,0,0,0
8,ENST00000379198,2805,987,42,1776
9,ENST00000492936,5319,0,0,0


In [26]:
tx_features = tx_features.merge(transcript_data, on="transcript_id", how="left")
tx_features

Unnamed: 0,transcript_id,tx_length,cds_length,utr5_length,utr3_length,gene_name,transcript_type
0,ENST00000416718,547,0,0,0,MTCO3P12,unprocessed_pseudogene
1,ENST00000506640,6432,0,0,0,AL669831.1,lncRNA
2,ENST00000360001,1956,1065,284,607,SDF4,protein_coding
3,ENST00000660930,2137,1044,329,764,SDF4,protein_coding
4,ENST00000465727,2095,762,293,1040,SDF4,nonsense_mediated_decay
5,ENST00000263741,2079,1023,314,742,SDF4,protein_coding
6,ENST00000494748,3516,0,0,0,SDF4,retained_intron
7,ENST00000478938,2591,0,0,0,SDF4,retained_intron
8,ENST00000379198,2805,987,42,1776,B3GALT6,protein_coding
9,ENST00000492936,5319,0,0,0,ACAP3,retained_intron


In [27]:
# Filter exons for relevant transcripts
exons = defaultdict(list)
for tx_id, features in grouped_features.items():
    for feature in features:
        if feature.featuretype == "exon":
            exons[tx_id].append(feature)
exons

defaultdict(list,
            {'ENST00000416718': [<Feature exon (chr1:634376-634922[+]) at 0x7f0f43327e90>],
             'ENST00000506640': [<Feature exon (chr1:778284-778626[-]) at 0x7f0f4334ce30>,
              <Feature exon (chr1:774171-774280[-]) at 0x7f0f4334c7d0>,
              <Feature exon (chr1:772976-773107[-]) at 0x7f0f4334cd40>,
              <Feature exon (chr1:769497-769712[-]) at 0x7f0f4334d0d0>,
              <Feature exon (chr1:768548-768613[-]) at 0x7f0f4334d310>,
              <Feature exon (chr1:766329-766387[-]) at 0x7f0f4334d520>,
              <Feature exon (chr1:764723-765247[-]) at 0x7f0f4334d730>,
              <Feature exon (chr1:758967-759123[-]) at 0x7f0f4334d940>,
              <Feature exon (chr1:756077-756141[-]) at 0x7f0f4334db50>,
              <Feature exon (chr1:746695-746818[-]) at 0x7f0f4334dd60>,
              <Feature exon (chr1:744728-744825[-]) at 0x7f0f4334df70>,
              <Feature exon (chr1:744195-744391[-]) at 0x7f0f4334e1b0>,
       

In [39]:
# Function to convert transcript coordinates to genomic coordinates
def convert_to_genome_coordinates(tx_name, tx_pos, exonsdb, txfdb):
    """
    Convert transcript coordinates to genome coordinates and annotate regions.
    """
    if tx_name not in exonsdb:
        warnings.warn(f"Transcript {tx_name} not found in exons database.")
        return {
            "genome_pos": None,
            "chromosome": None,
            "dist_up_exon_junc": None,
            "dist_down_exon_junc": None,
            "region": None
        }
    
    tx_exons = sorted(exonsdb[tx_name], key=lambda x: x.start)  # Sort exons by start
    chrom = tx_exons[0].chrom
    strand = tx_exons[0].strand
    
    # Calculate cumulative exon lengths
    exon_lengths = [exon.end - exon.start + 1 for exon in tx_exons]
    cum_lengths = np.cumsum(exon_lengths)

    # Identify the exon containing the position
    idx = np.searchsorted(cum_lengths, tx_pos)
    
    if idx >= len(tx_exons):
        warnings.warn(f"Transcript position {tx_pos} is out of range for transcript {tx_name}")
        return {
            "genome_pos": None,
            "chromosome": None,
            "dist_up_exon_junc": None,
            "dist_down_exon_junc": None,
            "region": None
        }
    
    # Compute genomic position
    rel_pos = tx_pos - (cum_lengths[idx - 1] if idx > 0 else 0)
    exon = tx_exons[idx]
    if strand == "+":
        genome_pos = exon.start + rel_pos - 1
    else:
        genome_pos = exon.end - rel_pos + 1
    
    # Determine distance to exon junctions
    dist_upstream = rel_pos
    dist_downstream = exon.end - exon.start - rel_pos

    # Determine region (UTR, CDS, etc.)
    tx_feat = txfdb[txfdb["transcript_id"] == tx_name].iloc[0]
    if tx_feat["cds_length"] == 0:
        region = "ncRNA"
    elif tx_pos <= tx_feat["utr5_length"]:
        region = "UTR5"
    elif tx_pos <= tx_feat["utr5_length"] + tx_feat["cds_length"]:
        region = "CDS"
    elif tx_pos <= tx_feat["tx_length"]:
        region = "UTR3"
    else:
        region = None
    
    return {
        "genome_pos": genome_pos,
        "chromosome": chrom,
        "dist_up_exon_junc": dist_upstream,
        "dist_down_exon_junc": dist_downstream,
        "region": region
    }

In [49]:
modification_sites["annotation"] = modification_sites.apply(
    lambda row: convert_to_genome_coordinates(row["transcript_id"], row["transcript_position"],exons, tx_features), axis=1
)
modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,annotation
0,ENST00000506640,ENSG00000228327,6083,29,0.98085,GAACT,0.655172,s1,caud,"{'genome_pos': 774177, 'chromosome': 'chr1', '..."
1,ENST00000506640,ENSG00000228327,6171,23,0.965541,AGACT,0.913043,s1,caud,"{'genome_pos': 778545, 'chromosome': 'chr1', '..."
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,"{'genome_pos': 634716, 'chromosome': 'chr1', '..."
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,"{'genome_pos': 1223879, 'chromosome': 'chr1', ..."
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,"{'genome_pos': 1228561, 'chromosome': 'chr1', ..."
5,ENST00000660930,ENSG00000078808,1502,45,0.96971,GGACT,0.777778,s2,caud,"{'genome_pos': 1228927, 'chromosome': 'chr1', ..."
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,"{'genome_pos': 1228941, 'chromosome': 'chr1', ..."
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,"{'genome_pos': 1228941, 'chromosome': 'chr1', ..."
8,ENST00000494748,ENSG00000078808,2904,81,0.94142,GGACT,0.765432,s2,caud,"{'genome_pos': 1219381, 'chromosome': 'chr1', ..."
9,ENST00000494748,ENSG00000078808,3337,111,0.903208,AAACT,0.558559,s2,caud,"{'genome_pos': 1218948, 'chromosome': 'chr1', ..."


In [50]:
# Expand dictionary columns
annotation_df = modification_sites["annotation"].apply(pd.Series)
modification_sites = pd.concat([modification_sites, annotation_df], axis=1).drop(columns=["annotation"])
modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,genome_pos,chromosome,dist_up_exon_junc,dist_down_exon_junc,region
0,ENST00000506640,ENSG00000228327,6083,29,0.98085,GAACT,0.655172,s1,caud,774177,chr1,104,5,ncRNA
1,ENST00000506640,ENSG00000228327,6171,23,0.965541,AGACT,0.913043,s1,caud,778545,chr1,82,260,ncRNA
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,634716,chr1,341,205,ncRNA
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,1223879,chr1,90,46,CDS
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,1228561,chr1,386,92,UTR3
5,ENST00000660930,ENSG00000078808,1502,45,0.96971,GGACT,0.777778,s2,caud,1228927,chr1,20,458,UTR3
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
8,ENST00000494748,ENSG00000078808,2904,81,0.94142,GGACT,0.765432,s2,caud,1219381,chr1,1971,611,ncRNA
9,ENST00000494748,ENSG00000078808,3337,111,0.903208,AAACT,0.558559,s2,caud,1218948,chr1,2404,178,ncRNA


In [51]:
# Merge with transcript biotype information
annotated_modification_sites = modification_sites.merge(transcript_data, on="transcript_id", how="left")
annotated_modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,genome_pos,chromosome,dist_up_exon_junc,dist_down_exon_junc,region,gene_name,transcript_type
0,ENST00000506640,ENSG00000228327,6083,29,0.98085,GAACT,0.655172,s1,caud,774177,chr1,104,5,ncRNA,AL669831.1,lncRNA
1,ENST00000506640,ENSG00000228327,6171,23,0.965541,AGACT,0.913043,s1,caud,778545,chr1,82,260,ncRNA,AL669831.1,lncRNA
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,634716,chr1,341,205,ncRNA,MTCO3P12,unprocessed_pseudogene
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,1223879,chr1,90,46,CDS,SDF4,protein_coding
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,1228561,chr1,386,92,UTR3,SDF4,protein_coding
5,ENST00000660930,ENSG00000078808,1502,45,0.96971,GGACT,0.777778,s2,caud,1228927,chr1,20,458,UTR3,SDF4,protein_coding
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3,SDF4,nonsense_mediated_decay
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3,SDF4,protein_coding
8,ENST00000494748,ENSG00000078808,2904,81,0.94142,GGACT,0.765432,s2,caud,1219381,chr1,1971,611,ncRNA,SDF4,retained_intron
9,ENST00000494748,ENSG00000078808,3337,111,0.903208,AAACT,0.558559,s2,caud,1218948,chr1,2404,178,ncRNA,SDF4,retained_intron


In [52]:
annotated_modification_sites = annotated_modification_sites.sort_values(by='transcript_id')
annotated_modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,genome_pos,chromosome,dist_up_exon_junc,dist_down_exon_junc,region,gene_name,transcript_type
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3,SDF4,protein_coding
16,ENST00000354700,ENSG00000131584,3222,78,0.955948,AGACA,0.794872,s2,caud,1300627,chr1,66,117,UTR3,ACAP3,protein_coding
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,1223879,chr1,90,46,CDS,SDF4,protein_coding
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,1228561,chr1,386,92,UTR3,SDF4,protein_coding
14,ENST00000379198,ENSG00000176022,2612,31,0.952746,GGACT,0.677419,s2,caud,1234848,chr1,2612,192,UTR3,B3GALT6,protein_coding
13,ENST00000379198,ENSG00000176022,2579,27,0.99583,GGACA,0.962963,s2,caud,1234815,chr1,2579,225,UTR3,B3GALT6,protein_coding
12,ENST00000379198,ENSG00000176022,2432,22,0.987061,GGACT,0.909091,s2,caud,1234668,chr1,2432,372,UTR3,B3GALT6,protein_coding
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,634716,chr1,341,205,ncRNA,MTCO3P12,unprocessed_pseudogene
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3,SDF4,nonsense_mediated_decay
17,ENST00000467278,ENSG00000131584,2633,76,0.96678,AGACA,0.868421,s2,caud,1298659,chr1,297,91,ncRNA,ACAP3,retained_intron


In [43]:
# Save output
output_prefix = 'test_output'
output_file = f"4_{output_prefix}_{f}"
annotated_modification_sites.to_csv(output_file, index=False)

print("Annotated modification sites saved to", output_file)

Annotated modification sites saved to 4_test_output_test_data_m6anet_mod_sites.csv


In [26]:
results = []
for _, row in modification_sites.iterrows():  # Iterate over rows in the DataFrame
    # Process the row to compute genomic coordinates
    result = convert_to_genome_coordinates(
        row["transcript_id"], row["transcript_position"], exons, tx_features
    )

    # Merge the original row data into the result dictionary
    result.update(row.to_dict())

    # Reorder the fields in the desired order
    reordered_result = {
        # Fields from the original modification_sites DataFrame
        "transcript_id": result["transcript_id"],
        "gene_id": result["gene_id"],
        "transcript_position": result["transcript_position"],
        "n_reads": result["n_reads"],
        "probability_modified": result["probability_modified"],
        "kmer": result["kmer"],
        "mod_ratio": result["mod_ratio"],
        "sample_id": result["sample_id"],
        "group_id": result["group_id"],
        # Fields calculated by convert_to_genome_coordinates
        "genome_pos": result["genome_pos"],
        "chromosome": result["chromosome"],
        "dist_up_exon_junc": result["dist_up_exon_junc"],
        "dist_down_exon_junc": result["dist_down_exon_junc"],
        "region": result["region"]
    }

    # Append the reordered result to the results list
    results.append(reordered_result)

# Convert the final results list to a DataFrame
annotated_modification_sites = pd.DataFrame(results)
annotated_modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,genome_pos,chromosome,dist_up_exon_junc,dist_down_exon_junc,region
0,ENST00000506640,ENSG00000228327,6083,29,0.98085,GAACT,0.655172,s1,caud,774177,chr1,104,5,ncRNA
1,ENST00000506640,ENSG00000228327,6171,23,0.965541,AGACT,0.913043,s1,caud,778545,chr1,82,260,ncRNA
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,634716,chr1,341,205,ncRNA
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,1223879,chr1,90,46,CDS
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,1228561,chr1,386,92,UTR3
5,ENST00000660930,ENSG00000078808,1502,45,0.96971,GGACT,0.777778,s2,caud,1228927,chr1,20,458,UTR3
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
8,ENST00000494748,ENSG00000078808,2904,81,0.94142,GGACT,0.765432,s2,caud,1219381,chr1,1971,611,ncRNA
9,ENST00000494748,ENSG00000078808,3337,111,0.903208,AAACT,0.558559,s2,caud,1218948,chr1,2404,178,ncRNA


In [30]:
annotated_modification_sites = annotated_modification_sites.sort_values(by='transcript_id')
annotated_modification_sites

Unnamed: 0,transcript_id,gene_id,transcript_position,n_reads,probability_modified,kmer,mod_ratio,sample_id,group_id,genome_pos,chromosome,dist_up_exon_junc,dist_down_exon_junc,region
7,ENST00000263741,ENSG00000078808,1466,42,0.928761,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
16,ENST00000354700,ENSG00000131584,3222,78,0.955948,AGACA,0.794872,s2,caud,1300627,chr1,66,117,UTR3
3,ENST00000360001,ENSG00000078808,1320,157,0.936434,GGACT,0.687898,s1,caud,1223879,chr1,90,46,CDS
4,ENST00000360001,ENSG00000078808,1753,194,0.93897,AAACT,0.56701,s1,caud,1228561,chr1,386,92,UTR3
14,ENST00000379198,ENSG00000176022,2612,31,0.952746,GGACT,0.677419,s2,caud,1234848,chr1,2612,192,UTR3
13,ENST00000379198,ENSG00000176022,2579,27,0.99583,GGACA,0.962963,s2,caud,1234815,chr1,2579,225,UTR3
12,ENST00000379198,ENSG00000176022,2432,22,0.987061,GGACT,0.909091,s2,caud,1234668,chr1,2432,372,UTR3
2,ENST00000416718,ENSG00000198744,341,801,0.92723,GGACA,0.666667,s1,caud,634716,chr1,341,205,ncRNA
6,ENST00000465727,ENSG00000078808,1482,42,0.949604,GGACT,0.761905,s2,caud,1228941,chr1,6,472,UTR3
17,ENST00000467278,ENSG00000131584,2633,76,0.96678,AGACA,0.868421,s2,caud,1298659,chr1,297,91,ncRNA


In [31]:
# Save output
output_prefix = '3_test_output'
output_file = f"{output_prefix}_{f}"
annotated_modification_sites.to_csv(output_file, index=False)

print("Annotated modification sites saved to", output_file)

Annotated modification sites saved to 3_test_output_test_data_m6anet_mod_sites.csv
