## RNA-seq Pipeline for Detecting Exitrons (Exonic Introns)

### 0. Installs

In [36]:
import pandas as pd
import glob
import os
from tqdm import tqdm # progress tracker
import pyranges as pr # parsing gff
import numpy as np

### 1. Parse Regtools Data 
- Parse raw junction data from regtools output files

- Processes 1 file at a time

In [37]:
def parseJunctionFile(file_path):
    # column names for RegTools junction files
    regtools_column_names = [
        'chrom', 'start_anchor', 'end_anchor', 'name', 'score', 'strand',
        'thick_start_orig', 'thick_end_orig', 'item_rgb_orig',
        'block_count_orig', 'block_sizes_orig', 'block_starts_orig'
    ]
    
    # extract sample ID from the filename
    sample_id = os.path.basename(file_path).split('.')[0]
    
    # read the file into a pandas DataFrame
    df = pd.read_csv(
        file_path, sep='\t', header=None, names=regtools_column_names,
        dtype={'chrom': str, 'block_sizes_orig': str, 'block_starts_orig': str}
    )
        
    df['sample_id_source'] = sample_id

    # convert relevant columns to numeric types, coercing errors
    for col in ['start_anchor', 'end_anchor', 'score']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # drop rows if info is missing
    df.dropna(subset=['start_anchor', 'end_anchor', 'score', 'block_sizes_orig'], inplace=True)
    
    # ensure int types
    for col in ['start_anchor', 'end_anchor', 'score']:
        df[col] = df[col].astype(int)

    return df

### 2. Transform Junction Data ** optimization needed **
- Recalculates junction coordinates, following Regtools documentation to take into account blockSize

- Recalculates block size to represent length of junction

- Outputs junction info in BED12 format

In [38]:
def transformJunctionData(raw_df):
    
    # CHROMOSOME FILTERING
    original_row_count = len(raw_df)
    
    # allowed chromosomes
    allowed_chrom_numbers = [str(i) for i in range(1, 23)]
    allowed_sex_chroms_upper = ['X', 'Y'] 
    allowed_chromosomes = set()
    for num_chrom in allowed_chrom_numbers:
        allowed_chromosomes.add(num_chrom)
        allowed_chromosomes.add(f"chr{num_chrom}")
    for sex_chrom in allowed_sex_chroms_upper:
        allowed_chromosomes.add(sex_chrom)
        allowed_chromosomes.add(sex_chrom.lower())
        allowed_chromosomes.add(f"chr{sex_chrom}")
        allowed_chromosomes.add(f"chr{sex_chrom.lower()}")
    
    raw_df_filtered = raw_df[raw_df['chrom'].isin(allowed_chromosomes)].copy()
    filtered_row_count = len(raw_df_filtered)
    print(f"Removed {original_row_count - filtered_row_count} rows with non-standard chromosomes.")

    # JUNCTION COORD CORRECTION
    # initialize lists for each column
    chrom_list = []
    chromStart_list = []
    chromEnd_list = []
    name_list = []
    score_list = []
    strand_list = []
    thickStart_list = []
    thickEnd_list = []
    itemRgb_list = []
    blockCount_list = []
    blockSizes_list = []
    blockStarts_list = []
    sample_id_source_list = []

    # iterate over the filtered df (set to 1000 rows)
    # realStart = 

    for i, (index, row) in enumerate(raw_df_filtered.iterrows()):

        regtools_start = row['start_anchor']
        regtools_end = row['end_anchor']
        regtools_block_sizes_str = row['block_sizes_orig']

        parsed_block_sizes = [int(s) for s in regtools_block_sizes_str.strip(',').split(',')]
        if len(parsed_block_sizes) < 2:
            skipped_rows += 1
            continue
        
        overhang_left = parsed_block_sizes[0]
        overhang_right = parsed_block_sizes[1]

        junc_start = regtools_start + overhang_left
        junc_end = regtools_end - overhang_right

        if junc_start >= junc_end: 
            skipped_rows += 1
            continue

        junc_length = junc_end - junc_start

        # add values to respective lists
        chrom_list.append(row['chrom'])
        chromStart_list.append(junc_start)
        chromEnd_list.append(junc_end)
        name_list.append(row['name'])
        score_list.append(row['score'])
        strand_list.append(row['strand'])
        thickStart_list.append(junc_start)
        thickEnd_list.append(junc_end)
        itemRgb_list.append(row.get('item_rgb_orig', '0'))
        blockCount_list.append(1)
        blockSizes_list.append(str(junc_length))
        blockStarts_list.append("0")
        sample_id_source_list.append(row['sample_id_source'])

    # create df from dictionary of lists
    transformed_df = pd.DataFrame({
        'chrom': chrom_list,
        'chromStart': chromStart_list,
        'chromEnd': chromEnd_list,
        'name': name_list,
        'score': score_list,
        'strand': strand_list,
        'thickStart': thickStart_list,
        'thickEnd': thickEnd_list,
        'itemRgb': itemRgb_list,
        'blockCount': blockCount_list,
        'blockSizes': blockSizes_list,
        'blockStarts': blockStarts_list,
        'sample_id_source': sample_id_source_list
    })
    
    print(f"Transformed {len(transformed_df)} junction records.")
    return transformed_df

### 3. Find Exitrons Within Junction Data
- Converts transformed junction data (transformed_df) and exon data (from gff3 file) into PyRanges objects with labels Chromosome, Start, End, Strand, and Title (a unique junction id formed by chrom:start:end:strand)

- Finds junctions that overlap with CDS regions using PyRanges method .overlap

In [39]:
# convert CDS data to PyRanges object
gff = pr.read_gff3("gencode.v48.annotation.gff3.gz")
cds = gff[gff.Feature == "CDS"]
print(f"Found {len(cds)} CDS regions.")

Found 903356 CDS regions.


In [40]:
def findExitrons(transformed_df):
    # generate a unique ID for each junction (chrom:start:end:strand
    unique_id = transformed_df['chrom'].astype(str) + ':' + \
                transformed_df['chromStart'].astype(str) + ':' + \
                transformed_df['chromEnd'].astype(str) + ':' + \
                transformed_df['strand'].astype(str)

    # convert junction data to PyRanges object
    junction_pr = pr.PyRanges({'Chromosome': transformed_df['chrom'],
                    'Start': transformed_df['chromStart'],
                    'End': transformed_df['chromEnd'],
                    'Strand': transformed_df['strand'],
                    'title': unique_id,
                    'reads': transformed_df['score'],
                    'sourceID': transformed_df['sample_id_source']}) 

    # find overlapping junctions
    contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
    print(f"Found {len(contained_junctions)} junctions contained within CDS regions.")
            
    return contained_junctions

### 4. Compile All Exitron Info
- Iterates through each person's file, finding all exitron data then concatenating to a final matrix

- Includes person ID (file name) and junction scores (total reads)

In [41]:
def compileExitronData(directory_path, file_pattern="*.bam.junc"):

    all_exitron_info = []
    file_paths = glob.glob(os.path.join(directory_path, file_pattern))
    print(f"Found {len(file_paths)} files to process.")

    # testing first 5 out of 100
    files_to_process = file_paths[:5]
    print(f"Processing the first {len(files_to_process)} files.")

    for file_path in tqdm(files_to_process):
        print("Parsing new file...")
        file_name_only = os.path.basename(file_path)
        try:
            # 1.
            parsed_data = parseJunctionFile(file_path)
            # 2.
            transformed_df = transformJunctionData(parsed_data)
            # 3.
            gr_file = findExitrons(transformed_df)
            #4.
            all_exitron_info.append(gr_file)

        # skip to the next file if an error occurs
        except Exception as e:
            print(f"An error occurred while processing file {file_name_only}: {e}")
            import traceback
            traceback.print_exc()
            continue 

    # concatenate all individual data into matrix 
    final_gr = pr.concat(all_exitron_info)
    print(f"\nSuccessfully compiled exitron data from {len(all_exitron_info)} files.")
    return final_gr

In [42]:
compileExitronData("/gpfs/commons/groups/knowles_lab/atokolyi/als/juncs_min6bp/",file_pattern="*.bam.junc")

Found 100 files to process.
Processing the first 5 files.


  0%|          | 0/5 [00:00<?, ?it/s]

Parsing new file...
Removed 405 rows with non-standard chromosomes.
Transformed 225089 junction records.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
 20%|██        | 1/5 [00:48<03:14, 48.74s/it]

Found 5332 junctions contained within CDS regions.
Parsing new file...
Removed 686 rows with non-standard chromosomes.
Transformed 277524 junction records.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
 40%|████      | 2/5 [01:48<02:46, 55.39s/it]

Found 8193 junctions contained within CDS regions.
Parsing new file...
Removed 523 rows with non-standard chromosomes.
Transformed 283082 junction records.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
 60%|██████    | 3/5 [02:42<01:49, 54.69s/it]

Found 7228 junctions contained within CDS regions.
Parsing new file...
Removed 1009 rows with non-standard chromosomes.
Transformed 301814 junction records.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
 80%|████████  | 4/5 [03:43<00:57, 57.11s/it]

Found 9622 junctions contained within CDS regions.
Parsing new file...
Removed 671 rows with non-standard chromosomes.
Transformed 270306 junction records.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
100%|██████████| 5/5 [04:35<00:00, 55.03s/it]

Found 10160 junctions contained within CDS regions.

Successfully concatenated exitron data from 5 files.





Unnamed: 0,Chromosome,Start,End,Strand,title,reads,sourceID
252,chr1,999787,999865,-,chr1:999787:999865:-,2,CGND-HRA-00117-2
343,chr1,1287596,1287672,+,chr1:1287596:1287672:+,3,CGND-HRA-00117-2
362,chr1,1295826,1295889,-,chr1:1295826:1295889:-,1,CGND-HRA-00117-2
362,chr1,1295826,1295889,-,chr1:1295826:1295889:-,1,CGND-HRA-00117-2
800,chr1,2029762,2029982,+,chr1:2029762:2029982:+,61,CGND-HRA-00117-2
...,...,...,...,...,...,...,...
269443,chrY,13335800,13335891,-,chrY:13335800:13335891:-,1,CGND-HRA-00015
269646,chrY,19709578,19709689,-,chrY:19709578:19709689:-,1,CGND-HRA-00015
269646,chrY,19709578,19709689,-,chrY:19709578:19709689:-,1,CGND-HRA-00015
269646,chrY,19709578,19709689,-,chrY:19709578:19709689:-,1,CGND-HRA-00015


### 5. Summarize Exitron Info 
- Lists all unique exitrons and their counts

- Identifies which exitrons are already annotated

### 6. Exitron Normalization
- Divides exitron score by the reads of surrounding exons to find proportion of time that the exitron gets expressed