## RNA-seq Pipeline for Detecting Exitrons (Exonic Introns)

### 0. Imports

In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm # progress tracker
import pyranges as pr # parsing gff
import numpy as np
import pysam

### 1. Parse Regtools Data 
- Parse raw junction data from regtools output files

- Processes 1 file at a time

In [14]:
def parseJunctionFile(file_path):
    """Load one RegTools junction file into a cleaned DataFrame.

    The file is read as headerless, tab-separated text whose twelve columns
    are given the names listed in ``bed12_fields``. Rows whose start anchor,
    end anchor, or score cannot be parsed as numbers, or whose block sizes
    are missing, are discarded.

    Args:
        file_path: Path to the junction file. The part of the base name
            before the first '.' is used as the sample ID.

    Returns:
        A DataFrame with the twelve named columns plus 'sample_id_source'.
        'start_anchor', 'end_anchor' and 'score' are integers. The original
        row labels are kept (the index is not reset after filtering).
    """
    # column names for RegTools junction files
    bed12_fields = (
        'chrom', 'start_anchor', 'end_anchor', 'name', 'score', 'strand',
        'thick_start_orig', 'thick_end_orig', 'item_rgb_orig',
        'block_count_orig', 'block_sizes_orig', 'block_starts_orig',
    )
    numeric_fields = ['start_anchor', 'end_anchor', 'score']
    required_fields = numeric_fields + ['block_sizes_orig']

    # sample ID = everything before the first '.' in the file name
    sample_id, _, _ = os.path.basename(file_path).partition('.')

    junctions = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=list(bed12_fields),
        dtype={'chrom': str, 'block_sizes_orig': str, 'block_starts_orig': str},
    )
    junctions['sample_id_source'] = sample_id

    # unparseable numbers become NaN so the row can be dropped below
    coerced = {
        field: pd.to_numeric(junctions[field], errors='coerce')
        for field in numeric_fields
    }
    cleaned = junctions.assign(**coerced).dropna(subset=required_fields)

    # with NaNs gone the numeric columns can safely become integers
    return cleaned.astype({field: int for field in numeric_fields})

### 2. Transform Junction Data
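- Recalculates junction coordinates from the RegTools anchor coordinates by removing the left and right overhangs stored in blockSizes (junction start = anchor start + first block size; junction end = anchor end - second block size), following the RegTools documentation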

- Recalculates block size to represent length of junction

- Outputs junction info in BED12 format

In [15]:
def transformJunctionData(raw_df):
    """Convert parsed RegTools rows into single-block junction records.

    Steps, in order:
      1. Keep only rows on chromosomes 1-22, X or Y (with or without a
         'chr' prefix; the sex chromosomes are accepted in either case).
      2. Keep only rows whose 'block_sizes_orig' lists at least two sizes.
      3. Shrink the anchor span by the first block size on the left and the
         second block size on the right to get the junction coordinates.
      4. Drop rows where the resulting start is not strictly below the end.

    Two progress messages are printed along the way.

    Args:
        raw_df: DataFrame with at least the columns 'chrom', 'start_anchor',
            'end_anchor', 'name', 'score', 'strand', 'item_rgb_orig',
            'block_sizes_orig' and 'sample_id_source'.

    Returns:
        A DataFrame with BED12-named columns (blockCount fixed at 1,
        blockStarts fixed at "0", blockSizes holding the junction length as
        a string) plus 'sample_id_source'. Row labels of surviving rows are
        preserved.
    """
    # CHROMOSOME FILTERING
    autosomes = [str(number) for number in range(1, 23)]
    sex_chroms = ['X', 'Y', 'x', 'y']
    bare_names = autosomes + sex_chroms
    allowed_chromosomes = set(bare_names) | {f"chr{name}" for name in bare_names}

    on_standard_chrom = raw_df['chrom'].isin(allowed_chromosomes)
    kept = raw_df[on_standard_chrom].copy()
    removed_count = len(raw_df) - len(kept)
    print(f"Removed {removed_count} rows with non-standard chromosomes.")

    # JUNCTION COORD CORRECTION
    # a trailing comma would otherwise yield a spurious empty block
    block_lists = kept['block_sizes_orig'].str.strip(',').str.split(',')
    has_two_blocks = block_lists.str.len() >= 2
    kept = kept[has_two_blocks].copy()
    block_lists = block_lists[has_two_blocks]

    # first block = left overhang, second block = right overhang
    kept = kept.assign(
        overhang_left=block_lists.str[0].astype(int),
        overhang_right=block_lists.str[1].astype(int),
    )
    intron_start = kept['start_anchor'] + kept['overhang_left']
    intron_end = kept['end_anchor'] - kept['overhang_right']

    # discard junctions whose corrected span is empty or inverted
    is_valid = intron_start < intron_end
    kept = kept[is_valid].copy()
    intron_start = intron_start[is_valid]
    intron_end = intron_end[is_valid]
    intron_length = intron_end - intron_start

    # assemble the BED12-style output in column order
    transformed_df = pd.DataFrame({
        'chrom': kept['chrom'],
        'chromStart': intron_start,
        'chromEnd': intron_end,
        'name': kept['name'],
        'score': kept['score'],
        'strand': kept['strand'],
        'thickStart': intron_start,
        'thickEnd': intron_end,
        'itemRgb': kept['item_rgb_orig'],
        'blockCount': 1,
        'blockSizes': intron_length.astype(str),
        'blockStarts': "0",
        'sample_id_source': kept['sample_id_source'],
    })

    print(f"Transformed {len(transformed_df)} junction records.")

    return transformed_df

### 3. Find Exitrons Within Junction Data
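- Converts transformed junction data (transformed_df) and CDS features (from the GENCODE gff3 file) into PyRanges objects. The junction object has the columns Chromosome, Start, End, Strand, title (a unique junction id formed by chrom:start:end:strand), reads (the junction score), and sourceID (the sample the junction came from)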

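- Finds junctions that are contained within CDS regions on the same strand using the PyRanges method `.overlap` (called with `contained_intervals_only=True` and `strand_behavior='same'`)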

In [16]:
# Load the GENCODE v48 GFF3 annotation into a PyRanges object and keep only
# the rows whose Feature column is "CDS".
# NOTE: `cds` is a notebook-level global; findExitrons() reads it directly,
# so this cell must run before findExitrons() is called.
gff = pr.read_gff3("gencode.v48.annotation.gff3.gz")
cds = gff[gff.Feature == "CDS"]
# report how many CDS rows were retained
print(f"Found {len(cds)} CDS regions.")

Found 903356 CDS regions.


In [17]:
def findExitrons(transformed_df):
    """Return the junctions that fall inside annotated CDS regions.

    Junctions whose strand is neither '+' nor '-' are ignored. The remaining
    junctions are wrapped in a PyRanges object and compared against the
    module-level ``cds`` PyRanges with ``overlap(...,
    contained_intervals_only=True, strand_behavior='same')``. The number of
    hits is printed.

    Args:
        transformed_df: DataFrame with the columns 'chrom', 'chromStart',
            'chromEnd', 'strand', 'score' and 'sample_id_source'.

    Returns:
        The PyRanges result of the overlap call, with the columns
        Chromosome, Start, End, Strand, title, reads and sourceID.
    """
    # only junctions with a resolved strand can be matched strand-aware
    stranded = transformed_df[transformed_df['strand'].isin(['+', '-'])]

    # unique ID for each junction (chrom:start:end:strand)
    unique_id = stranded['chrom'].astype(str)
    for column in ('chromStart', 'chromEnd', 'strand'):
        unique_id = unique_id + ':' + stranded[column].astype(str)

    # convert junction data to PyRanges object
    junction_columns = {
        'Chromosome': stranded['chrom'],
        'Start': stranded['chromStart'],
        'End': stranded['chromEnd'],
        'Strand': stranded['strand'],
        'title': unique_id,
        'reads': stranded['score'],
        'sourceID': stranded['sample_id_source'],
    }
    junction_pr = pr.PyRanges(junction_columns)

    # same-strand junctions contained in a CDS interval
    contained_junctions = junction_pr.overlap(
        cds,
        contained_intervals_only=True,
        strand_behavior='same',
    )
    print(f"Found {len(contained_junctions)} junctions contained within CDS regions.")

    return contained_junctions

### 4. Compile All Exitron Info
- Iterates through each person's file, finding all exitron data then concatenating to a final matrix

- Includes person ID (file name) and junction scores (total reads)

In [24]:
def compileExitronData(directory_path, output_filepath, file_pattern="*.bam.junc"):
    """Run the exitron pipeline over every junction file and save the result.

    Each matching file is passed through parseJunctionFile,
    transformJunctionData and findExitrons. A file that raises is reported
    (with traceback) and skipped so one bad sample does not abort the run.
    The per-file results are concatenated and written to a Parquet file.

    Args:
        directory_path: Directory that holds the junction files.
        output_filepath: Destination path of the Parquet file.
        file_pattern: Glob pattern selecting the junction files.

    Returns:
        The concatenated PyRanges object covering all processed files.

    Raises:
        ValueError: If no file matches ``file_pattern`` or if every
            matching file failed, so there is nothing to concatenate.
    """
    import traceback

    # glob returns matches in arbitrary order; sort so the row order of the
    # compiled output is reproducible between runs
    file_paths = sorted(glob.glob(os.path.join(directory_path, file_pattern)))
    print(f"Found {len(file_paths)} files to process.")

    # fail early with a clear message instead of an opaque concat error
    if not file_paths:
        raise ValueError(
            f"No files matching '{file_pattern}' found in {directory_path}"
        )

    all_exitron_info = []
    failed_files = []

    for file_path in tqdm(file_paths):
        print("Parsing new file...")
        file_name_only = os.path.basename(file_path)
        try:
            # 1. parse -> 2. transform -> 3. find exitrons
            parsed_data = parseJunctionFile(file_path)
            transformed_df = transformJunctionData(parsed_data)
            gr_file = findExitrons(transformed_df)
        except Exception as e:
            # deliberate best-effort: log the failure and move to the next file
            print(f"An error occurred while processing file {file_name_only}: {e}")
            traceback.print_exc()
            failed_files.append(file_name_only)
        else:
            # 4. collect
            all_exitron_info.append(gr_file)

    if not all_exitron_info:
        raise ValueError(
            f"All {len(file_paths)} files failed to process; nothing to compile."
        )

    # concatenate all individual data into matrix
    final_gr = pr.concat(all_exitron_info)
    print(f"\nSuccessfully compiled exitron data from {len(all_exitron_info)} files.")
    if failed_files:
        print(f"Skipped {len(failed_files)} files due to errors: {failed_files}")
    final_gr.to_parquet(output_filepath, index=False)
    print(f"Successfully saved data to {output_filepath}")
    return final_gr

In [23]:
# Run the full pipeline on every *.bam.junc file in the junction directory
# and write the compiled exitron table to the Parquet file given as the
# second argument.
compileExitronData("/gpfs/commons/groups/knowles_lab/atokolyi/als/juncs_min6bp/", "/gpfs/commons/home/ncui/project/final_exitron_data.parquet", file_pattern="*.bam.junc")

Found 1876 files to process.


  0%|          | 0/1876 [00:00<?, ?it/s]

Parsing new file...
Removed 898 rows with non-standard chromosomes.
Transformed 245111 junction records.


  0%|          | 1/1876 [00:01<1:00:51,  1.95s/it]

Found 7873 junctions contained within CDS regions.
Parsing new file...
Removed 1401 rows with non-standard chromosomes.
Transformed 281682 junction records.


  0%|          | 2/1876 [00:04<1:04:30,  2.07s/it]

Found 10733 junctions contained within CDS regions.
Parsing new file...
Removed 2779 rows with non-standard chromosomes.
Transformed 311550 junction records.


  0%|          | 3/1876 [00:06<1:07:58,  2.18s/it]

Found 13517 junctions contained within CDS regions.
Parsing new file...
Removed 903 rows with non-standard chromosomes.
Transformed 274854 junction records.


  0%|          | 4/1876 [00:08<1:07:46,  2.17s/it]

Found 11393 junctions contained within CDS regions.
Parsing new file...
Removed 1158 rows with non-standard chromosomes.
Transformed 303449 junction records.


  0%|          | 5/1876 [00:11<1:11:04,  2.28s/it]

Found 8532 junctions contained within CDS regions.
Parsing new file...
Removed 797 rows with non-standard chromosomes.
Transformed 272826 junction records.


  0%|          | 6/1876 [00:13<1:10:10,  2.25s/it]

Found 11530 junctions contained within CDS regions.
Parsing new file...
Removed 1401 rows with non-standard chromosomes.
Transformed 429798 junction records.


  0%|          | 7/1876 [00:16<1:19:16,  2.54s/it]

Found 19533 junctions contained within CDS regions.
Parsing new file...
Removed 1689 rows with non-standard chromosomes.
Transformed 274252 junction records.


  0%|          | 8/1876 [00:18<1:15:48,  2.43s/it]

Found 11662 junctions contained within CDS regions.
Parsing new file...
Removed 1071 rows with non-standard chromosomes.
Transformed 286807 junction records.


  0%|          | 9/1876 [00:20<1:12:55,  2.34s/it]

Found 13559 junctions contained within CDS regions.
Parsing new file...
Removed 3154 rows with non-standard chromosomes.
Transformed 277697 junction records.


  1%|          | 10/1876 [00:22<1:10:45,  2.28s/it]

Found 11014 junctions contained within CDS regions.
Parsing new file...
Removed 518 rows with non-standard chromosomes.
Transformed 242223 junction records.


  1%|          | 11/1876 [00:24<1:07:12,  2.16s/it]

Found 7248 junctions contained within CDS regions.
Parsing new file...
Removed 716 rows with non-standard chromosomes.
Transformed 259781 junction records.


  1%|          | 12/1876 [00:26<1:05:34,  2.11s/it]

Found 7208 junctions contained within CDS regions.
Parsing new file...
Removed 681 rows with non-standard chromosomes.
Transformed 293495 junction records.


  1%|          | 13/1876 [00:28<1:06:13,  2.13s/it]

Found 8172 junctions contained within CDS regions.
Parsing new file...
Removed 1329 rows with non-standard chromosomes.
Transformed 287014 junction records.


  1%|          | 14/1876 [00:31<1:06:21,  2.14s/it]

Found 12506 junctions contained within CDS regions.
Parsing new file...
Removed 1941 rows with non-standard chromosomes.


### 5. Summarize Exitron Info 
- Lists all unique exitrons and their counts

- Identifies which exitrons are already annotated

### 6. Exitron Normalization
- Divides exitron score by the reads of surrounding exons to find proportion of time that the exitron gets expressed