In [22]:
import pandas as pd
import glob
import os
from tqdm import tqdm # progress tracker
import pyranges as pr # parsing gff
import numpy as np

### 1. Parse Regtools Data 
- Parse raw junction data from regtools output files

In [31]:
def parseJunctionFiles(directory_path, file_pattern="*.reverse.output.junc", file_count=1):

    all_data = []
    file_paths = glob.glob(os.path.join(directory_path, file_pattern))

    # slicing data set (set file_count to None to include all data)
    if file_count is not None and file_count > 0:
        file_paths_to_process = file_paths[:file_count]
        print(f"Found {len(file_paths)} files. Processing the first {len(file_paths_to_process)} files.")
    else:
        file_paths_to_process = file_paths
        print(f"Found {len(file_paths_to_process)} files to process.")

    # pre-modification columns 
    regtools_column_names = [
        'chrom', 'start_anchor', 'end_anchor', 'name', 'score', 'strand',
        'thick_start_orig', 'thick_end_orig', 'item_rgb_orig',
        'block_count_orig', 'block_sizes_orig', 'block_starts_orig'
    ]
    
    for file_path in tqdm(file_paths_to_process):
        try:
            sample_id = os.path.basename(file_path).split('.')[0]
            df = pd.read_csv(
                file_path, sep='\t', header=None, names=regtools_column_names,
                dtype={'chrom': str, 'block_sizes_orig': str, 'block_starts_orig': str}
            )
            if df.empty:
                print(f"File is empty or failed to load: {file_path}")
                continue
            df['sample_id_source'] = sample_id
            all_data.append(df)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    if not all_data:
        print("No data was read from any file.")
        return None

    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nSuccessfully combined raw data from {len(all_data)} files into a df with {len(combined_df)} rows.")
    
    # type conversion for  numeric columns from regtools
    for col in ['start_anchor', 'end_anchor', 'score']:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
    
    # drop rows if info is missing
    combined_df.dropna(subset=['start_anchor', 'end_anchor', 'score', 'block_sizes_orig'], inplace=True)
    # ensure int types
    for col in ['start_anchor', 'end_anchor', 'score']:
         combined_df[col] = combined_df[col].astype(int)

    return combined_df

### 2. Transform Junction Data
- Recalculates junction coordinates, following Regtools documentation to take into account blockSize
- Recalculates block size to represent length of junction
- Outputs junctino info in BED12 format

In [24]:
def transformJunctionData(raw_df):
    if raw_df.empty:
        print("Raw DataFrame is empty.")
        return pd.DataFrame()
    
    # CHROMOSOME FILTERING
    original_row_count = len(raw_df)
    
    # allowed chromosomes
    allowed_chrom_numbers = [str(i) for i in range(1, 23)]
    allowed_sex_chroms_upper = ['X', 'Y'] 
    
    allowed_chromosomes = set()
    for num_chrom in allowed_chrom_numbers:
        allowed_chromosomes.add(num_chrom)
        allowed_chromosomes.add(f"chr{num_chrom}")
    for sex_chrom in allowed_sex_chroms_upper:
        allowed_chromosomes.add(sex_chrom)
        allowed_chromosomes.add(sex_chrom.lower())
        allowed_chromosomes.add(f"chr{sex_chrom}")
        allowed_chromosomes.add(f"chr{sex_chrom.lower()}")
    
    raw_df_filtered = raw_df[raw_df['chrom'].isin(allowed_chromosomes)].copy()
    
    filtered_row_count = len(raw_df_filtered)
    print(f"Removed {original_row_count - filtered_row_count} rows with non-standard chromosomes.")

    # JUNCTION COORD CORRECTION
    # initialize lists for each column
    chrom_list = []
    chromStart_list = []
    chromEnd_list = []
    name_list = []
    score_list = []
    strand_list = []
    thickStart_list = []
    thickEnd_list = []
    itemRgb_list = []
    blockCount_list = []
    blockSizes_list = []
    blockStarts_list = []
    sample_id_source_list = []

    # iterate over the filtered df (set to 1000 rows)
    for i, (index, row) in enumerate(tqdm(raw_df_filtered.iterrows(), total=100000000000000000)):
        if i >= 100000000000000000:
            break

        regtools_start = row['start_anchor']
        regtools_end = row['end_anchor']
        regtools_block_sizes_str = row['block_sizes_orig']

        parsed_block_sizes = [int(s) for s in regtools_block_sizes_str.strip(',').split(',')]
        if len(parsed_block_sizes) < 2:
            skipped_rows += 1
            continue
        
        overhang_left = parsed_block_sizes[0]
        overhang_right = parsed_block_sizes[1]

        junc_start = regtools_start + overhang_left
        junc_end = regtools_end - overhang_right

        if junc_start >= junc_end: 
            skipped_rows += 1
            continue

        junc_length = junc_end - junc_start

        # add values to respective lists
        chrom_list.append(row['chrom'])
        chromStart_list.append(junc_start)
        chromEnd_list.append(junc_end)
        name_list.append(row['name'])
        score_list.append(row['score'])
        strand_list.append(row['strand'])
        thickStart_list.append(junc_start)
        thickEnd_list.append(junc_end)
        itemRgb_list.append(row.get('item_rgb_orig', '0'))
        blockCount_list.append(1)
        blockSizes_list.append(str(junc_length))
        blockStarts_list.append("0")
        sample_id_source_list.append(row['sample_id_source'])

    # create df from dictionary of lists
    transformed_df = pd.DataFrame({
        'chrom': chrom_list,
        'chromStart': chromStart_list,
        'chromEnd': chromEnd_list,
        'name': name_list,
        'score': score_list,
        'strand': strand_list,
        'thickStart': thickStart_list,
        'thickEnd': thickEnd_list,
        'itemRgb': itemRgb_list,
        'blockCount': blockCount_list,
        'blockSizes': blockSizes_list,
        'blockStarts': blockStarts_list,
        'sample_id_source': sample_id_source_list
    })
    
    print(f"Transformed {len(transformed_df)} junction records.")
    return transformed_df

In [85]:
transformed_df = transformJunctionData(parseJunctionFiles("/gpfs/commons/groups/knowles_lab/atokolyi/als/juncs_min6bp/",file_pattern="*.bam.junc"))

Found 100 files. Processing the first 1 files.


100%|██████████| 1/1 [00:03<00:00,  3.27s/it]



Successfully combined raw data from 1 files into a df with 225494 rows.
Removed 405 rows with non-standard chromosomes.


100%|██████████| 1000/1000 [00:00<00:00, 3115.96it/s]


Transformed 1000 junction records.


3. Convert transformed junction data and exon data into PyRanges object including Chromosome, Start, End, Strand, and Title (a unique junction id). Find junctions that overlap with CDS regions

In [84]:
def findExitrons(junction_df):

    # convert junction data to PyRanges object
    unique_id = transformed_df['chrom'].astype(str) + ':' + \
                transformed_df['chromStart'].astype(str) + ':' + \
                transformed_df['chromEnd'].astype(str) + ':' + \
                transformed_df['strand'].astype(str)

    junction_pr = pr.PyRanges({'Chromosome': transformed_df['chrom'],
                    'Start': transformed_df['chromStart'],
                    'End': transformed_df['chromEnd'],
                    'Strand': transformed_df['strand'],
                    'title': unique_id })
    
    # convert CDS data to PyRanges object
    gff = pr.read_gff3("gencode.v48.annotation.gff3.gz")
    cds = gff[gff.Feature == "CDS"]

    print(f"Found {len(cds)} CDS features.")

    # overlapping junctions
    contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)
    print(f"Found {len(contained_junctions)} junctions that are contained within CDS regions.")
            
    return contained_junctions


In [86]:
findExitrons(transformed_df)

Found 100 files. Processing the first 1 files.


100%|██████████| 1/1 [00:00<00:00,  1.19it/s]



Successfully combined raw data from 1 files into a df with 225494 rows.
Removed 405 rows with non-standard chromosomes.


100%|██████████| 1000/1000 [00:00<00:00, 3086.11it/s]


Transformed 1000 junction records.
Found 903356 CDS features.


  contained_junctions = junction_pr.overlap(cds, contained_intervals_only=True)


Found 9 junctions that are contained within CDS regions.


Unnamed: 0,Chromosome,Start,End,Strand,title
252,chr1,999787,999865,-,chr1:999787:999865:-
343,chr1,1287596,1287672,+,chr1:1287596:1287672:+
362,chr1,1295826,1295889,-,chr1:1295826:1295889:-
362,chr1,1295826,1295889,-,chr1:1295826:1295889:-
800,chr1,2029762,2029982,+,chr1:2029762:2029982:+
972,chr1,2496880,2496999,+,chr1:2496880:2496999:+
972,chr1,2496880,2496999,+,chr1:2496880:2496999:+
972,chr1,2496880,2496999,+,chr1:2496880:2496999:+
972,chr1,2496880,2496999,+,chr1:2496880:2496999:+


4. Iterate through each file. In each file, compare all its junctions to the ones in cds regions, find the contained junctions(exitrons), and add to final result matrix. Include score and person ID (what file it came from)

4. Summarize junction info as list of unique junctions and their counts 
