In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm # progress tracker
import pyranges as pr # parsing gff
import numpy as np

1. Parse raw junction data from regtools output files.

In [2]:
def parseJunctionFiles(directory_path, file_pattern="*.reverse.output.junc", file_count=10):
    """Read regtools junction files from a directory into one combined DataFrame.

    Each matching file is read as a headerless, tab-separated table with the
    twelve regtools/BED12 columns listed in ``regtools_column_names``.  The
    sample ID (the file's base name up to its first ``.``) is stored in a
    ``sample_id_source`` column so rows stay traceable after concatenation.

    Parameters
    ----------
    directory_path : str
        Directory that contains the junction files.
    file_pattern : str, default "*.reverse.output.junc"
        Glob pattern, relative to ``directory_path``, selecting the files.
    file_count : int or None, default 10
        Number of files to read.  ``None`` (or any value <= 0) reads every
        matching file.

    Returns
    -------
    pandas.DataFrame or None
        All rows from the files that could be read, with ``start_anchor``,
        ``end_anchor`` and ``score`` converted to integers.  Rows in which any
        of those three values is non-numeric/missing, or in which
        ``block_sizes_orig`` is missing, are dropped.  ``None`` is returned
        when no file produced any data.

    Notes
    -----
    Files that are empty or raise while being read are reported with a printed
    message and skipped; they do not abort the whole run.
    """
    # glob returns paths in arbitrary order; sort them so that "the first
    # `file_count` files" is the same subset on every run and every machine.
    file_paths = sorted(glob.glob(os.path.join(directory_path, file_pattern)))

    # slicing data set (set file_count to None to include all data)
    if file_count is not None and file_count > 0:
        file_paths_to_process = file_paths[:file_count]
        print(f"Found {len(file_paths)} files. Processing the first {len(file_paths_to_process)} files.")
    else:
        file_paths_to_process = file_paths
        print(f"Found {len(file_paths_to_process)} files to process.")

    # pre-modification columns
    regtools_column_names = [
        'chrom', 'start_anchor', 'end_anchor', 'name', 'score', 'strand',
        'thick_start_orig', 'thick_end_orig', 'item_rgb_orig',
        'block_count_orig', 'block_sizes_orig', 'block_starts_orig'
    ]

    all_data = []
    for file_path in tqdm(file_paths_to_process):
        try:
            sample_id = os.path.basename(file_path).split('.')[0]
            df = pd.read_csv(
                file_path, sep='\t', header=None, names=regtools_column_names,
                # keep the comma-separated block fields (and chrom) as text
                dtype={'chrom': str, 'block_sizes_orig': str, 'block_starts_orig': str}
            )
            if df.empty:
                print(f"File is empty or failed to load: {file_path}")
                continue
            df['sample_id_source'] = sample_id
            all_data.append(df)
        except Exception as e:
            # best effort: one unreadable file should not abort the whole batch
            print(f"Error processing file {file_path}: {e}")

    if not all_data:
        print("No data was read from any file.")
        return None

    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nSuccessfully combined raw data from {len(all_data)} files into a df with {len(combined_df)} rows.")

    # Coerce the numeric regtools columns (unparseable values become NaN),
    # drop rows with missing information, then cast to int.  Done as a chain
    # that builds new frames instead of mutating `combined_df` in place.
    numeric_columns = ['start_anchor', 'end_anchor', 'score']
    combined_df = (
        combined_df
        .assign(**{col: pd.to_numeric(combined_df[col], errors='coerce') for col in numeric_columns})
        .dropna(subset=numeric_columns + ['block_sizes_orig'])
        .astype({col: int for col in numeric_columns})
    )

    return combined_df

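2. Transform the raw regtools DataFrame into a BED12 DataFrame in which each entry is a single intron/junction. Junction coordinates are recalculated by trimming the anchor overhangs (the first two blockSizes values) from the reported start and end, and the block size is recalculated to equal the length of the junction.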

In [3]:
def transformJunctionData(raw_df, max_rows=1000):
    """Convert raw regtools junction rows into single-block BED12 junction records.

    In the raw frame, ``start_anchor``/``end_anchor`` include the anchor
    overhangs on both sides of the junction; the overhang lengths are the first
    two entries of the comma-separated ``block_sizes_orig`` string.  This
    function trims them so that every output row spans only the junction:

        chromStart = start_anchor + left overhang
        chromEnd   = end_anchor   - right overhang

    Rows on chromosomes other than 1-22, X or Y (with or without a ``chr``
    prefix; lower-case x/y accepted) are removed first.  A row is skipped, and
    counted in the printed "Skipped" total, when its block sizes cannot be
    parsed as integers, when fewer than two block sizes are present, or when
    the trimmed interval is empty or inverted.

    Parameters
    ----------
    raw_df : pandas.DataFrame or None
        Frame with the columns ``chrom``, ``start_anchor``, ``end_anchor``,
        ``name``, ``score``, ``strand``, ``block_sizes_orig`` and
        ``sample_id_source``; ``item_rgb_orig`` is optional ('0' is used when
        it is absent).  ``None`` is treated like an empty frame.
    max_rows : int or None, default 1000
        Maximum number of chromosome-filtered rows to transform (non-negative).
        The default keeps the quick-test limit of 1000 rows; pass ``None`` to
        transform every row.

    Returns
    -------
    pandas.DataFrame
        One row per retained junction with the BED12 columns plus
        ``sample_id_source``.  ``thickStart``/``thickEnd`` repeat the junction
        coordinates, ``blockCount`` is 1, ``blockSizes`` is the junction length
        as a string and ``blockStarts`` is "0".  An empty, column-less frame is
        returned when ``raw_df`` is ``None`` or empty.

    Raises
    ------
    KeyError
        If a required column is missing from ``raw_df``.
    """
    output_columns = [
        'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
        'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes',
        'blockStarts', 'sample_id_source'
    ]

    # parseJunctionFiles returns None when nothing could be read
    if raw_df is None or raw_df.empty:
        print("Raw DataFrame is empty.")
        return pd.DataFrame()

    # CHROMOSOME FILTERING
    chrom_labels = [str(i) for i in range(1, 23)] + ['X', 'Y', 'x', 'y']
    allowed_chromosomes = set(chrom_labels) | {f"chr{label}" for label in chrom_labels}

    raw_df_filtered = raw_df[raw_df['chrom'].isin(allowed_chromosomes)]
    print(f"Removed {len(raw_df) - len(raw_df_filtered)} rows with non-standard chromosomes.")

    # optional row cap for quick test runs
    if max_rows is not None:
        raw_df_filtered = raw_df_filtered.head(max_rows)

    # JUNCTION COORD CORRECTION
    if 'item_rgb_orig' in raw_df_filtered.columns:
        item_rgb_values = raw_df_filtered['item_rgb_orig']
    else:
        item_rgb_values = ['0'] * len(raw_df_filtered)

    # Iterate over the needed columns in lockstep; this avoids building a
    # Series per row, which is what makes DataFrame.iterrows() slow.
    row_values = zip(
        raw_df_filtered['chrom'],
        raw_df_filtered['start_anchor'],
        raw_df_filtered['end_anchor'],
        raw_df_filtered['name'],
        raw_df_filtered['score'],
        raw_df_filtered['strand'],
        raw_df_filtered['block_sizes_orig'],
        item_rgb_values,
        raw_df_filtered['sample_id_source'],
    )

    records = []
    skipped_rows = 0

    for chrom, start, end, name, score, strand, block_sizes, item_rgb, sample_id in tqdm(
            row_values, total=len(raw_df_filtered)):
        try:
            # regtools may leave a trailing comma on the block size list
            parsed_block_sizes = [int(s) for s in block_sizes.strip(',').split(',')]
            if len(parsed_block_sizes) < 2:
                skipped_rows += 1
                continue
            junc_start = start + parsed_block_sizes[0]   # trim left overhang
            junc_end = end - parsed_block_sizes[1]       # trim right overhang
        except (AttributeError, TypeError, ValueError):
            # missing/non-string block sizes, non-integer entries, or
            # non-numeric anchor coordinates
            skipped_rows += 1
            continue

        if junc_start >= junc_end:
            skipped_rows += 1
            continue

        records.append({
            'chrom': chrom,
            'chromStart': junc_start,
            'chromEnd': junc_end,
            'name': name,
            'score': score,
            'strand': strand,
            'thickStart': junc_start,
            'thickEnd': junc_end,
            'itemRgb': item_rgb,
            'blockCount': 1,
            'blockSizes': str(junc_end - junc_start),
            'blockStarts': "0",
            'sample_id_source': sample_id,
        })

    if skipped_rows > 0:
        print(f"Skipped {skipped_rows} rows.")

    # explicit columns keep the schema stable even when no row survived
    transformed_df = pd.DataFrame(records, columns=output_columns)

    print(f"Transformed {len(transformed_df)} junction records.")
    return transformed_df

In [4]:
# TESTING: parse a small batch of junction files, then transform the parsed data.
# The intermediate frames are bound to names so that later steps can reuse them
# without re-reading the files.
JUNCTION_DIR = "/gpfs/commons/groups/knowles_lab/atokolyi/als/juncs/"
junctions_raw = parseJunctionFiles(JUNCTION_DIR, file_pattern="*.reverse.output.junc")

# parseJunctionFiles returns None when no file could be read; fall back to an
# empty frame instead of passing None on to the transform step.
junctions_bed12 = transformJunctionData(junctions_raw) if junctions_raw is not None else pd.DataFrame()
junctions_bed12

Found 1876 files. Processing the first 10 files.


100%|██████████| 10/10 [00:12<00:00,  1.22s/it]



Successfully combined raw data from 10 files into a df with 3102409 rows.
Removed 5253 rows with non-standard chromosomes.


100%|██████████| 1000/1000 [00:02<00:00, 429.92it/s]


Transformed 1000 records.


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,sample_id_source
0,chr1,14829,14969,JUNC00000001,101,-,14829,14969,25500,1,140,0,CGND-HRA-02241
1,chr1,14829,14962,JUNC00000003,1,-,14829,14962,25500,1,133,0,CGND-HRA-02241
2,chr1,14829,185483,JUNC00000002,1,-,14829,185483,25500,1,170654,0,CGND-HRA-02241
3,chr1,15038,15795,JUNC00000004,30,-,15038,15795,25500,1,757,0,CGND-HRA-02241
4,chr1,15038,15310,JUNC00000005,2,-,15038,15310,25500,1,272,0,CGND-HRA-02241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,chr1,1670151,1738320,JUNC00001016,4,-,1670151,1738320,25500,1,68169,0,CGND-HRA-02241
996,chr1,1670151,1671508,JUNC00001014,36,-,1670151,1671508,25500,1,1357,0,CGND-HRA-02241
997,chr1,1671629,1675462,JUNC00001017,58,-,1671629,1675462,25500,1,3833,0,CGND-HRA-02241
998,chr1,1675590,1676063,JUNC00001018,63,-,1675590,1676063,25500,1,473,0,CGND-HRA-02241


3. Filter for junctions that are wholly contained within coding exon (CDS) sequences. CDS coordinates are obtained from the GFF3 annotation file, which is parsed using pyranges.

In [3]:
# Load the GENCODE annotation (path is relative to the notebook's working directory)
# and keep only the CDS features.
# NOTE(review): findExitrons below reads the GFF3 file again from its own
# `gff_file_path` argument, so `gff`/`cds` are not used by it.
gff = pr.read_gff3("gencode.v48.annotation.gff3.gz")
cds = gff[gff['Feature']=="CDS"] #only CDS (protein-coding exons)

# Small hand-written set of example intervals showing the column layout that
# PyRanges expects (Chromosome / Start / End / Strand plus any extra columns).
gr = pr.PyRanges({'Chromosome': ['chr1', 'chr1', 'chr1', 'chr3'],
                  'Start': [5, 20, 80, 10],
                  'End': [10, 28, 95, 38],
                  'Strand': ['+', '+', '-', '+'],
                  'title': ['a', 'b', 'c', 'd']}) # Can use junction IDs here i.e. chr:start:end:strand
# Use the pyranges overlap function to find which of these junctions are wholly contained within CDS

def findExitrons(junction_df, gff_file_path):
    """Keep only the junctions that lie entirely inside a CDS feature.

    The GFF3 file at ``gff_file_path`` is read with pyranges and restricted to
    rows whose ``Feature`` is "CDS".  ``junction_df`` is renamed to the
    PyRanges column names (Chromosome/Start/End/Strand), wrapped in a PyRanges
    object, filtered against the CDS intervals, converted back to a pandas
    DataFrame and renamed to the original BED-style column names.

    Parameters
    ----------
    junction_df : pandas.DataFrame
        Junction table with at least the columns ``chrom``, ``chromStart``,
        ``chromEnd`` and ``strand``; all other columns are carried along.
    gff_file_path : str
        Path to the GFF3 annotation file.

    Returns
    -------
    pandas.DataFrame
        The junctions contained within CDS features.  An empty, column-less
        DataFrame is returned when no junction qualifies.  ``junction_df`` is
        returned UNFILTERED when the GFF3 file cannot be read/processed or
        contains no CDS features (only a message is printed), so callers
        cannot tell from the return value alone whether filtering happened.

    NOTE(review): confirm that the installed pyranges version accepts
    ``duplicate_attr`` in ``read_gff3`` and provides ``PyRanges.issubset`` and
    ``PyRanges.df`` -- none of these could be verified from this notebook, and
    no output from this function has been recorded.  If ``read_gff3`` rejects
    the keyword, the ``except`` below would swallow the error and return the
    unfiltered input.  If ``issubset`` is unavailable, an overlap-based
    containment query is the likely replacement -- TODO confirm.
    """

    print(f"\nLoading GFF3 file: {gff_file_path} for CDS filtering...")
    try:
        gff_pr = pr.read_gff3(gff_file_path, duplicate_attr=True) # duplicate_attr=True might be needed for some GFFs
        cds_pr = gff_pr[gff_pr.Feature == "CDS"]
    except Exception as e:
        # best effort: report the problem and hand back the input untouched
        print(f"Error reading or processing GFF3 file: {e}")
        return junction_df # Return original on error

    if cds_pr.empty:
        print("No CDS features found in GFF3 file. Returning unfiltered junctions.")
        return junction_df

    print(f"Found {len(cds_pr)} CDS features.")

    # Convert junction DataFrame to PyRanges object
    # PyRanges uses 'Chromosome', 'Start', 'End', 'Strand' by default
    # Our junction_df uses 'chrom', 'chromStart', 'chromEnd', 'strand'
    # Keep all other columns from junction_df
    
    # Create a mapping for column names to PyRanges standard names
    column_map = {
        'chrom': 'Chromosome',
        'chromStart': 'Start',
        'chromEnd': 'End',
        'strand': 'Strand'
    }
    junctions_pr_df_renamed = junction_df.rename(columns=column_map)
    junctions_pr = pr.PyRanges(junctions_pr_df_renamed)
    
    print(f"Converted {len(junctions_pr)} junctions to PyRanges object for CDS filtering.")

    # Filter junctions that are subsets of (contained within) CDS features
    # Intended behaviour: A.issubset(B) returns the intervals from A that are subsets of intervals in B
    # NOTE(review): this call is outside the try block above, so a failure here propagates to the caller.
    contained_junctions_pr = junctions_pr.issubset(cds_pr)
    
    original_junction_count = len(junction_df)
    filtered_junction_count = len(contained_junctions_pr)
    print(f"CDS containment filtering: Kept {filtered_junction_count} junctions out of {original_junction_count} (removed {original_junction_count - filtered_junction_count} junctions not contained within CDS).")

    if contained_junctions_pr.empty:
        print("No junctions were found to be contained within CDS regions.")
        return pd.DataFrame()

    # Convert back to pandas DataFrame, reverting column names for consistency
    # The PyRanges df attribute is expected to carry the standard PyRanges column names
    filtered_junction_df_std_names = contained_junctions_pr.df
    
    # Revert column names to original for consistency with other functions
    reverse_column_map = {v: k for k, v in column_map.items()}
    filtered_junction_df_original_names = filtered_junction_df_std_names.rename(columns=reverse_column_map)
    
    return filtered_junction_df_original_names


Unnamed: 0,Chromosome,Start,End,Strand,title
0,chr1,5,10,+,a
1,chr1,20,28,+,b
2,chr1,80,95,-,c
3,chr3,10,38,+,d


4. Summarize junction info as list of unique junctions and their counts 
