In [24]:
import pandas as pd
import glob
import os
from tqdm import tqdm # progress tracker

1. Parse raw junction data from regtools output files

In [25]:
def parseJunctionFiles(directory_path, file_pattern="*.reverse.output.junc", file_count=10):

    all_data = []
    file_paths = glob.glob(os.path.join(directory_path, file_pattern))

    # slicing data set (set file_count to None for all the data)
    if file_count is not None and file_count > 0:
        file_paths_to_process = file_paths[:file_count]
        print(f"Found {len(file_paths)} files. Processing the first {len(file_paths_to_process)} files.")
    else:
        file_paths_to_process = file_paths
        print(f"Found {len(file_paths_to_process)} files to process.")

    # pre-modification columns 
    regtools_column_names = [
        'chrom', 'start_anchor', 'end_anchor', 'name', 'score', 'strand',
        'thick_start_orig', 'thick_end_orig', 'item_rgb_orig',
        'block_count_orig', 'block_sizes_orig', 'block_starts_orig'
    ]
    
    for file_path in tqdm(file_paths_to_process):
        try:
            sample_id = os.path.basename(file_path).split('.')[0]
            df = pd.read_csv(
                file_path, sep='\t', header=None, names=regtools_column_names,
                dtype={'chrom': str, 'block_sizes_orig': str, 'block_starts_orig': str}
            )
            if df.empty:
                print(f"File is empty or failed to load: {file_path}")
                continue
            df['sample_id_source'] = sample_id
            all_data.append(df)
        except pd.errors.EmptyDataError:
            print(f"File is empty and will be skipped: {file_path}")
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    if not all_data:
        print("No data was read from any file.")
        return None

    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nSuccessfully combined raw data from {len(all_data)} files into a df with {len(combined_df)} rows.")
    
    # type conversion for  numeric columns from regtools
    for col in ['start_anchor', 'end_anchor', 'score']:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
    
    # drop rows if info is missing
    combined_df.dropna(subset=['start_anchor', 'end_anchor', 'score', 'block_sizes_orig'], inplace=True)
    # ensure int types
    for col in ['start_anchor', 'end_anchor', 'score']:
         combined_df[col] = combined_df[col].astype(int)

    return combined_df

2. Transform raw regtools df into BED12 df where each entry is an intron. Intron coords recalculated, taking into account blockSize

In [26]:
def transformJunctionData(raw_df):
    if raw_df.empty:
        print("Raw DataFrame is empty.")
        return pd.DataFrame()
    
    # CHROMOSOME FILTERING
    original_row_count = len(raw_df)
    
    # allowed chromosomes
    allowed_chrom_numbers = [str(i) for i in range(1, 23)]
    allowed_sex_chroms_upper = ['X', 'Y'] 
    
    allowed_chromosomes = set()
    for num_chrom in allowed_chrom_numbers:
        allowed_chromosomes.add(num_chrom)
        allowed_chromosomes.add(f"chr{num_chrom}")
    for sex_chrom in allowed_sex_chroms_upper:
        allowed_chromosomes.add(sex_chrom)
        allowed_chromosomes.add(sex_chrom.lower())
        allowed_chromosomes.add(f"chr{sex_chrom}")
        allowed_chromosomes.add(f"chr{sex_chrom.lower()}")
    
    raw_df_filtered = raw_df[raw_df['chrom'].isin(allowed_chromosomes)].copy()
    
    filtered_row_count = len(raw_df_filtered)
    print(f"Removed {original_row_count - filtered_row_count} rows with non-standard chromosomes).")

    # JUNCTION COORD CORRECTION
    # initialize lists for each column
    chrom_list = []
    chromStart_list = []
    chromEnd_list = []
    name_list = []
    score_list = []
    strand_list = []
    thickStart_list = []
    thickEnd_list = []
    itemRgb_list = []
    blockCount_list = []
    blockSizes_list = []
    blockStarts_list = []
    sample_id_source_list = []
    
    skipped_rows = 0

    # iterate over the filtered df (set to 10 rows)
    for i, (index, row) in enumerate(tqdm(raw_df.iterrows(), total=100)):
        if i >= 100:
            break
        try:
            regtools_start = row['start_anchor']
            regtools_end = row['end_anchor']
            regtools_block_sizes_str = row['block_sizes_orig']

            parsed_block_sizes = [int(s) for s in regtools_block_sizes_str.strip(',').split(',')]
            if len(parsed_block_sizes) < 2:
                skipped_rows += 1
                continue
            
            overhang_left = parsed_block_sizes[0]
            overhang_right = parsed_block_sizes[1]

            junc_start = regtools_start + overhang_left
            junc_end = regtools_end - overhang_right

            if junc_start >= junc_end: 
                skipped_rows += 1
                continue

            junc_length = junc_end - junc_start

            # add values to respective lists
            chrom_list.append(row['chrom'])
            chromStart_list.append(junc_start)
            chromEnd_list.append(junc_end)
            name_list.append(row['name'])
            score_list.append(row['score'])
            strand_list.append(row['strand'])
            thickStart_list.append(junc_start)
            thickEnd_list.append(junc_end)
            itemRgb_list.append(row.get('item_rgb_orig', '0'))
            blockCount_list.append(1)
            blockSizes_list.append(str(junc_length))
            blockStarts_list.append("0")
            sample_id_source_list.append(row['sample_id_source'])
            
        except Exception as e:
            skipped_rows += 1
            continue
    
    if skipped_rows > 0:
        print(f"Skipped {skipped_rows} rows.")
    
    if not chrom_list: # Check if any data was successfully processed
        print("No records were successfully processed for BED12 transformation after chromosome filtering.")
        return pd.DataFrame()

    # create df from dictionary of lists
    transformed_df = pd.DataFrame({
        'chrom': chrom_list,
        'chromStart': chromStart_list,
        'chromEnd': chromEnd_list,
        'name': name_list,
        'score': score_list,
        'strand': strand_list,
        'thickStart': thickStart_list,
        'thickEnd': thickEnd_list,
        'itemRgb': itemRgb_list,
        'blockCount': blockCount_list,
        'blockSizes': blockSizes_list,
        'blockStarts': blockStarts_list,
        'sample_id_source': sample_id_source_list
    })
    
    print(f"Transformed {len(transformed_df)} records (from standard chromosomes) into intron-centric BED12 format.")
    return transformed_df

In [27]:
transformJunctionData(parseJunctionFiles("/gpfs/commons/groups/knowles_lab/atokolyi/als/juncs/",file_pattern="*.reverse.output.junc"))


Found 1876 files. Processing the first 10 files.


100%|██████████| 10/10 [00:11<00:00,  1.13s/it]



Successfully combined raw data from 10 files into a df with 3102409 rows.
Removed 5253 rows with non-standard chromosomes).


100%|██████████| 100/100 [00:02<00:00, 43.72it/s]


Transformed 100 records (from standard chromosomes) into intron-centric BED12 format.


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,sample_id_source
0,GL000008.2,196214,198475,JUNC00310217,1,-,196214,198475,25500,1,2261,0,CGND-HRA-02241
1,GL000009.2,37797,48751,JUNC00310218,2,+,37797,48751,25500,1,10954,0,CGND-HRA-02241
2,GL000009.2,86509,87324,JUNC00310219,2,-,86509,87324,25500,1,815,0,CGND-HRA-02241
3,GL000009.2,91507,94561,JUNC00310220,1,-,91507,94561,25500,1,3054,0,CGND-HRA-02241
4,GL000009.2,94133,94561,JUNC00310221,3,-,94133,94561,25500,1,428,0,CGND-HRA-02241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,GL000219.1,80028,82375,JUNC00310312,1,-,80028,82375,25500,1,2347,0,CGND-HRA-02241
96,GL000219.1,83317,88294,JUNC00310313,79,-,83317,88294,25500,1,4977,0,CGND-HRA-02241
97,GL000219.1,83317,97293,JUNC00310314,2,-,83317,97293,25500,1,13976,0,CGND-HRA-02241
98,GL000219.1,88420,97293,JUNC00310315,91,-,88420,97293,25500,1,8873,0,CGND-HRA-02241


filter out chromosomes that are not 1-22, X,Y

3. summarize junction info

general function for converting from datafram to BED file