In [16]:
import pysam
import numpy as np
import pandas as pd
import pyranges as pr
import tqdm

In [None]:
# exitron data before filter
data = pd.read_parquet('final_exitron_data.parquet')
data.head()


Unnamed: 0,Chromosome,Start,End,Strand,title,reads,sourceID
0,chr1,999094,999138,-,chr1:999094:999138:-,1,CGND-HRA-01280
1,chr1,999094,999138,-,chr1:999094:999138:-,1,CGND-HRA-01280
2,chr1,999094,999138,-,chr1:999094:999138:-,1,CGND-HRA-01280
3,chr1,999787,999865,-,chr1:999787:999865:-,12,CGND-HRA-01280
4,chr1,1211625,1211703,-,chr1:1211625:1211703:-,1,CGND-HRA-01280


In [10]:
def filterExitronData(exitron_data_filepath, min_count=30, min_peeps=10):
    long_df= pd.read_parquet(exitron_data_filepath)
    # drop duplicates created by pyranges .overlap method
    long_df.drop_duplicates(subset=['title', 'sourceID'], inplace=True)

    # create filtered wide df
    wide_df = long_df.pivot(index='title', columns='sourceID', values='reads').fillna(0)

    wide_is_gt_count = wide_df > min_count
    wide_is_gt_count_sums = wide_is_gt_count.sum(axis=1)
    keep_juncs = wide_is_gt_count_sums[wide_is_gt_count_sums >= min_peeps].index
    print(f"Found {len(keep_juncs)} junctions that passed the filter.")

    # construct final matrix
    filtered_exitron_data = wide_df.loc[keep_juncs].copy()
    filtered_exitron_data = filtered_exitron_data.where(filtered_exitron_data >= min_count, 0)

    return filtered_exitron_data

In [12]:
# exitron data after filter
filtered_exitron_data = filterExitronData('final_exitron_data.parquet')
filtered_exitron_data.head()

  wide_df = long_df.pivot(index='title', columns='sourceID', values='reads').fillna(0)


Found 2797 junctions that passed the filter.


sourceID,CGND-HRA-00013,CGND-HRA-00015,CGND-HRA-00017,CGND-HRA-00019,CGND-HRA-00020-2,CGND-HRA-00021,CGND-HRA-00023,CGND-HRA-00024,CGND-HRA-00025,CGND-HRA-00026,...,CGND-HRA-03135,CGND-HRA-03136,CGND-HRA-03137,CGND-HRA-03138,CGND-HRA-03139,CGND-HRA-03140,CGND-HRA-1927-1,CGND-HRA-1930-1,CGND-HRA-1941-1,CGND-HRA-1957-1
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr10:119042185:119042215:-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr10:119042185:119042245:-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,142.0,0.0,0.0,0.0,0.0
chr10:119042185:119042275:-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr10:119042190:119042373:-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr10:119042215:119042638:-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0


In [13]:
def getDenominator(source_id, junc_title):
    EXTEND=6
    split_title = junc_title.split(':')
    junc_chr = split_title[0]
    junc_start = int(split_title[1])
    junc_end = int(split_title[2])

    bam_filepath = f'/gpfs/commons/projects/ALS_Consortium_resource/bam_files/{source_id}.bam'
    bam = pysam.AlignmentFile(bam_filepath, 'rb')
    depth = bam.count_coverage(junc_chr, junc_start-EXTEND, junc_end+EXTEND, quality_threshold = 0)
    totals = [sum(values) for values in zip(*depth)]

    denom = np.median(totals[:EXTEND] + totals[-EXTEND:]) # Just get the bits just outside the junction/exitron

    return denom

In [20]:
def normalizeExitronData(filtered_exitron_data):
    # initialize denominator matrix
    normalized_df = pd.DataFrame(np.zeros(filtered_exitron_data.shape), index=filtered_exitron_data.index, columns=filtered_exitron_data.columns)

    for junc_title in tqdm.tqdm(filtered_exitron_data.index):
        for source_id in filtered_exitron_data.columns:
            numerator = filtered_exitron_data.loc[junc_title, source_id]
            if numerator > 0:
                denominator = getDenominator(source_id, junc_title)
                normalized_df.loc[junc_title, source_id] = numerator / denominator
    
    print(f"Normalized {len(normalized_df)} junctions.")
    return normalized_df

In [None]:
normalizeExitronData(filtered_exitron_data)

  1%|          | 17/2797 [05:16<7:39:32,  9.92s/it] 

In [None]:
def detectNovelExitrons():
     # whether exitron is annotated  take in exitron coordiante and whether its annotated as an intron in gtf file (aka if start and end matches )