In [1]:
import pyranges as pr
import pandas as pd

In [109]:
def get_transcript_ref(fname):
    """
    Get transcripts with necessary tags to order based on basic set etc.

    Parameters:
        fname (str): Path to GTF file

    Returns:
        df (pandas DataFrame): DataFrame of only transcript entries
            with tags parsed out into three added columns:
            MANE_Select (bool): tag field contains 'MANE_Select'
            basic_set (bool): tag field contains 'basic'
            appris_principal (str): text following 'appris_principal_'
                (up to the next comma) in the tag field; NaN when the
                transcript carries no such tag
    """
    df = pr.read_gtf(fname, duplicate_attr=True).df

    # copy the filtered rows so the column assignments below write to an
    # independent frame instead of a slice of the full table
    df = df.loc[df.Feature == 'transcript'].copy()

    # NOTE(review): presumably the 'tag' column is absent when no record in
    # the GTF carries a tag attribute; treat that as "every transcript is
    # untagged" instead of failing on the attribute lookup
    if 'tag' in df.columns:
        tags = df['tag']
    else:
        tags = pd.Series(index=df.index, dtype=object)

    # na=False: a transcript without any tag is simply not flagged, which
    # keeps both columns strictly boolean for the downstream max / sort
    df['MANE_Select'] = tags.str.contains('MANE_Select', na=False)
    df['basic_set'] = tags.str.contains('basic', na=False)

    # extract yields NaN for non-matching rows, so this also works when no
    # transcript at all has an appris_principal_ tag (splitting with
    # expand=True and indexing column 1 raises KeyError in that case)
    df['appris_principal'] = tags.str.extract(r'appris_principal_([^,]*)',
                                              expand=False)

    return df

In [110]:
# Inputs for the example run below.
# gtf: path to the annotation the ends are called from
gtf = 'tests/files/Canx.gtf' 
# mode: which transcript end to work on ('tss' or 'tes')
mode = 'tss'
# dist: distance by which each end is extended on either side
dist = 50
# slack: distance allowable for merging nearby regions
slack = 50

In [148]:
def get_ends_from_gtf(gtf, mode, dist, slack):
    """
    Create bed regions for each tss or tes in a gtf and number them
    based on tags indicating priority for each gene w/i the gtf

    Parameters:
        gtf (str): Path to gtf
        mode (str): {'tss', 'tes'} Which transcript end to call
        dist (int): Distance by which to extend regions on either side
        slack (int): Distance allowable for merging nearby regions

    Returns:
        bed (pyranges PyRanges): PyRanges object containing extended regions

    Raises:
        ValueError: If mode is not 'tss' or 'tes', or if the gtf has no
            gene_id field
    """
    # fail fast on an unsupported mode; otherwise no branch below would
    # assign the ends and the function would die with an UnboundLocalError
    if mode not in ('tss', 'tes'):
        raise ValueError(
            "mode must be 'tss' or 'tes', got {!r}".format(mode))

    gr_gtf = pr.read_gtf(gtf)
    if 'gene_id' not in gr_gtf.columns:
        raise ValueError('No gene_id field found in {}'.format(gtf))

    # get and extend ends
    if mode == 'tss':
        ends = gr_gtf.features.tss()
    else:
        ends = gr_gtf.features.tes()
    keep = ['Chromosome', 'Start', 'End', 'Strand', 'gene_id']
    # several transcripts of a gene can share an end; keep one row per
    # distinct extended region
    bed = ends.extend(dist).df[keep].drop_duplicates()

    # merge regions of the same gene and strand lying within slack
    bed = pr.PyRanges(bed)
    bed = bed.merge(strand=True,
                    by='gene_id',
                    slack=slack)

    bed = number_gtf_ends(bed, gtf, mode)

    return bed

In [155]:
def number_gtf_ends(bed, gtf, mode):
    """
    As a part of calling ends from a gtf, use the tags and gene id
    in the gtf to assign each region a name / number

    Parameters:
        bed (pyranges PyRanges): Merged end regions with a gene_id column,
            as built inside get_ends_from_gtf
        gtf (str): Path to gtf file
        mode (str): {'tss', 'tes'}

    Returns:
        bed (pyranges PyRanges): Bed file with Name field
            consisting of stable gene id_end region number

    Raises:
        ValueError: If mode is not 'tss' or 'tes'
    """
    # reject unsupported modes up front; otherwise the reference ends
    # would never be assigned and the join would hit an unbound name
    if mode not in ('tss', 'tes'):
        raise ValueError(
            "mode must be 'tss' or 'tes', got {!r}".format(mode))

    # read gtf with extra tag info
    gr_gtf = pr.PyRanges(get_transcript_ref(gtf))

    # add a temporary unique identifier for each end
    ends = bed.df
    ends[mode] = range(len(ends.index))
    ends = pr.PyRanges(ends)

    # get ends from reference gtf and join with called ends
    ref_cols = ['Chromosome', 'Start', 'End', 'Strand',
                'gene_id', 'transcript_id', 'MANE_Select',
                'basic_set', 'appris_principal']
    if mode == 'tss':
        ref_ends = gr_gtf.features.tss()
    else:
        ref_ends = gr_gtf.features.tes()
    ref_ends = pr.PyRanges(ref_ends.df[ref_cols])
    joined = ends.join(ref_ends,
                       strandedness='same',
                       how='left',
                       slack=0).df

    # keep only pairs where the called region and the reference end belong
    # to the same gene, then discard the reference-side coordinates;
    # drop() returns a new frame, so nothing is mutated on a filtered slice
    same_gene = joined.gene_id == joined.gene_id_b
    joined = joined.loc[same_gene].drop(
        ['Start_b', 'End_b', 'Strand_b', 'gene_id_b'], axis=1)

    # use the tags from the gtf to assign an actual end id
    numbered = number_tss_ic_tes(joined, mode=mode)

    # strip the version suffix (everything after the first '.') from gene_id
    numbered['gene_id'] = numbered.gene_id.str.split(pat='.', n=1, expand=True)[0]
    c = '{}_num'.format(mode)
    numbered['Name'] = numbered.gene_id+'_'+numbered[c].astype(str)

    # only the coordinates and the Name are kept in the returned regions
    helper_cols = [mode, 'transcript_id', 'MANE_Select',
                   'basic_set', 'appris_principal', c, 'gene_id']
    numbered = numbered.drop(helper_cols, axis=1)

    return pr.PyRanges(numbered)

In [156]:
def number_tss_ic_tes(df, mode):
    """
    Number every distinct tss, intron chain, or tes within its gene,
    ranking features by the tags of the transcripts that use them

    Parameters:
        df (pandas DataFrame): One row per transcript / feature pair with
            columns transcript_id, Chromosome, Strand, Start, End, <mode>,
            basic_set, MANE_Select, appris_principal and gene_id
        mode (str): {'tss', 'ic', 'tes'} Name of the feature id column

    Returns:
        df (pandas DataFrame): One row per unique feature per gene with
            the contributing transcript ids joined by ',', the collapsed
            tags, and a 1-based '<mode>_num' rank within the gene
    """
    num_col = '{}_num'.format(mode)
    feature_keys = ['Chromosome', 'Strand', 'Start', 'End', mode, 'gene_id']
    wanted = ['transcript_id', 'Chromosome', 'Strand',
              'Start', 'End',
              mode, 'basic_set', 'MANE_Select',
              'appris_principal', 'gene_id']

    # collapse to one row per feature, remembering every transcript that
    # uses it and keeping the best tag value seen among those transcripts
    how_to_collapse = {'transcript_id': ','.join,
                       'MANE_Select': 'max',
                       'basic_set': 'max',
                       'appris_principal': 'min'}
    per_feature = df[wanted].groupby(feature_keys, observed=True)
    df = per_feature.agg(how_to_collapse).reset_index()

    # rank within each gene: MANE_Select first, then lowest
    # appris_principal, then basic set membership; missing values go last
    priority = ['gene_id', 'MANE_Select', 'appris_principal', 'basic_set']
    ranked = df.sort_values(by=priority,
                            ascending=[True, False, True, False],
                            na_position='last')
    # cumcount keeps the original row labels, so the assignment aligns
    # each rank back onto its (unsorted) row
    df[num_col] = ranked.groupby(['gene_id']).cumcount() + 1

    return df

In [163]:
# use the configured mode rather than a hard-coded literal so that changing
# the settings cell actually changes which ends are called
bed = get_ends_from_gtf(gtf, mode, dist, slack)

In [164]:
# preview the first numbered end regions returned by get_ends_from_gtf
bed.head()

Unnamed: 0,Chromosome,Strand,Start,End,Name
0,chr11,-,50298268,50298369,ENSMUSG00000020368_3
1,chr11,-,50299327,50299428,ENSMUSG00000020368_4
2,chr11,-,50300939,50301040,ENSMUSG00000020368_2
3,chr11,-,50325622,50325723,ENSMUSG00000020368_1


In [158]:
# NOTE(review): scratch cell. The same gene_id trimming, Name construction and
# column drop are performed inside number_gtf_ends, so this looks superseded.
# `df` is not assigned at top level by any cell shown in this notebook, and the
# recorded run of this cell ended in the AttributeError displayed below it.
# FIXME: remove this cell or rebuild `df` explicitly before it.
df['gene_id'] = df.gene_id.str.split(pat='.', n=1, expand=True)[0]
df['Name'] = df.gene_id+'_'+df.tss_num.astype(str)
cols = [mode, 'transcript_id', 'MANE_Select',
        'basic_set', 'appris_principal', 'tss_num', 'gene_id']
df.drop(cols, axis=1, inplace=True)

AttributeError: ('PyRanges object has no attribute', 'gene_id')

In [130]:
# NOTE(review): displays `df`, which no cell shown here assigns at top level;
# the output relies on leftover kernel state — confirm where `df` comes from
df

Unnamed: 0,Chromosome,Strand,Start,End,Name
0,chr11,-,50298268,50298369,ENSMUSG00000020368_3
1,chr11,-,50299327,50299428,ENSMUSG00000020368_4
2,chr11,-,50300939,50301040,ENSMUSG00000020368_2
3,chr11,-,50325622,50325723,ENSMUSG00000020368_1


In [132]:
# wrap the table as a PyRanges object; rebinds `df`, so re-running this cell
# passes the already-wrapped object back in
# NOTE(review): `df` is not assigned by any earlier cell shown here — confirm
df = pr.PyRanges(df)

In [133]:
# write the regions to 'test_tss_id.bed' (path relative to the working directory)
df.to_bed('test_tss_id.bed')