In [1]:
import pyranges as pr
import pandas as pd

In [109]:
def get_transcript_ref(fname):
    """
    Get transcripts with necessary tags to order based on basic set etc.

    Parameters:
        fname (str): Path to GTF file

    Returns:
        df (pandas DataFrame): DataFrame of only transcript entries
            with tags parsed out
    """
    df = pr.read_gtf(fname, duplicate_attr=True).df
    df = df.loc[df.Feature == 'transcript']
    df['MANE_Select'] = df.tag.str.contains('MANE_Select')
    df['basic_set'] = df.tag.str.contains('basic')
    df['temp'] = df.tag.str.split('appris_principal_', n=1, expand=True)[1]
    df['appris_principal'] = df.temp.str.split(',', n=1, expand=True)[0]

    return df

In [110]:
gtf = 'tests/files/Canx.gtf' 
mode = 'tss'
dist = 50
slack = 50

In [148]:
def get_ends_from_gtf(gtf, mode, dist, slack):
    """
    Create bed regions for each end in a gtf and number them 
    based on tags indicating priority for each gene w/i the gtf

    Parameters:
        gtf (str): Path to gtf
        dist (int): Distance by which to extend regions on either side
        slack (int): Distance allowable for merging nearby regions

    Returns:
        bed (pyranges PyRanges): PyRanges object containing extended regions
    """
    gr_gtf = pr.read_gtf(gtf)
    if 'gene_id' not in gr_gtf.columns:
        raise ValueError('No gene_id field found in {}'.format(gtf))
        
    # get and extend ends
    if mode == 'tss':
        bed = gr_gtf.features.tss().extend(dist).df[[
            'Chromosome', 'Start', 'End', 'Strand', 'gene_id',
        ]].drop_duplicates()
    elif mode == 'tes':
        bed = gr_gtf.features.tes().extend(dist).df[[
            'Chromosome', 'Start', 'End', 'Strand', 'gene_id',
        ]].drop_duplicates()

    bed = pr.PyRanges(bed.rename({'ThickStart': 'gene_id'}, axis=1))
    bed = bed.merge(strand=True,
                by='gene_id',
                slack=slack)
    
    bed = number_gtf_ends(bed, gtf, mode)

    return bed

In [155]:
def number_gtf_ends(bed, gtf, mode):
    """
    As a part of calling ends from a gtf, use the tags and gene id
    in the gtf to assign each region a name / number
    
    Parameters:
        bed (pyranges PyRanges): Output of get_ends_from_gtf
        gtf (str): Path to gtf file
        mode (str): {'tss', 'tes'}
    
    Returns: 
        bed (pyranges PyRanges): Bed file with Name field 
            consisting of stable gene id_end region number
    """
    
    # read gtf with extra tag info
    gr_gtf = get_transcript_ref(gtf)
    gr_gtf = pr.PyRanges(gr_gtf)
    
    # add a temporary unique identifier for each end
    bed = bed.df
    bed[mode] = [i for i in range(len(bed.index))]
    bed = pr.PyRanges(bed)

    # get ends from reference gtf and join with called ends
    cols = ['Chromosome', 'Start', 'End', 'Strand',
             'gene_id', 'transcript_id', 'MANE_Select',
             'basic_set', 'appris_principal']
    if mode == 'tss':
        temp = pr.PyRanges(gr_gtf.features.tss().df[cols])
    elif mode == 'tes':
        temp = pr.PyRanges(gr_gtf.features.tes().df[cols])
    bed = bed.join(temp,
               strandedness='same',
               how='left',
               slack=0)
    bed = bed.df
    bed = bed.loc[bed.gene_id == bed.gene_id_b]
    cols = ['Start_b', 'End_b', 'Strand_b', 'gene_id_b']
    bed.drop(cols, axis=1, inplace=True)
    
    # use the tags from the gtf to assign an actual end id
    bed = number_tss_ic_tes(bed, mode=mode)
    bed['gene_id'] = bed.gene_id.str.split(pat='.', n=1, expand=True)[0]
    c = '{}_num'.format(mode)
    bed['Name'] = bed.gene_id+'_'+bed[c].astype(str)
    cols = [mode, 'transcript_id', 'MANE_Select',
            'basic_set', 'appris_principal', c, 'gene_id']
    bed.drop(cols, axis=1, inplace=True)
    bed = pr.PyRanges(bed)
    
    return bed

In [156]:
def number_tss_ic_tes(df, mode):
    """
    Assign a number to each tss, intron chain, or tes per gene based on the
    status of the transcript

    Parameters:
        df (pandas DataFrame): DataFrame derived from PyRanges
            with gene id, tss/ic/tes id, and tags to order transcripts
            with
        mode (str): {'tss', 'ic', 'tes'}

    Returns:
        df (pandas DataFrame): DataFrame with an identifier for each unique
            tss/ic/tes per gene
    """
    # groupby feature but record which feature
    # each transcript id uses
    cols = ['transcript_id', 'Chromosome', 'Strand',
            'Start', 'End',
            mode, 'basic_set', 'MANE_Select',
            'appris_principal', 'gene_id']
    df = df[cols].groupby(['Chromosome', 'Strand',
                           'Start', 'End',
                           mode, 'gene_id'],
                           observed=True).agg({'transcript_id': ','.join,
                                     'MANE_Select': 'max',
                                     'basic_set': 'max',
                                     'appris_principal': 'min'}).reset_index()

    # compute feature number based on tags 
    df['{}_num'.format(mode)] = df.sort_values(by=['gene_id', 'MANE_Select',
                                                   'appris_principal', 'basic_set'],
                                 ascending=[True, False, True, False],
                                 na_position='last')\
                                 .groupby(['gene_id'])\
                                 .cumcount() + 1

    return df

In [163]:
bed = get_ends_from_gtf(gtf, 'tss', dist, slack)

## Getting ICs from GTF

In [167]:
def get_ic(gtf_pr):
    """
    Get a hyphen-separated representation of each transcript's intron chain
    from a PyRanges GTF

    Parameters:
        gtf_pr (pyranges PyRanges): GTF PyRanges object

    Returns:
        df (pandas DataFrame): DataFrame detailing intron chain, gene, strand,
            chromosome, and transcript that intron chain was seen in
    """
    df = gtf_pr.df.copy(deep=True)

    # restrict to exon entries
    df = df.loc[df.Feature == 'exon']
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'transcript_id', 'gene_id']
    df = df[cols]

    # melt to isolate individual coordinates
    df = pd.melt(df, id_vars=['Chromosome', 'Strand', 'transcript_id', 'gene_id'],
                value_vars=['Start', 'End'],
                value_name='Coord')
    df.drop('variable', axis=1, inplace=True)

    # sort to order coordinates correctly
    df.Coord = df.Coord.astype(int)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)
    rev = df.loc[df.Strand == '-'].copy(deep=True)

    fwd.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, True], inplace=True)
    rev.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, False], inplace=True)
    df = pd.concat([fwd, rev])

    # create intron chain strings
    df.Coord = df.Coord.astype(str)
    df = df.groupby(['Chromosome', 'Strand',
                     'transcript_id', 'gene_id'], observed=True)['Coord'].apply('-'.join).reset_index()

    # remove tss and tes from intron chain
    df['temp'] = df.Coord.str.split('-', n=1, expand=True)[1]
    df['ic'] = df.temp.str.rsplit('-', n=1, expand=True)[0]

    return df

In [171]:
# def get_ics_from_gtf(gtf):
#     """
#     Get a file for each intron chain in a gtf and number them 
#     based on tags indicating priority for each gene w/i the gtf 
    
#     Parameters:
#         gtf (str): Filename for input gtf
#     """
    
# get basic status and appris_principal tag for each transcript
t_df = get_transcript_ref(gtf)
t_df = t_df[['transcript_id', 'MANE_Select', 'basic_set', 'appris_principal']]

# get unique intron chains from gtf
df = pr.read_gtf(gtf)
df = get_ic(df)

# add basic annotation, appris principal number, and gene id
df = df.merge(t_df, on='transcript_id', how='left')

# add number for each unique intron chain
df = number_tss_ic_tes(df, mode='ic')

# # make coords into tuple and perform additional
# # formatting for this table
# df['ic'] = df.ic.str.split('-')
# df['ic'] = [tuple(c) for c in df.ic.tolist()]
# ic = df.copy(deep=True)
# ic.rename({'ic': 'coordinates',
#            'Chromosome': 'chrom',
#            'Strand': 'strand'},
#            axis=1, inplace=True)
# ic['ic_id'] = ic['gene_id']+'_'+ic.ic_num.astype(str)
# ic.drop(['MANE_Select', 'basic_set', 'appris_principal',
#          'gene_id', 'ic_num'],
#         axis=1, inplace=True)
?

KeyError: "['Start', 'End'] not in index"

In [170]:
df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,Coord,temp,ic,MANE_Select,basic_set,appris_principal
0,chr11,-,ENSMUST00000020637.8,ENSMUSG00000020368.14,50325673-50325492-50311784-50311603-50310867-5...,50325492-50311784-50311603-50310867-50310793-5...,50325492-50311784-50311603-50310867-50310793-5...,False,True,1.0
1,chr11,-,ENSMUST00000146979.1,ENSMUSG00000020368.14,50299378-50299128-50298236-50298116-50297394-5...,50299128-50298236-50298116-50297394-50297273-5...,50299128-50298236-50298116-50297394-50297273-5...,,,
2,chr11,-,ENSMUST00000153068.1,ENSMUSG00000020368.14,50298319-50298116-50297394-50297363,50298116-50297394-50297363,50298116-50297394,,,
3,chr11,-,ENSMUST00000155801.1,ENSMUSG00000020368.14,50300990-50300897-50299344-50298837,50300897-50299344-50298837,50300897-50299344,False,False,2.0
4,chr11,-,ENSMUST00000155801.2,ENSMUSG00000020368.14,50300990-50300897-50299344-50298837,50300897-50299344-50298837,50300897-50299344,False,True,1.0
