In [10]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *

## modify / collapse transcripts in TALON GTF

In [150]:
# def map_gtf_transcripts(gtf, m_df):
#     """
#     Add cerberus transcript id and ids for the ends used to each entry in gtf
    
#     Parameters:
#         gtf (pandas DataFrame): DF of GTF
#         m_df (pandas DataFrame): Map file from cerberus reference matching
#             each transcript id in `gtf` to a cerberus transcript id
    
#     Returns:
#         gtf (pandas DataFrame): DF of GTF with cerberus ids added
#     """
#     gtf = df.copy(deep=True)
#     gtf = gtf.merge(m_df, how='left', 
#                     left_on=['transcript_name', 'transcript_id'],
#                     right_on=['original_transcript_name', 'original_transcript_id'],
#                     suffixes=('', '_cerberus'))

#     # gtf.drop(['transcript_id', 'transcript_name'], axis=1, inplace=True)
#     # gtf.rename({'transcript_id_cerberus': 'transcript_id',
#     #             'transcript_name_cerberus': 'transcript_name'},
#     #            axis=1, inplace=True)
    
#     return gtf

def get_stranded_gtf_dfs(df):
    """
    Separate the entries of a GTF DataFrame by strand

    Entries whose strand is neither '+' nor '-' end up in
    neither output.

    Parameters:
        df (pandas DataFrame): DF of GTF with a 'Strand' column

    Returns:
        fwd (pandas DataFrame): Deep copy of all '+' strand entries
        rev (pandas DataFrame): Deep copy of all '-' strand entries
    """
    by_strand = {strand: df.loc[df.Strand == strand].copy(deep=True)
                 for strand in ('-', '+')}

    return by_strand['+'], by_strand['-']

def sort_gtf(df):
    """
    Sort a GTF into its proper ordering

    Entries are ordered by gene_id, then transcript_id (missing values
    first, so entries without a transcript_id precede the transcripts of
    their gene), then feature type (gene, transcript, exon), then Start.
    Start is ascending on the '+' strand and descending on the '-' strand.
    All '+' entries are placed before all '-' entries; entries with any
    other strand are not returned. Original index labels are retained.

    The input DataFrame is not modified.

    Parameters:
        df (pandas DataFrame): DF of GTF with 'Feature', 'Strand',
            'gene_id', 'transcript_id', and 'Start' columns

    Returns:
        df (pandas DataFrame): DF of GTF, sorted

    Raises:
        ValueError: If 'Feature' holds anything other than
            'gene', 'transcript', or 'exon'
    """
    feature_ranks = {'gene': 0, 'transcript': 1, 'exon': 2}
    sort_cols = ['gene_id', 'transcript_id', 'feature_rank', 'Start']

    # work on a copy so the helper column never leaks into the caller's DF
    # (and so slices passed in don't trigger chained-assignment warnings)
    df = df.copy(deep=True)
    df['feature_rank'] = df.Feature.map(feature_ranks)

    unranked = df.feature_rank.isnull()
    if unranked.any():
        unknown = sorted(set(str(f) for f in df.loc[unranked, 'Feature']))
        raise ValueError('Cannot sort GTF features {}; expected only {}'.format(
            unknown, list(feature_ranks.keys())))
    df['feature_rank'] = df.feature_rank.astype(int)

    # sort each strand separately because the direction of Start differs
    pieces = []
    for strand, start_ascending in (('+', True), ('-', False)):
        stranded = df.loc[df.Strand == strand]
        if len(stranded.index) > 0:
            pieces.append(stranded.sort_values(
                by=sort_cols,
                ascending=[True, True, True, start_ascending],
                na_position='first'))

    if pieces:
        df = pd.concat(pieces)
    else:
        # nothing to sort: return an empty frame that still has the columns
        df = df.iloc[0:0]

    return df.drop('feature_rank', axis=1)

def get_update_ends_settings(strand, mode):
    """
    Returns which columns to refer to and which min/max function
    to use depending on looking at forward / rev strand or
    tss / tes

    Parameters:
        strand (str): {'+', '-'}
        mode (str): {'tss', 'tes'}

    Returns:
        old_end (str): Name of column to modify; {'Start', 'End'}
        new_end (str): Name of column to pull new value from; {'Start_end', 'End_end'}
        gene_func (str): What function to apply to new_end; {'min', 'max'}

    Raises:
        ValueError: If `strand` or `mode` is not one of the supported values
    """
    if mode not in ('tss', 'tes'):
        raise ValueError("mode must be 'tss' or 'tes', got {!r}".format(mode))
    if strand not in ('+', '-'):
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    # the lower coordinate ('Start') is the TSS of a '+' strand feature
    # and the TES of a '-' strand feature; otherwise the end in question
    # is the higher coordinate ('End')
    uses_start = (mode == 'tss') == (strand == '+')
    if uses_start:
        return 'Start', 'Start_end', 'min'

    return 'End', 'End_end', 'max'

def update_transcript_ends(df, mode, strand):
    """
    Update the ends of transcripts and the first / last exon
    in a GTF. GTF must be sorted!

    Within each transcript_id group the first non-gene row is taken to be
    the transcript entry, the second its first exon, and the last its
    final exon, which is why the input has to be sorted. Rows are selected
    by index label, so the index of `df` should be unique. Rows whose
    reference end is missing (NaN) keep their original coordinate.

    `df` is modified in place and also returned.

    Parameters:
        df (pandas DataFrame): Sorted DF of GTF with 'Start_end', and 'End_end'
            columns denoting the boundaries of each end region
        mode (str): {'tss', 'tes'}
        strand (str): {'+', '-'}

    Returns:
        df (pandas DataFrame): DF of GTF with transcript and
            exon ends modified
    """
    old_col, new_col, _ = get_update_ends_settings(strand, mode)

    # gene entries are left out so every group starts at its transcript entry
    per_transcript = df.loc[df.Feature != 'gene'].groupby('transcript_id')
    if mode == 'tss':
        # transcript entry + first exon
        inds = per_transcript.head(2).index.tolist()
    else:
        # transcript entry + last exon
        inds = per_transcript.head(1).index.tolist()
        inds += per_transcript.tail(1).index.tolist()

    # only overwrite where a reference end was actually matched; writing NaN
    # here would make the integer conversion below fail
    target = df[new_col].notna() & df.index.isin(inds)
    df.loc[target, old_col] = df.loc[target, new_col]

    # the replacement values come from float columns; restore integer coords
    df['Start'] = df.Start.astype(int)
    df['End'] = df.End.astype(int)

    return df

def update_gene_ends(df, mode, strand):
    """
    Move one boundary of every gene entry to the most extreme
    matching boundary among that gene's transcript entries.

    The result comes from an inner merge on gene_id, so its index is
    renumbered and rows whose gene_id has no transcript entry are absent.

    Parameters:
        df (pandas DataFrame): GTF dataframe
        mode (str): {'tss', 'tes'}
        strand (str): {'+', '-'}

    Returns:
        df (pandas DataFrame): DataFrame of GTF with gene ends updated
    """
    # which coordinate column is affected and whether to take its min or max
    coord, _, reducer = get_update_ends_settings(strand, mode)
    gene_coord = '{}_gene'.format(coord)

    # most extreme transcript coordinate per gene
    transcripts = df[['Feature', 'gene_id', coord]].copy(deep=True)
    transcripts = transcripts.loc[transcripts.Feature == 'transcript']
    extremes = (transcripts
                .groupby(['gene_id', 'Feature'], observed=True)
                .agg(reducer)
                .reset_index()
                .drop('Feature', axis=1))

    # attach that coordinate to every row, then copy it onto the gene rows only
    df = df.merge(extremes, on='gene_id', suffixes=('', '_gene'))
    gene_rows = df.loc[df.Feature == 'gene'].index.tolist()
    df.loc[gene_rows, coord] = df.loc[gene_rows, gene_coord]

    return df.drop(gene_coord, axis=1)

def update_gtf_ends(gtf, tss, tes):
    """
    Update gene, transcript, and exon boundaries to be
    furthest upstream or downstream entry for end used

    The TSS pass runs first, then the TES pass. In each pass the reference
    regions are joined on the '<mode>_id' column, and the two strands are
    processed separately and stacked '+' first. Entries with any other
    strand are not returned. Index labels of the result are not unique when
    both strands are present, because each strand is renumbered on its own.

    Parameters:
        gtf (pandas DataFrame): DF of GTF with 'tss_id' and 'tes_id' columns
        tss (pyranges PyRanges): PyRanges object of reference TSSs
        tes (pyranges PyRanges): PyRanges object of reference TESs

    Returns:
        gtf (pandas DataFrame): DF of GTF with updated ends
            based on the TSSs and TESs used in the input beds
    """
    gtf = gtf.copy(deep=True)

    for mode, ends in zip(['tss', 'tes'], [tss, tes]):
        id_col = '{}_id'.format(mode)

        # bring in the region boundaries as 'Start_end' / 'End_end'
        end_coords = ends.df[['Start', 'End', id_col]]
        gtf = gtf.merge(end_coords, how='left',
                        on=id_col,
                        suffixes=('', '_end'))

        # fix exon, transcript, and gene boundaries one strand at a time
        updated = []
        for strand, stranded in zip(['+', '-'], get_stranded_gtf_dfs(gtf)):
            stranded = update_transcript_ends(stranded, mode, strand)
            stranded = update_gene_ends(stranded, mode, strand)
            updated.append(stranded)

        # a single concat of the per-strand results; seeding the accumulator
        # with an empty DataFrame relies on deprecated dtype handling
        gtf = pd.concat(updated)
        gtf.drop(['Start_end', 'End_end'], axis=1, inplace=True)

    return gtf

In [151]:
# Scratch driver for what is meant to become replace_gtf_ids(): read a GTF,
# attach cerberus transcript ids from the reference h5 and optionally move
# gene / transcript / exon boundaries out to the reference TSS / TES regions.
# NOTE(review): paths are absolute and machine-specific.
h5 = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/human_cerberus.h5'
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
# the full GTF path above is immediately overridden by the small test GTF
gtf = 'test_dupe_genes.gtf'
agg = True
update_ends = True

# def replace_gtf_ids():

df = pr.read_gtf(gtf).df
print(len(df.index))
# keep only the feature types that sort_gtf knows how to rank
entry_types = ['gene', 'transcript', 'exon']
df = df.loc[df.Feature.isin(entry_types)]
df = sort_gtf(df)

# read_h5 is not defined in this notebook (presumably from the cerberus
# star imports); its 2nd / 3rd return values are used as TSS / TES PyRanges
# and the 4th as the transcript id map
if not update_ends:
    _, _, _, m_df = read_h5(h5)
else:
    _, tss, tes, m_df = read_h5(h5)
    # hack to remove duplicates for now
    # tss = pr.PyRanges(tss.df[['Chromosome', 'Start', 'End', 'Strand', 'gene_id', 'tss']].drop_duplicates())
    # tes = pr.PyRanges(tes.df[['Chromosome', 'Start', 'End', 'Strand', 'gene_id', 'tes']].drop_duplicates())
    # build '<gene_id>_<end number>' keys that update_gtf_ends merges on
    tss = tss.df
    tes = tes.df
    tss['tss_id'] = tss.gene_id+'_'+tss.tss.astype(str)
    tes['tes_id'] = tes.gene_id+'_'+tes.tes.astype(str)
    tss = pr.PyRanges(tss)
    tes = pr.PyRanges(tes)


# drop map columns that are not wanted in the merged GTF
# (gene_name / gene_id already exist in the GTF itself)
m_df.drop(['transcript_triplet', 
           'gene_name', 'gene_id'], axis=1, inplace=True)

# left merge: entries without a match in the map (e.g. gene entries, which
# have no transcript id) keep NaN in the cerberus columns
df = df.merge(m_df, how='left', 
                left_on=['transcript_name', 'transcript_id'],
                right_on=['original_transcript_name', 'original_transcript_id'],
                suffixes=('', '_cerberus'))

if update_ends:
    df = update_gtf_ends(df, tss, tes)

if agg:
    if not update_ends:
        raise ValueError('Must update ends to aggregate transcripts')
    # aggregation itself is not implemented yet
    pass
    # gtf = agg_gtf_transcripts()
    
# swap the original transcript id / name for the cerberus ones; this happens
# last, after the steps above that group entries by the original transcript_id
df.drop(['transcript_id', 'transcript_name'], axis=1, inplace=True)
df.rename({'transcript_id_cerberus': 'transcript_id',
            'transcript_name_cerberus': 'transcript_name'},
           axis=1, inplace=True)

15


In [152]:
# show the GTF after the cerberus ids replaced the original transcript ids
df

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name
0,chr6,HAVANA,gene,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,,,,,,,,,,
1,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
2,chr6,HAVANA,exon,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
3,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
4,chr6,HAVANA,exon,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
5,chr1,HAVANA,gene,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,,,,,,,
6,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
7,chr1,HAVANA,exon,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
8,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
9,chr1,HAVANA,exon,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"


In [158]:
# list the available columns to pick the ones used for duplicate detection
df.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'gene_status', 'gene_type',
       'talon_gene', 'havana_gene', 'level', 'tag', 'transcript_status',
       'talon_transcript', 'source', 'transcript_type', 'havana_transcript',
       'protein_id', 'exon_number', 'exon_id', 'talon_exon', 'exon_status',
       'transcript_support_level', 'ccdsid', 'original_transcript_id', 'ic',
       'ic_id', 'tss_id', 'tss', 'tes_id', 'tes', 'original_transcript_name',
       'transcript_id', 'transcript_name'],
      dtype='object')

In [157]:
# Columns compared when looking for entries that became identical after the
# transcript ids were replaced; the original_transcript_* columns are left
# out, so entries from different source transcripts can count as duplicates.
dupe_cols = ['Chromosome', 'Source', 
             'Feature', 
             'Start', 'End', 
             'Score', 'Strand', 'Frame', 'gene_id', 'gene_name', 
             'gene_status', 'gene_type', 'talon_gene', 
             'ic', 'ic_id', 'tss_id', 'tss', 'tes_id', 'tes', 'transcript_id',
             'transcript_name']
# keep=False flags every member of a duplicated set, not just the repeats
df.loc[df[dupe_cols].duplicated(keep=False)]
# df[dupe_cols]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name
1,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
2,chr6,HAVANA,exon,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
3,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
4,chr6,HAVANA,exon,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
6,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
7,chr1,HAVANA,exon,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
8,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
9,chr1,HAVANA,exon,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
11,chrX,HAVANA,transcript,83508239,83512177,.,+,.,ENSG00000196767.7,POU3F4,...,ENST00000373200.5,1.0,ENSG00000196767_1,ENSG00000196767_1,1.0,ENSG00000196767_1,1.0,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"
12,chrX,HAVANA,exon,83508239,83512177,.,+,.,ENSG00000196767.7,POU3F4,...,ENST00000373200.5,1.0,ENSG00000196767_1,ENSG00000196767_1,1.0,ENSG00000196767_1,1.0,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"


In [98]:
# NOTE(review): this cell has a lower execution count than the cells above,
# so the output shown for it may come from an earlier state of `df`
df

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name
0,chr6,HAVANA,gene,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,,,,,,,,,,
1,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
2,chr6,HAVANA,exon,1609864,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
3,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
4,chr6,HAVANA,exon,1610066,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
5,chr1,HAVANA,gene,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,,,,,,,
6,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
7,chr1,HAVANA,exon,1232186,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
8,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
9,chr1,HAVANA,exon,1232225,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"


In [99]:
# look up the reference TSS region used by the FOXC1 transcripts
# (assumes `tss` is still the PyRanges built in the driver cell)
tss_df = tss.df
tss_df.loc[tss_df.Name == 'ENSG00000054598_1']

Unnamed: 0,Chromosome,Start,End,Strand,Name,source,gene_id,tss,tss_id
46048,chr6,1609864,1609965,+,ENSG00000054598_1,"v39,v29,talon",ENSG00000054598,1,ENSG00000054598_1


In [57]:
# transcript entries only, ordered by transcript id
# NOTE(review): `gtf` must be a DataFrame here, but other cells bind it to a
# file path string; this depends on which cell ran last
temp = gtf.loc[gtf.Feature == 'transcript'].copy(deep=True)
temp = temp.sort_values(by='transcript_id')

In [58]:
# The first two expressions are evaluated but neither stored nor displayed
# (only a trailing expression is shown by the notebook, and this cell ends
# with an assignment): chromosomes holding duplicated transcript ids, and
# the duplicated transcripts on chr6.
temp.loc[temp.transcript_id.duplicated(keep=False), 'Chromosome'].unique()
temp.loc[(temp.transcript_id.duplicated(keep=False))&(temp.Chromosome=='chr6')]
# keep every transcript entry whose transcript id occurs more than once
temp = temp.loc[temp.transcript_id.duplicated(keep=False)]

In [68]:
# Write the small test GTF ('test_dupe_genes.gtf') read by the driver cell:
# every entry of three genes taken from the full TALON GTF.
# NOTE(review): absolute, machine-specific path; `gtf` is rebound from a
# path string to a DataFrame on the second line.
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
gtf = pr.read_gtf(gtf).df
temp = gtf.loc[gtf.gene_name.isin(['FOXC1', 'B3GALT6', 'POU3F4'])]
temp = pr.PyRanges(temp)
temp.to_gtf('test_dupe_genes.gtf')

In [60]:
# pull the entries of three cerberus transcript ids for inspection
# (assumes `gtf` is a DataFrame whose transcript_id column already holds
# cerberus ids — depends on earlier kernel state)
tids = ['ENSG00000054598[1,1,1]', 'ENSG00000176022[1,1,1]', 'ENSG00000196767[1,1,1]']
temp = gtf.loc[gtf.transcript_id.isin(tids)]
# temp = sort_gtf(temp)
temp

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name
38975,chr6,HAVANA,transcript,1609864,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
38976,chr6,HAVANA,exon,1609864,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000380874.3,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
38977,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
38978,chr6,HAVANA,exon,1610066,1613947,.,+,.,ENSG00000054598.7,FOXC1,...,ENST00000645831.1,1.0,ENSG00000054598_1,ENSG00000054598_1,1.0,ENSG00000054598_1,1.0,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
477309,chr1,HAVANA,transcript,1232186,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
477310,chr1,HAVANA,exon,1232186,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000379198.4,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
477311,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
477312,chr1,HAVANA,exon,1232225,1235091,.,+,.,ENSG00000176022.5,B3GALT6,...,ENST00000647651.1,1.0,ENSG00000176022_1,ENSG00000176022_1,1.0,ENSG00000176022_1,1.0,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
528989,chrX,HAVANA,transcript,83508239,83512177,.,+,.,ENSG00000196767.7,POU3F4,...,ENST00000373200.5,1.0,ENSG00000196767_1,ENSG00000196767_1,1.0,ENSG00000196767_1,1.0,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"
528990,chrX,HAVANA,exon,83508239,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,ENST00000373200.5,1.0,ENSG00000196767_1,ENSG00000196767_1,1.0,ENSG00000196767_1,1.0,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"


In [47]:
# load the full TALON GTF as a DataFrame (absolute, machine-specific path)
test = pr.read_gtf('/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf').df

In [48]:
# restrict the GTF to six original (ENST) transcript ids
tids = ['ENST00000380874.3', 'ENST00000645831.1', 'ENST00000647651.1', 'ENST00000379198.4', 'ENST00000373200.5', 'ENST00000644024.1']
test = test.loc[test.transcript_id.isin(tids)].copy(deep=True)

In [51]:
# NOTE(review): the printed output recorded for this cell comes from debug
# print statements that are commented out in the current sort_gtf
test = sort_gtf(test)

gene_id          object
transcript_id    object
feature_rank      int64
Start             int32
dtype: object
            Feature            gene_id      transcript_id  feature_rank  \
380352   transcript  ENSG00000054598.7  ENST00000380874.3             1   
380353         exon  ENSG00000054598.7  ENST00000380874.3             2   
380354   transcript  ENSG00000054598.7  ENST00000645831.1             1   
380355         exon  ENSG00000054598.7  ENST00000645831.1             2   
700      transcript  ENSG00000176022.5  ENST00000379198.4             1   
701            exon  ENSG00000176022.5  ENST00000379198.4             2   
698      transcript  ENSG00000176022.5  ENST00000647651.1             1   
699            exon  ENSG00000176022.5  ENST00000647651.1             2   
1189029  transcript  ENSG00000196767.7  ENST00000373200.5             1   
1189030        exon  ENSG00000196767.7  ENST00000373200.5             2   
1189031  transcript  ENSG00000196767.7  ENST00000644024.1        

In [41]:
# NOTE(review): lower execution count than the sort cell above, so this
# output may show `test` in an earlier state
test

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,tag,transcript_type,transcript_support_level,havana_transcript,NNC_transcript,protein_id,ccdsid,intergenic_novel,antisense_gene,gene_antisense_to_IDs
380352,chr6,HAVANA,transcript,1609971,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,CCDS,protein_coding,,OTTHUMT00000043450.1,,ENSP00000370256.2,CCDS4473.1,,,
380353,chr6,HAVANA,exon,1609971,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,CCDS,protein_coding,,,,ENSP00000370256.2,CCDS4473.1,,,
380354,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,CCDS,protein_coding,,OTTHUMT00000495790.1,,ENSP00000493906.1,CCDS4473.1,,,
380355,chr6,HAVANA,exon,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,CCDS,protein_coding,,,,ENSP00000493906.1,CCDS4473.1,,,
700,chr1,HAVANA,transcript,1232258,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,CCDS,protein_coding,,OTTHUMT00000005071.2,,ENSP00000368496.2,CCDS13.1,,,
701,chr1,HAVANA,exon,1232258,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,CCDS,protein_coding,,,,ENSP00000368496.2,CCDS13.1,,,
698,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,appris_principal_1,protein_coding,,OTTHUMT00000500823.1,,ENSP00000496787.1,,,,
699,chr1,HAVANA,exon,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,appris_principal_1,protein_coding,,,,ENSP00000496787.1,,,,
1189029,chrX,HAVANA,transcript,83508260,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,CCDS,protein_coding,,OTTHUMT00000057368.5,,ENSP00000362296.2,CCDS14450.1,,,
1189030,chrX,HAVANA,exon,83508260,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,CCDS,protein_coding,,,,ENSP00000362296.2,CCDS14450.1,,,


In [None]:
# def agg_gtf_transcripts():
gb_cols = ['Chromosome', 'Source', 'Start', 'End', 'Score', 'Strand', 'Frame', 'gene_name', 'gene_id', 'gene_status', 'gene_type', 

## testing

In [153]:
def make_end_df(c,s,st,e,n, source,mode):
    """
    Build a DataFrame of end regions (e.g. TSSs or TESs) for testing

    Only arguments given as lists become columns. Names are split on '_':
    the first piece becomes 'gene_id' and the second becomes the `mode`
    column (kept as a string). When names are missing (no 'Name' column or
    any null name), 'gene_id', `mode`, and '<mode>_id' are filled with NaN.

    Parameters:
        c (list): Chromosome of each region
        s (list): Strand of each region
        st (list): Start coordinate of each region
        e (list): End coordinate of each region
        n (list): Name of each region, '<gene_id>_<end number>'
        source (str): Value of the 'source' column
        mode (str): Name of the end number column, e.g. {'tss', 'tes'}

    Returns:
        df (pandas DataFrame): Regions sorted by format_end_df with
            'gene_id', `mode`, 'id' (row number), and '<mode>_id' added
    """
    df = pd.DataFrame()
    columns = ['Chromosome', 'Strand', 'Start', 'End', 'Name']
    for col, values in zip(columns, [c, s, st, e, n]):
        if isinstance(values, list):
            df[col] = values

    # add source
    df['source'] = source

    df = format_end_df(df)

    # get end # and gene id
    names_usable = 'Name' in df.columns and not df.Name.isnull().any()
    if names_usable:
        name_parts = df.Name.str.split('_', expand=True)
        df['gene_id'] = name_parts[0]
        df[mode] = name_parts[1]
    else:
        df['gene_id'] = np.nan
        df[mode] = np.nan

    # get arbitrary unique ids
    df['id'] = list(range(len(df.index)))

    # get id; NaN floats can't be joined with '_', so fill NaN directly
    id_col = '{}_id'.format(mode)
    if names_usable:
        df[id_col] = df.gene_id+'_'+df[mode]
    else:
        df[id_col] = np.nan

    return df

def format_end_df(df):
    """
    Put an end region DataFrame into a standard layout

    Rows are sorted by Chromosome, Start, End, and Strand, the columns are
    limited to Chromosome, Start, End, Strand, Name, and source (whichever
    are present, in that order), and the index is renumbered from 0.

    Parameters:
        df (pandas DataFrame): DF of end regions

    Returns:
        df (pandas DataFrame): Sorted DF with standard column order
    """
    layout = ('Chromosome', 'Start', 'End', 'Strand', 'Name', 'source')
    present = [col for col in layout if col in df.columns]

    ordered = df.sort_values(by=['Chromosome', 'Start', 'End', 'Strand'])
    return ordered[present].reset_index(drop=True)

def make_exon_df(n,c,e,s,g,t):
    """
    Build a DataFrame of the exon entries of one transcript for testing

    Each coordinate pair is normalized so that Start <= End. Exons are
    ordered by ascending Start on the '+' strand and descending Start on
    the '-' strand; the strand of the first exon decides the direction.

    Parameters:
        n (int): Number of exons; not used, kept so existing calls work
        c (list of str): Chromosome of each exon
        e (list of [int, int]): Coordinate pair of each exon, in either order
        s (list of str): Strand of each exon; {'+', '-'}
        g (str): Used as both gene_name and gene_id
        t (str): Used as both transcript_id and transcript_name

    Returns:
        df (pandas DataFrame): One row per exon with Feature == 'exon'

    Raises:
        ValueError: If the first strand is not '+' or '-'
    """
    strand = s[0] if len(s) > 0 else None
    if strand not in ('+', '-'):
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Strand'] = s
    df['Feature'] = 'exon'
    for col in ['gene_name', 'gene_id']:
        df[col] = g
    for col in ['transcript_id', 'transcript_name']:
        df[col] = t

    # reorder starts / stops if needed so that Start is the lower coordinate
    coords = pd.DataFrame({'first': [pair[0] for pair in e],
                           'second': [pair[1] for pair in e]},
                          index=df.index)
    df['Start'] = coords.min(axis=1)
    df['End'] = coords.max(axis=1)

    # exons run 5' -> 3', i.e. descending coordinates on the '-' strand
    df.sort_values(by='Start', ascending=(strand == '+'), inplace=True)
    return df

def make_hier_entry(df, how='t'):
    """
    Build transcript or gene entries spanning the entries in `df`

    Entries are grouped by chromosome, strand, and gene (plus transcript
    and, when present, 'tss_id' / 'tes_id' for how='t'); each group yields
    one row running from its lowest to its highest coordinate.

    Parameters:
        df (pandas DataFrame): DF of GTF entries with 'Start' and 'End'
        how (str): {'g', 't'}; make gene or transcript entries

    Returns:
        t_df (pandas DataFrame): One row per group with the grouping
            columns, 'Start', 'End', and 'Feature'

    Raises:
        ValueError: If `how` is not 'g' or 't'
    """
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name']
        gb_cols += [col for col in ('tss_id', 'tes_id') if col in df.columns]
        feature = 'transcript'
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
        feature = 'gene'
    else:
        raise ValueError("how must be 'g' or 't', got {!r}".format(how))

    # Start / End may be given in either order, so normalize per row first
    t_df = df.copy(deep=True)
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)

    t_df = t_df[gb_cols + ['min_coord', 'max_coord']]
    t_df = t_df.groupby(gb_cols).agg({'min_coord': 'min',
                                      'max_coord': 'max'}).reset_index()
    t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1, inplace=True)
    t_df['Feature'] = feature

    return t_df

def make_test_gtf(ts):
    """
    Assemble a sorted test GTF from per-transcript exon DataFrames

    Parameters:
        ts (list of pandas DataFrame): Exon entries, one DF per transcript

    Returns:
        df (pandas DataFrame): Exon, transcript, and gene entries
            in the order produced by sort_gtf
    """
    exon_df = pd.concat(ts)

    # derive the transcript and gene entries from the exons
    transcript_df = make_hier_entry(exon_df, how='t')
    gene_df = make_hier_entry(exon_df, how='g')

    # order by gene id, transcript id, feature rank
    # (gene=0, transcript=1, exon=2), then start coords
    return sort_gtf(pd.concat([exon_df, transcript_df, gene_df]))

In [154]:
# tests for update_gtf_ends
def test_update_gtf_ends(print_dfs=True):
    """
    Check update_gtf_ends against a hand-built expected GTF

    The input GTF holds two '+' strand transcripts of one gene (one of
    which already matches its reference ends), two '-' strand transcripts
    of another gene, and one monoexonic gene per strand. The expected GTF
    has the same transcripts with their outer exon boundaries moved to the
    boundaries of the reference TSS / TES regions.

    Parameters:
        print_dfs (bool): Print both DataFrames before comparing them
    """

    def build_exon_dfs(specs):
        # one exon DF per (exon coords, strand, gene, transcript, end id);
        # each transcript uses the same id for its tss and its tes
        frames = []
        for exons, strand, gene, transcript, end_id in specs:
            n_exons = len(exons)
            exon_df = make_exon_df(n_exons,
                                   ['1' for _ in range(n_exons)],
                                   exons,
                                   [strand for _ in range(n_exons)],
                                   gene, transcript)
            exon_df['tss_id'] = end_id
            exon_df['tes_id'] = end_id
            frames.append(exon_df)
        return frames

    def build_end_ref(mode, starts, stops):
        # reference regions for the five ends used by the transcripts
        names = ['g1_1', 'g2_1', 'g2_2', 'g3_1', 'g4_1']
        strands = ['+', '-', '-', '+', '-']
        chroms = ['1' for _ in range(len(names))]
        return pr.PyRanges(make_end_df(chroms, strands, starts, stops,
                                       names, 'v1', mode))

    # transcripts before their ends are updated
    input_specs = [
        # t1 - fwd strand transcript from gene w/ >2 transcripts
        ([[1,10], [14,20], [25,30]], '+', 'g1', 'g1_t1', 'g1_1'),
        # t1.5 - fwd strand transcript from gene that doesn't
        # need boundaries updated
        ([[0, 10], [14,20], [25, 35]], '+', 'g1', 'g1_t1.5', 'g1_1'),
        # t2 - rev. strand transcript from gene w/ >2 transcripts
        ([[90,60], [45,30], [10,8]], '-', 'g2', 'g2_t1', 'g2_1'),
        # t3 - rev. strand transcript from gene w/ >2 transcripts
        ([[95,60], [45,30], [10,6]], '-', 'g2', 'g2_t2', 'g2_2'),
        # t4 - fwd strand gene w/ monoexonic transcript
        ([[20,30]], '+', 'g3', 'g3_t1', 'g3_1'),
        # t5 - rev strand gene w/ monoexonic transcript
        ([[50, 40]], '-', 'g4', 'g4_t1', 'g4_1'),
    ]
    ts = build_exon_dfs(input_specs)

    # tss / tes references
    tss = build_end_ref('tss', [0, 85, 91, 19, 50], [3, 93, 98, 22, 55])
    tes = build_end_ref('tes', [25, 7, 4, 30, 35], [35, 9, 6, 31, 40])

    test_df = make_test_gtf(ts)
    test = update_gtf_ends(test_df, tss, tes)

    # ctrl: the same transcripts with outer boundaries at the reference ends
    expected_specs = [
        ([[0,10], [14,20], [25,35]], '+', 'g1', 'g1_t1', 'g1_1'),
        ([[0, 10], [14,20], [25, 35]], '+', 'g1', 'g1_t1.5', 'g1_1'),
        ([[93,60], [45,30], [10,7]], '-', 'g2', 'g2_t1', 'g2_1'),
        ([[98,60], [45,30], [10,4]], '-', 'g2', 'g2_t2', 'g2_2'),
        ([[19,31]], '+', 'g3', 'g3_t1', 'g3_1'),
        ([[55, 35]], '-', 'g4', 'g4_t1', 'g4_1'),
    ]
    ctrl = make_test_gtf(build_exon_dfs(expected_specs))

    # index labels differ between the two, so compare on a fresh index
    ctrl.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)

    if print_dfs:
        for label, frame in (('test', test), ('ctrl', ctrl)):
            print(label)
            print(frame)
            print(frame.index)
            print(frame.dtypes)

    pd.testing.assert_frame_equal(ctrl, test, check_like=True)

    assert len(ctrl.index) == len(test.index)

In [155]:
# run the check without printing the DataFrames; passes silently
test_update_gtf_ends(False)

In [583]:
# tests for map_gtf_ids
# - transcript that does not have duplicate
# - transcript that does have a duplicate
# NOTE(review): map_gtf_ids is not defined anywhere in this notebook and the
# control at the bottom of the cell is unfinished; only fixtures are built.

ts = []
# t1 - transcript that doesn't need to be merged
n = 3
c = ['1' for i in range(n)]
e = [[1,10], [14,20], [25,30]]
s = ['+' for i in range(n)]
g = 'g1'
t = 'g1_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t2 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[90,60], [45,30], [10,8]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t3 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[95,60], [45,30], [10,6]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t2'
ts.append(make_exon_df(n,c,e,s,g,t))

test_df = make_test_gtf(ts)

# map file: original transcript id / name -> cerberus transcript id / name;
# g2_t1 and g2_t2 map to the same cerberus transcript
otid = ['g1_t1', 'g2_t1', 'g2_t2',]
otname = [o+'n' for o in otid]
tid = ['g1[1,1,1]', 'g2[1,1,1]', 'g2[1,1,1]']
# insert 'n' after the gene part, e.g. 'g1[1,1,1]' -> 'g1n[1,1,1]'
tname = [o.split('[')[0]+'n['+o.split('[')[1] for o in tid]
gid = ['g1', 'g2', 'g2']
gname = ['g1n', 'g2n', 'g2n']
m_df = pd.DataFrame()
m_df['original_transcript_id'] = otid
m_df['original_transcript_name'] = otname
m_df['transcript_id'] = tid
m_df['transcript_name'] = tname
m_df['gene_name'] = gname
# drop the two-character gene id prefix, leaving e.g. '[1,1,1]'
m_df['transcript_triplet'] = m_df.transcript_id.str.slice(2)
m_df['gene_id'] = gid

# control (unfinished; rebinds otid, which m_df was already built from)
otid = ['g1_t1', 'g2_t2']

In [209]:
# show the test map DataFrame
m_df

Unnamed: 0,original_transcript_id,original_transcript_name,transcript_id,transcript_name,gene_name,transcript_triplet,gene_id
0,g1_t1,g1_t1n,"g1[1,1,1]","g1n[1,1,1]",g1n,"[1,1,1]",g1
1,g2_t1,g2_t1n,"g2[1,1,1]","g2n[1,1,1]",g2n,"[1,1,1]",g2
2,g2_t2,g2_t2n,"g2[1,1,1]","g2n[1,1,1]",g2n,"[1,1,1]",g2


In [210]:
# NOTE(review): the recorded output still has a 'feature_rank' column, which
# the current sort_gtf drops — it was presumably produced by an older version
test_df

Unnamed: 0,Chromosome,Strand,Feature,gene_name,gene_id,transcript_id,transcript_name,Start,End,feature_rank
0,1,+,gene,g1,g1,,,1,30,0
0,1,+,transcript,g1,g1,g1_t1,g1_t1,1,30,1
0,1,+,exon,g1,g1,g1_t1,g1_t1,1,10,2
1,1,+,exon,g1,g1,g1_t1,g1_t1,14,20,2
2,1,+,exon,g1,g1,g1_t1,g1_t1,25,30,2
1,1,-,gene,g2,g2,,,6,95,0
1,1,-,transcript,g2,g2,g2_t1,g2_t1,8,90,1
0,1,-,exon,g2,g2,g2_t1,g2_t1,60,90,2
1,1,-,exon,g2,g2,g2_t1,g2_t1,30,45,2
2,1,-,exon,g2,g2,g2_t1,g2_t1,8,10,2


In [163]:
# check that the coordinate columns of the test GTF are integer typed
test_df.dtypes

Chromosome         object
Strand             object
Feature            object
gene_name          object
gene_id            object
transcript_id      object
transcript_name    object
Start               int64
End                 int64
feature_rank        int64
dtype: object

In [None]:
# # tests for update_gtf_ends()
# def make_gtf(c,e,s,g,t):


# # t1 - transcript that doesn't need to be merged
# n = 3
# c = ['1' for i in range(n)]
# e [[1,10], [14,20], [25,30]]
# s = ['+' for i in range(n)]
# g = 'g1'
# t = 'g1_t1'

# # t2 - rev. strand transcript that needs ends to be updated
# n = 3
# c = ['1' for i in range(n)]
# e = [[90,60], [45,30], [10,8]]
# s = ['-' for i in range(n)]
# g = 'g2'
# t = 'g1_t1'

# df = pd.DataFrame()
# df['Chromosome'] = c
# df['Start'] = [i[0] for i in e]
# df['End'] = [i[1] for i in e]
# df['Strand'] = s
# cols = ['gene_name', 'gene_id']
# for c in cols:
#     df[c] = g
# cols = []

