In [2]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *


In [3]:
def replace_gtf_ids(gtf, h5, agg):
    """
    Replace transcript ids and names in a gtf with the triplets
    calculated from assign_triplets

    Parameters:
        gtf (str): Path to gtf file
        h5 (str): Path to h5 annotation (output from assign)
        agg (bool): Whether or not to collapse transcripts with
            duplicate triplets

    Returns:
        df (pyranges PyRanges): PyRanges gtf table with updated ids
    """
    
    # keep only the gene, exon, and transcript entries
    df = pr.read_gtf(gtf).df
    entry_types = ['gene', 'transcript', 'exon']
    df = df.loc[df.Feature.isin(entry_types)]
   
    if not update_ends:
        _, _, _, m_df = read_h5(h5)
    else:
        _, tss, tes, m_df = read_h5(h5)

#     # groupby transcripts that are the same
#     gb_cols = ['gene_name', 'gene_id', 'transcript_triplet',
#                'transcript_id', 'transcript_name']
#     temp = m_df[['transcript_id',
#                  'original_transcript_id',
#                  'original_transcript_name']].copy(deep=True)
#     m_df = m_df.groupby(gb_cols).agg({'original_transcript_id': ','.join,
#                                       'original_transcript_name': ','.join}).reset_index()
#     m_df = m_df.merge(temp, on='transcript_id', suffixes=('','_merge'))
#     m_df.drop(['gene_name', 'gene_id', 'transcript_triplet'],
#               axis=1, inplace=True)

#     # add new transcript ids
#     df = df.merge(m_df, left_on=['transcript_id', 'transcript_name'],
#                   right_on=['original_transcript_id_merge',
#                             'original_transcript_name_merge'],
#                  suffixes=('_x', ''))

#     # drop old tids
#     df.drop(['transcript_id_x', 'transcript_name_x',
#              'original_transcript_name_merge'],
#             axis=1, inplace=True)

#     # remove duplicated transcripts; just keeping the first one
#     if agg:
#         temp = df[['transcript_id', 'original_transcript_id_merge']].drop_duplicates()
#         dupe_old_tids = temp.loc[temp.transcript_id.duplicated(keep='first'), 'original_transcript_id_merge']
#         df = df.loc[~df.original_transcript_id_merge.isin(dupe_old_tids)]

#     # drop last column
#     df.drop('original_transcript_id_merge', axis=1, inplace=True)

#     df = pr.PyRanges(df)

#     return df

## modify / collapse transcripts in TALON GTF

In [499]:
h5 = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/human_cerberus.h5'
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
agg = True
update_ends = True

# replace_gtf_ids(gtf, h5, agg, update_ends)

In [534]:
h5 = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/human_cerberus.h5'
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
agg = True
update_ends = True

df = pr.read_gtf(gtf).df
print(len(df.index))
entry_types = ['gene', 'transcript', 'exon']
df = df.loc[df.Feature.isin(entry_types)]

if not update_ends:
    _, _, _, m_df = read_h5(h5)
else:
    _, tss, tes, m_df = read_h5(h5)
    # hack to remove duplicates for now
    tss = pr.PyRanges(tss.df[['Chromosome', 'Start', 'End', 'Strand', 'gene_id', 'tss']].drop_duplicates())
    tes = pr.PyRanges(tes.df[['Chromosome', 'Start', 'End', 'Strand', 'gene_id', 'tes']].drop_duplicates())
    tss = tss.df
    tes = tes.df
    tss['tss_id'] = tss.gene_id+'_'+tss.tss.astype(str)
    tes['tes_id'] = tes.gene_id+'_'+tes.tes.astype(str)
    tss = pr.PyRanges(tss)
    tes = pr.PyRanges(tes)


m_df.drop(['transcript_triplet', 
           'gene_name', 'gene_id'], axis=1, inplace=True)

df = map_gtf_transcripts(df, m_df)

if update_ends:
    gtf = update_gtf_ends(df, tss, tes)

1216320


In [536]:
# # after just tss on fwd strand
# gtf.head()

In [537]:
# spot check - fwd strand

In [538]:
gtf.loc[gtf.gene_id == 'ENCODEHG000058846', ['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'tss_id', 'tes_id']] 

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,tss_id,tes_id
29,gene,ENCODEHG000058846,,+,10597.0,11656.0,,
30,transcript,ENCODEHG000058846,"ENCODEHG000058846[1,1,1]",+,10597.0,11656.0,ENCODEHG000058846_1,ENCODEHG000058846_1
31,exon,ENCODEHG000058846,"ENCODEHG000058846[1,1,1]",+,10597.0,10791.0,ENCODEHG000058846_1,ENCODEHG000058846_1
32,exon,ENCODEHG000058846,"ENCODEHG000058846[1,1,1]",+,10882.0,11057.0,ENCODEHG000058846_1,ENCODEHG000058846_1
33,exon,ENCODEHG000058846,"ENCODEHG000058846[1,1,1]",+,11434.0,11656.0,ENCODEHG000058846_1,ENCODEHG000058846_1


In [539]:
tss_df = tss.df
tss_df.loc[tss_df.tss_id == 'ENCODEHG000058846_1']

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,tss,tss_id
0,SIRV1,10597,10698,+,ENCODEHG000058846,1,ENCODEHG000058846_1


In [540]:
tes_df = tes.df
tes_df.loc[tes_df.tes_id == 'ENCODEHG000058846_1']

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,tes,tes_id
0,SIRV1,11555,11656,+,ENCODEHG000058846,1,ENCODEHG000058846_1


In [541]:
# spot check - rev strand

In [547]:
gtf.loc[gtf.gene_id == 'ENCODEHG000058784', ['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'tss_id', 'tes_id']]

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,tss_id,tes_id
0,gene,ENCODEHG000058784,,-,945.0,5956.0,,
1,transcript,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3615.0,4814.0,ENCODEHG000058784_1,ENCODEHG000058784_2
2,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,4687.0,4814.0,ENCODEHG000058784_1,ENCODEHG000058784_2
3,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3966.0,4479.0,ENCODEHG000058784_1,ENCODEHG000058784_2
4,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3615.0,3825.0,ENCODEHG000058784_1,ENCODEHG000058784_2
5,transcript,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,945.0,5956.0,ENCODEHG000058784_2,ENCODEHG000058784_1
6,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,5788.0,5956.0,ENCODEHG000058784_2,ENCODEHG000058784_1
7,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4687.0,4800.0,ENCODEHG000058784_2,ENCODEHG000058784_1
8,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4338.0,4479.0,ENCODEHG000058784_2,ENCODEHG000058784_1
9,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,3966.0,4094.0,ENCODEHG000058784_2,ENCODEHG000058784_1


In [545]:
tss_df = tss.df
tss_df.loc[tss_df.tss_id == 'ENCODEHG000058784_1']

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,tss,tss_id
3,SIRV2,4713,4814,-,ENCODEHG000058784,1,ENCODEHG000058784_1


In [546]:
tes_df = tes.df
tes_df.loc[tes_df.tes_id == 'ENCODEHG000058784_2']

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,tes,tes_id
3,SIRV2,3615,3716,-,ENCODEHG000058784,2,ENCODEHG000058784_2


In [533]:
def update_gtf_ends(gtf, tss, tes):
    """
    Update gene, transcript, and exon boundaries to be 
    furthest upstream or downstream entry for end used
    """
    gtf = gtf.copy(deep=True)
    gtf = sort_gtf(gtf)
    # print(gtf.loc[gtf.gene_id == 'ENCODEHG000058844', ['Feature', 'gene_id', 'transcript_id', 'Start', 'End']])

    for mode, ends in zip(['tss', 'tes'], [tss, tes]):
        ends = ends.df
        ends = ends[['Start', 'End', '{}_id'.format(mode)]]
        gtf = gtf.merge(ends, how='left',
                        on='{}_id'.format(mode),
                        suffixes=('', '_end'))

        fwd, rev = get_stranded_gtf_dfs(gtf)
        df = pd.DataFrame()
        for strand, temp in zip(['+', '-'], [fwd, rev]):
    
            # fix exon, transcript, and gene boundaries
            temp = update_transcript_ends(temp, mode, strand)
            temp = update_gene_ends(temp, mode, strand)
            df = pd.concat([df, temp])
        
        gtf = df.copy(deep=True)
        gtf.drop(['Start_end', 'End_end'], axis=1, inplace=True)
        
    return gtf


In [530]:
# print(rev[['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']].head(20))

def get_update_ends_settings(strand, mode):
    if mode == 'tss': 
        if strand == '+':
            old_end = 'Start'
            new_end = 'Start_end'
            gene_func = 'min'
        elif strand == '-':
            old_end = 'End'
            new_end = 'End_end'
            gene_func = 'max'
    elif mode == 'tes':
        if strand == '+':
            old_end = 'End'
            new_end = 'End_end'
            gene_func = 'max'
        elif strand == '-':
            old_end = 'Start'
            new_end  = 'Start_end'
            gene_func = 'min'
            
    return old_end, new_end, gene_func

def update_transcript_ends(df, mode, strand):
    """
    Update the ends of transcripts and the first / last exon
    in a GTF. GTF must be sorted!
    """
    old_col, new_col, gene_func = get_update_ends_settings(strand, mode)
    
    temp = df[['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']].copy(deep=True)
    temp = temp.loc[temp.Feature != 'gene']
    if mode == 'tss':
        inds = temp.groupby('transcript_id').head(2).index.tolist()
    elif mode == 'tes':
        inds = temp.groupby('transcript_id').head(1).index.tolist()
        inds += temp.groupby('transcript_id').tail(1).index.tolist()
    
    # print('transcript')
    # print(df.columns)
    # print(old_col)
    # print(new_col)
    # print()
    df.loc[inds, old_col] = df.loc[inds, new_col]
    
    return df

def update_gene_ends(df, mode, strand):
    """
    Update the ends of genes in a GTF. 
    """
    old_col, new_col, gene_func = get_update_ends_settings(strand, mode)
    temp = df[['Feature', 'gene_id', old_col]].copy(deep=True)
    temp = temp.loc[temp.Feature == 'transcript']
    # print(temp.loc[temp.gene_id == 'ENCODEHG000058784'])
    temp = temp.groupby(['gene_id', 'Feature'], observed=True).agg(gene_func).reset_index()
    temp.drop('Feature', axis=1, inplace=True)
    # print(temp.loc[temp.gene_id == 'ENCODEHG000058784'])
    df = df.merge(temp, on='gene_id', suffixes=('', '_gene'))
    inds = df.loc[df.Feature == 'gene'].index.tolist()
    # print('gene')
    # print(df.columns)
    # print(old_col)
    # print('{}_gene'.format(old_col))
    # print()
    
    df.loc[inds, old_col] = df.loc[inds, '{}_gene'.format(old_col)]
    df.drop('{}_gene'.format(old_col), axis=1, inplace=True)
    
    return df




In [490]:
def get_stranded_gtf_dfs(df):
    """
    Split a GTF df into fwd and rev strands
    """
    rev = df.loc[df.Strand == '-'].copy(deep=True)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)
    
    return fwd, rev

def sort_gtf(df):
    """
    Sort a GTF 
    """
    df['feature_rank'] = df.Feature.map({'gene':0, 'transcript':1, 'exon':2})
    df.feature_rank = df.feature_rank.astype(int)    
     
    fwd, rev = get_stranded_gtf_dfs(df)

    df = pd.DataFrame()
    for temp in [fwd, rev]:
        if len(temp.index) > 0:
            strand = temp.Strand.values.tolist()[0]
            if strand == '+':
                ascending = True
            elif strand == '-':
                ascending = False
            temp.sort_values(by=['gene_id', 'transcript_id', 'feature_rank', 'Start'], 
                             ascending=[True, True, True, ascending],
                             na_position='first', inplace=True)
            
            df = pd.concat([df, temp])
    df.drop('feature_rank', axis=1, inplace=True)
    return df

In [480]:
mode

'tes'

In [471]:
df.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'gene_status', 'gene_type',
       'talon_gene', 'havana_gene', 'level', 'transcript_status',
       'talon_transcript', 'NIC_transcript', 'exon_number', 'exon_id',
       'talon_exon', 'exon_status', 'ont', 'source', 'tag', 'transcript_type',
       'transcript_support_level', 'havana_transcript', 'NNC_transcript',
       'protein_id', 'ccdsid', 'intergenic_novel', 'antisense_gene',
       'gene_antisense_to_IDs', 'original_transcript_id', 'ic', 'ic_id',
       'tss_id', 'tss', 'tes_id', 'tes', 'original_transcript_name',
       'transcript_id', 'transcript_name', 'Start_end', 'End_end'],
      dtype='object')

In [474]:
df.loc[df.gene_id == 'ENCODEHG000058784', ['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']]

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,Start_end,End_end
0,gene,ENCODEHG000058784,,-,945.0,4764,,
1,transcript,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3615.0,4764,3615.0,3716.0
2,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,4687.0,4764,3615.0,3716.0
3,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3966.0,4479,3615.0,3716.0
4,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3615.0,3825,3615.0,3716.0
5,transcript,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,945.0,5906,945.0,1046.0
6,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,5788.0,5906,945.0,1046.0
7,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4687.0,4800,945.0,1046.0
8,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4338.0,4479,945.0,1046.0
9,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,3966.0,4094,945.0,1046.0


In [429]:
df.loc[df.gene_id == 'ENCODEHG000058844', ['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']]
# df.head()

In [420]:
temp.loc[temp.gene_id == 'ENCODEHG000058844']

Unnamed: 0,gene_id,Feature,Start
0,ENCODEHG000058844,transcript,1930.0


In [395]:
rev[['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']].head(20)

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,Start_end,End_end
29,gene,ENCODEHG000058784,,-,995,4764.0,,
42,transcript,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3665,4814.0,4713.0,4814.0
43,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,4687,4814.0,4713.0,4814.0
44,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3966,4479.0,4713.0,4814.0
45,exon,ENCODEHG000058784,"ENCODEHG000058784[1,1,2]",-,3665,3825.0,4713.0,4814.0
30,transcript,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,995,5956.0,5855.0,5956.0
31,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,5788,5956.0,5855.0,5956.0
32,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4687,4800.0,5855.0,5956.0
33,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,4338,4479.0,5855.0,5956.0
34,exon,ENCODEHG000058784,"ENCODEHG000058784[2,2,1]",-,3966,4094.0,5855.0,5956.0


In [344]:
fwd[['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']].head(20)

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,Start_end,End_end
46,gene,ENCODEHG000058844,,+,1980.0,6780,,
56,transcript,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1930.0,8837,1930.0,2031.0
57,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1930.0,2005,1930.0,2031.0
58,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4003.0,4080,1930.0,2031.0
59,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4568.0,4779,1930.0,2031.0
60,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,6057.0,6333,1930.0,2031.0
61,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,7270.0,7366,1930.0,2031.0
62,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,7872.0,7988,1930.0,2031.0
63,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,8124.0,8207,1930.0,2031.0
64,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,8755.0,8837,1930.0,2031.0


In [311]:
test.head()

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,Start_end,End_end
56,transcript,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1980,8837,1930.0,2031.0
57,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1980,2005,1930.0,2031.0
58,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4003,4080,1930.0,2031.0
59,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4568,4779,1930.0,2031.0
60,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,6057,6333,1930.0,2031.0


In [293]:
temp = tss.df
# temp.loc[(temp.gene_id == 'ENSG00000285976')&(beep.tss == 2)]
temp.loc[(temp.tss == 'ENSG00000285976')&(temp.tss==2)]


Unnamed: 0,Chromosome,Start,End,Strand,gene_id,tss


In [294]:
temp.tss.to_frame().dtypes

tss    int64
dtype: object

In [287]:
m_df.loc[m_df.transcript_id == 'ENSG00000285976[2,3,3]']

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name
106791,ENCODEHT000789200,3,ENSG00000285976_3,ENSG00000285976_2,2,ENSG00000285976_3,3,ENCODEHT000789200,"ENSG00000285976[2,3,3]","AL135905.2[2,3,3]"


In [276]:
temp.loc[temp[['Start', 'End', 'Strand', 'Name', 'gene_id', 'tss']].duplicated(keep=False)]

Unnamed: 0,Chromosome,Start,End,Strand,Name,source,gene_id,tss
18,chr1,30216,30317,+,ENSG00000243485_2,"v39,v29,talon",ENSG00000243485,2
19,chr1,30216,30317,+,ENSG00000243485_2,"v39,talon",ENSG00000243485,2
24,chr1,30315,30416,+,ENSG00000284332_1,"v39,v29",ENSG00000284332,1
25,chr1,30315,30416,+,ENSG00000284332_1,v39,ENSG00000284332,1
492,chr1,10430333,10430485,+,ENSG00000251503_2,"v39,v29,talon",ENSG00000251503,2
...,...,...,...,...,...,...,...,...
156310,chrY,2615296,2615397,-,ENSG00000230542_1,"v39,v29",ENSG00000230542,1
156487,chrY,18404995,18405096,-,ENSG00000226362_1,"v39,v29",ENSG00000226362,1
156488,chrY,18404995,18405096,-,ENSG00000282909_2,"v39,v29",ENSG00000282909,2
156489,chrY,18404995,18405096,-,ENSG00000226362_1,v39,ENSG00000226362,1


In [268]:
test.head(10)

Unnamed: 0,Feature,gene_id,transcript_id,Strand,Start,End,Start_end,End_end
56,transcript,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1980,8837,1930.0,2031.0
57,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,1980,2005,1930.0,2031.0
58,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4003,4080,1930.0,2031.0
59,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,4568,4779,1930.0,2031.0
60,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,6057,6333,1930.0,2031.0
61,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,7270,7366,1930.0,2031.0
62,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,7872,7988,1930.0,2031.0
63,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,8124,8207,1930.0,2031.0
64,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",+,8755,8837,1930.0,2031.0
65,transcript,ENCODEHG000058844,"ENCODEHG000058844[1,2,3]",+,1980,8837,1930.0,2031.0


In [249]:
# test = df.loc[df.gene_id == 'ENCODEHG000058846'].copy(deep=True)
# test = sort_gtf(test)
# # test.sort_values(by=['transcript_id', 'gene_id', 'feature_rank', 'Start'],
# #                 ascending=[True, True, True, True],
# #                 na_position='first')[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start']]

# # ascending = True
# # test.sort_values(by=['transcript_id', 'gene_id', 'feature_rank', 'Start'],
# #                 ascending=[True, True, True, ascending],
# #                 na_position='first')[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start', 'End']]
# test[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start', 'End']]
# # test.loc[test[['transcript_id', 'gene_id']].duplicated(keep=False)]

In [250]:
# test = df.loc[df.gene_id.isin(['ENCODEHG000058844', 'ENCODEHG000058784', 'ENSG00000285631.1'])].copy(deep=True)
# test = sort_gtf(test)
# # test.sort_values(by=['transcript_id', 'gene_id', 'feature_rank', 'Start'],
# #                 ascending=[True, True, True, True],
# #                 na_position='first')[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start']]

# ascending = False
# # test.sort_values(by=['gene_id', 'transcript_id', 'feature_rank', 'Start'],
# #                 ascending=[True, True, True, ascending],
# #                 na_position='first')[['Feature', 'gene_id', 'transcript_id', 'feature_rank', 'Start', 'End']]
# test[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start', 'End']]


In [248]:
gtf.reset_index(inplace=True, drop=True)
print(gtf.loc[gtf.gene_id == 'ENCODEHG000058844', ['Feature', 'gene_id', 'transcript_id', 'feature_rank', 'Start', 'End']])

       Feature            gene_id             transcript_id  feature_rank  \
0         gene  ENCODEHG000058844                       NaN             0   
1   transcript  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             1   
2         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
3         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
4         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
5         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
6         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
7         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
8         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
9         exon  ENCODEHG000058844  ENCODEHG000058844[1,1,3]             2   
10  transcript  ENCODEHG000058844  ENCODEHG000058844[1,2,3]             1   
11        exon  ENCODEHG000058844  ENCODEHG000058844[1,2,3]             2   

In [232]:
# # gtf.loc[gtf.transcript_id.isnull(), ['Feature', 'transcript_id', 'original_transcript_id']]
gtf.reset_index(inplace=True, drop=True)
gtf.head()
gtf[['Feature', 'gene_id', 'transcript_id', 'feature_rank', 'Start', 'End']]

Unnamed: 0,Feature,gene_id,transcript_id,feature_rank,Start,End
0,gene,ENCODEHG000058844,,0,1980,6780
1,transcript,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",1,1980,8837
2,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",2,1980,2005
3,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",2,4003,4080
4,exon,ENCODEHG000058844,"ENCODEHG000058844[1,1,3]",2,4568,4779
...,...,...,...,...,...,...
1216315,exon,ENSG00000285991.1,"ENSG00000285991[1,2,2]",2,149890816,149891034
1216316,exon,ENSG00000285991.1,"ENSG00000285991[1,2,2]",2,149889884,149890145
1216317,exon,ENSG00000285991.1,"ENSG00000285991[1,2,2]",2,149889347,149889623
1216318,exon,ENSG00000285991.1,"ENSG00000285991[1,2,2]",2,149888538,149888667


In [216]:
gtf[['Feature', 'transcript_id', 'gene_id', 'feature_rank', 'Start', 'End']]

Unnamed: 0,Feature,transcript_id,gene_id,feature_rank,Start,End
56,transcript,"ENCODEHG000058844[1,1,3]",ENCODEHG000058844,1,1980,8837
57,exon,"ENCODEHG000058844[1,1,3]",ENCODEHG000058844,2,1980,2005
58,exon,"ENCODEHG000058844[1,1,3]",ENCODEHG000058844,2,4003,4080
59,exon,"ENCODEHG000058844[1,1,3]",ENCODEHG000058844,2,4568,4779
60,exon,"ENCODEHG000058844[1,1,3]",ENCODEHG000058844,2,6057,6333
...,...,...,...,...,...,...
358883,gene,,ENSG00000285967.1,0,36864424,36876207
616385,gene,,ENSG00000285972.1,0,84167227,84172093
378346,gene,,ENSG00000285978.1,0,178694604,178729335
699465,gene,,ENSG00000285980.1,0,131564543,131581145


In [123]:
599084+617236

1216320

In [172]:
gtf.feature_rank = gtf.feature_rank.astype(int)

In [106]:
gtf.loc[gtf.gene_id == 'ENCODEHG000058846']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,tss,tes_id,tes,original_transcript_name,transcript_id,transcript_name,Start_end,End_end,Start_end.1,End_end.1
28,SIRV1,TALON,exon,10647,10791,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,1.0,ENCODEHG000058846_1,1.0,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",10597,10698,11555,11656
29,SIRV1,TALON,exon,10882,11057,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,1.0,ENCODEHG000058846_1,1.0,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",10597,10698,11555,11656
30,SIRV1,TALON,exon,11434,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,1.0,ENCODEHG000058846_1,1.0,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",10597,10698,11555,11656
31,SIRV1,TALON,transcript,10647,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,1.0,ENCODEHG000058846_1,1.0,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",10597,10698,11555,11656


In [None]:
if update_ends:
    df = update_ends(df)

In [94]:
temp = df.loc[df.Feature =='transcript']
b = temp.loc[temp.transcript_id.duplicated(keep=False)]
b

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,intergenic_novel,antisense_gene,gene_antisense_to_IDs,original_transcript_id,ic_id,tss_id,tes_id,original_transcript_name,transcript_id,transcript_name
698,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,ENST00000647651.1,ENSG00000176022_1,ENSG00000176022_1,ENSG00000176022_1,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
700,chr1,HAVANA,transcript,1232258,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,ENST00000379198.4,ENSG00000176022_1,ENSG00000176022_1,ENSG00000176022_1,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
380352,chr6,HAVANA,transcript,1609971,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,,,,ENST00000380874.3,ENSG00000054598_1,ENSG00000054598_1,ENSG00000054598_1,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
380354,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,,,,ENST00000645831.1,ENSG00000054598_1,ENSG00000054598_1,ENSG00000054598_1,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
1189029,chrX,HAVANA,transcript,83508260,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,,,,ENST00000373200.5,ENSG00000196767_1,ENSG00000196767_1,ENSG00000196767_1,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"
1189031,chrX,HAVANA,transcript,83508292,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,,,,ENST00000644024.1,ENSG00000196767_1,ENSG00000196767_1,ENSG00000196767_1,POU3F4-202,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"


In [93]:
len(df.index)

1216320

In [84]:
def map_gtf_transcripts(gtf, m_df):
    """
    Add cerberus transcript id to each entry in gtf
    """
    gtf = df.copy(deep=True)
    gtf = gtf.merge(m_df, how='left', 
                    left_on=['transcript_name', 'transcript_id'],
                    right_on=['original_transcript_name', 'original_transcript_id'],
                    suffixes=('', '_cerberus'))

    gtf.drop(['transcript_id', 'transcript_name'], axis=1, inplace=True)
    gtf.rename({'transcript_id_cerberus': 'transcript_id',
                'transcript_name_cerberus': 'transcript_name'},
               axis=1, inplace=True)
    
    return gtf

In [77]:
# df.loc[df.transcript_id == 'ENST00000469289.1', ['gene_id', 'gene_name', 'transcript_name', 'transcript_id']]

In [76]:
# m_df.loc[m_df.original_transcript_id == 'ENST00000469289.1', ['gene_id', 'gene_name', 'original_transcript_name', 'original_transcript_id']]

In [75]:
# df.loc[df.transcript_id == 'ENST00000469289.1']

In [74]:
# m_df.loc[m_df.original_transcript_id == 'ENST00000469289.1']

In [82]:
gtf.head()
temp = gtf.loc[gtf.Feature =='transcript']
b = temp.loc[temp.transcript_id.duplicated(keep=False)]
# print(len(b.index))
# b.transcript_id.head()
b

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,intergenic_novel,antisense_gene,gene_antisense_to_IDs,original_transcript_id,ic_id,tss_id,tes_id,original_transcript_name,transcript_id,transcript_name
698,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,ENST00000647651.1,ENSG00000176022_1,ENSG00000176022_1,ENSG00000176022_1,B3GALT6-202,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
700,chr1,HAVANA,transcript,1232258,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,,,,ENST00000379198.4,ENSG00000176022_1,ENSG00000176022_1,ENSG00000176022_1,B3GALT6-201,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]"
380352,chr6,HAVANA,transcript,1609971,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,,,,ENST00000380874.3,ENSG00000054598_1,ENSG00000054598_1,ENSG00000054598_1,FOXC1-201,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
380354,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,,,,ENST00000645831.1,ENSG00000054598_1,ENSG00000054598_1,ENSG00000054598_1,FOXC1-202,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]"
1189029,chrX,HAVANA,transcript,83508260,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,,,,ENST00000373200.5,ENSG00000196767_1,ENSG00000196767_1,ENSG00000196767_1,POU3F4-201,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"
1189031,chrX,HAVANA,transcript,83508292,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,,,,ENST00000644024.1,ENSG00000196767_1,ENSG00000196767_1,ENSG00000196767_1,POU3F4-202,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]"


In [61]:
d = gtf.loc[~gtf.transcript_id_cerberus.isnull()]
d.loc[d.gene_id.str.contains('ENSG')]
# gtf.loc[gtf.transcript_id_cerberus == 'ENSG00000243485[3,2,1]']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,intergenic_novel,antisense_gene,gene_antisense_to_IDs,original_transcript_id,ic_id,tss_id,tes_id,original_transcript_name,transcript_id_cerberus,transcript_name_cerberus


In [47]:
m_df.loc[m_df.original_transcript_id == 'ENST00000469289.1']

Unnamed: 0,original_transcript_id,ic_id,tss_id,tes_id,gene_id,gene_name,original_transcript_name,transcript_id,transcript_name
4588,ENST00000469289.1,ENSG00000243485_2,ENSG00000243485_3,ENSG00000243485_1,ENSG00000243485,MIR1302-2HG,MIR1302-2HG-201,"ENSG00000243485[3,2,1]","MIR1302-2HG[3,2,1]"


In [19]:
m_df.head()

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name
0,ENCODEHT000206942,1,ENCODEHG000058846_1,ENCODEHG000058846_1,1,ENCODEHG000058846_1,1,ENCODEHG000058846,ENCODEHG000058846,ENCODEHT000206942,"[1,1,1]","ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
1,ENCODEHT000206867,4,ENCODEHG000058837_4,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206867,"[2,4,1]","ENCODEHG000058837[2,4,1]","ENCODEHG000058837[2,4,1]"
2,ENCODEHT000206868,2,ENCODEHG000058837_2,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206868,"[2,2,1]","ENCODEHG000058837[2,2,1]","ENCODEHG000058837[2,2,1]"
3,ENCODEHT000206870,3,ENCODEHG000058837_3,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206870,"[2,3,1]","ENCODEHG000058837[2,3,1]","ENCODEHG000058837[2,3,1]"
4,ENCODEHT000206886,1,ENCODEHG000058837_1,ENCODEHG000058837_1,1,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206886,"[1,1,1]","ENCODEHG000058837[1,1,1]","ENCODEHG000058837[1,1,1]"


In [20]:
df.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,tag,transcript_type,transcript_support_level,havana_transcript,NNC_transcript,protein_id,ccdsid,intergenic_novel,antisense_gene,gene_antisense_to_IDs
0,SIRV1,TALON,gene,10647,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,True,,
1,SIRV1,TALON,transcript,10647,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,True,,,,,
2,SIRV1,TALON,exon,10647,10791,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,
3,SIRV1,TALON,exon,10882,11057,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,
4,SIRV1,TALON,exon,11434,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,


In [8]:
t_df = df.loc[df.Feature == 'transcript']
keep_cols = ['Chromosome', 'Source', 'Feature',
             'Start', 'End', 'Score', 'Strand',
             'Frame', 'gene_id', 'gene_name', 

In [13]:
t_df.level.unique().tolist()[:10]

[nan, '2', '1', '3']

In [17]:
t_df.transcript_support_level.unique().tolist()[:5]

[nan, '5', 'NA', '1', '4']

In [9]:
# everything related to gene groupby
# drop everything that we don't care about
# for other we take set of unique values
t_df.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'gene_status', 'gene_type',
       'talon_gene', 'havana_gene', 'level', 'transcript_id',
       'transcript_status', 'transcript_name', 'talon_transcript',
       'NIC_transcript', 'exon_number', 'exon_id', 'talon_exon', 'exon_status',
       'ont', 'source', 'tag', 'transcript_type', 'transcript_support_level',
       'havana_transcript', 'NNC_transcript', 'protein_id', 'ccdsid',
       'intergenic_novel', 'antisense_gene', 'gene_antisense_to_IDs'],
      dtype='object')

In [18]:
';'.join(set(['a','a', 'b']))

'b;a'

In [23]:
test = map_gtf_transcripts(df, m_df)

In [24]:
test.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'gene_status', 'gene_type',
       'talon_gene', 'havana_gene', 'level', 'transcript_status',
       'talon_transcript', 'NIC_transcript', 'exon_number', 'exon_id',
       'talon_exon', 'exon_status', 'ont', 'source', 'tag', 'transcript_type',
       'transcript_support_level', 'havana_transcript', 'NNC_transcript',
       'protein_id', 'ccdsid', 'intergenic_novel', 'antisense_gene',
       'gene_antisense_to_IDs', 'transcript_id', 'transcript_name',
       'original_transcript_id', 'original_transcript_name',
       'original_transcript_id_merge'],
      dtype='object')

In [25]:
test[['Chromosome', 'Feature', 'Start', 'End', 'Strand', 
      'gene_id', 'gene_name', 'transcript_id', 'transcript_name', 
      'original_transcript_id', 'original_transcript_name']].tail()

# how to handle transcript novelty????

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,transcript_name,original_transcript_id,original_transcript_name
1181901,chrY,exon,25393153,25394056,-,ENSG00000233944.1,LINC00265-3P,"ENSG00000233944[1,1,1]","LINC00265-3P[1,1,1]",ENST00000447588.1,LINC00265-3P-201
1181902,chrY,transcript,16379178,16387459,-,ENCODEHG000377496,ENCODEHG000377496,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]",ENCODEHT005023774,ENCODEHT005023774
1181903,chrY,exon,16387141,16387459,-,ENCODEHG000377496,ENCODEHG000377496,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]",ENCODEHT005023774,ENCODEHT005023774
1181904,chrY,exon,16383092,16383318,-,ENCODEHG000377496,ENCODEHG000377496,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]",ENCODEHT005023774,ENCODEHT005023774
1181905,chrY,exon,16379178,16381894,-,ENCODEHG000377496,ENCODEHG000377496,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]",ENCODEHT005023774,ENCODEHT005023774


In [182]:
test[['original_transcript_id', 'original_transcript_name', 'transcript_id', 'transcript_name']]

Unnamed: 0,original_transcript_id,original_transcript_name,transcript_id,transcript_name
0,ENCODEHT000206942,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
1,ENCODEHT000206942,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
2,ENCODEHT000206942,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
3,ENCODEHT000206942,ENCODEHT000206942,"ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
4,ENCODEHT000206867,ENCODEHT000206867,"ENCODEHG000058837[2,4,1]","ENCODEHG000058837[2,4,1]"
...,...,...,...,...
1181901,ENST00000447588.1,LINC00265-3P-201,"ENSG00000233944[1,1,1]","LINC00265-3P[1,1,1]"
1181902,ENCODEHT005023774,ENCODEHT005023774,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]"
1181903,ENCODEHT005023774,ENCODEHT005023774,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]"
1181904,ENCODEHT005023774,ENCODEHT005023774,"ENCODEHG000377496[1,1,1]","ENCODEHG000377496[1,1,1]"


In [22]:
def map_gtf_transcripts(df, m_df):
    """
    Add cerberus transcript id to each entry in gtf
    """
    # groupby transcripts that are the same
    gb_cols = ['gene_name', 'gene_id', 'transcript_triplet',
               'transcript_id', 'transcript_name']
    temp = m_df[['transcript_id',
                 'original_transcript_id',
                 'original_transcript_name']].copy(deep=True)
    m_df = m_df.groupby(gb_cols).agg({'original_transcript_id': ','.join,
                                      'original_transcript_name': ','.join}).reset_index()
    m_df = m_df.merge(temp, on='transcript_id', suffixes=('','_merge'))
    m_df.drop(['gene_name', 'gene_id', 'transcript_triplet'],
              axis=1, inplace=True)

    # add new transcript ids
    df = df.merge(m_df, left_on=['transcript_id', 'transcript_name'],
                  right_on=['original_transcript_id_merge',
                            'original_transcript_name_merge'],
                 suffixes=('_x', ''))

    # drop old tids
    df.drop(['transcript_id_x', 'transcript_name_x',
             'original_transcript_name_merge'],
            axis=1, inplace=True)
    
    return df

In [208]:
# tests for map_gtf_ids
# - transcript that does not have duplicate
# - transcript that does have a duplicate

def make_exon_df(n,c,e,s,g,t):
    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Start'] = [i[0] for i in e]
    df['End'] = [i[1] for i in e]
    df['Strand'] = s
    df['Feature'] = 'exon'
    cols = ['gene_name', 'gene_id']
    for c in cols:
        df[c] = g
    cols = ['transcript_id', 'transcript_name']
    for c in cols:
        df[c] = t

    # reorder exons and starts/ stops if needed
    df['new_Start'] = df[['Start', 'End']].min(axis=1)
    df['new_End'] = df[['Start', 'End']].max(axis=1)
    df.drop(['Start', 'End'], axis=1, inplace=True)
    df.rename({'new_Start':'Start',
               'new_End':'End'}, axis=1, inplace=True)
    s = s[0]
    if s == '+':
        ascending = True
    elif s == '-':
        ascending = False
    df.sort_values(by='Start', ascending=ascending, inplace=True)
    return df

def make_hier_entry(df, how='t'):
    """
    kind {'g','t'}
    """
    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}
    t_df = df.copy(deep=True)
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name']
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
        
    cols = gb_cols + ['min_coord', 'max_coord']
    t_df = t_df[cols]
    t_df = t_df.groupby(gb_cols).agg(agg_dict).reset_index()
    t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1, inplace=True)
    if how == 't':
        t_df['Feature'] = 'transcript'
    elif how == 'g':
        t_df['Feature'] = 'gene'
        
    return t_df

def make_test_gtf(ts):
    df = pd.concat(ts)
        # make transcript entries
    t_df = make_hier_entry(df, how='t')
    # make gene entries
    g_df = make_hier_entry(df, how='g')

    # concat everything and sort by gene id, transcript id, feature rank (gene =0, t =1, exon=2), then start coords
    df = pd.concat([df, t_df, g_df])
    df['feature_rank'] = df.Feature.map({'gene':0, 'transcript':1, 'exon':2})

    rev = df.loc[df.Strand == '-'].copy(deep=True)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)

    df = pd.DataFrame()
    for temp in [fwd, rev]:
        if len(temp.index) > 0:
            strand = temp.Strand.values.tolist()[0]
            if strand == '+':
                ascending = True
            elif strand == '-':
                ascending = False
            temp.sort_values(by=['gene_id', 'transcript_id', 'feature_rank', 'Start'], 
                             ascending=[True, True, True, ascending],
                             na_position='first', inplace=True)
            
            df = pd.concat([df, temp])
    # df.drop('feature_rank', axis=1, inplace=True)
    return df

ts = []
# t1 - transcript that doesn't need to be merged
n = 3
c = ['1' for i in range(n)]
e = [[1,10], [14,20], [25,30]]
s = ['+' for i in range(n)]
g = 'g1'
t = 'g1_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t2 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[90,60], [45,30], [10,8]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t3 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[95,60], [45,30], [10,6]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t2'
ts.append(make_exon_df(n,c,e,s,g,t))

test_df = make_test_gtf(ts)

# map file
otid = ['g1_t1', 'g2_t1', 'g2_t2',]
otname = [o+'n' for o in otid]
tid = ['g1[1,1,1]', 'g2[1,1,1]', 'g2[1,1,1]']
tname = [o.split('[')[0]+'n['+o.split('[')[1] for o in tid]
gid = ['g1', 'g2', 'g2']
gname = ['g1n', 'g2n', 'g2n']
m_df = pd.DataFrame()
m_df['original_transcript_id'] = otid
m_df['original_transcript_name'] = otname
m_df['transcript_id'] = tid
m_df['transcript_name'] = tname
m_df['gene_name'] = gname
m_df['transcript_triplet'] = m_df.transcript_id.str.slice(2)
m_df['gene_id'] = gid

# control
otid = ['g1_t1', 'g2_t2']

In [209]:
m_df

Unnamed: 0,original_transcript_id,original_transcript_name,transcript_id,transcript_name,gene_name,transcript_triplet,gene_id
0,g1_t1,g1_t1n,"g1[1,1,1]","g1n[1,1,1]",g1n,"[1,1,1]",g1
1,g2_t1,g2_t1n,"g2[1,1,1]","g2n[1,1,1]",g2n,"[1,1,1]",g2
2,g2_t2,g2_t2n,"g2[1,1,1]","g2n[1,1,1]",g2n,"[1,1,1]",g2


In [210]:
test_df

Unnamed: 0,Chromosome,Strand,Feature,gene_name,gene_id,transcript_id,transcript_name,Start,End,feature_rank
0,1,+,gene,g1,g1,,,1,30,0
0,1,+,transcript,g1,g1,g1_t1,g1_t1,1,30,1
0,1,+,exon,g1,g1,g1_t1,g1_t1,1,10,2
1,1,+,exon,g1,g1,g1_t1,g1_t1,14,20,2
2,1,+,exon,g1,g1,g1_t1,g1_t1,25,30,2
1,1,-,gene,g2,g2,,,6,95,0
1,1,-,transcript,g2,g2,g2_t1,g2_t1,8,90,1
0,1,-,exon,g2,g2,g2_t1,g2_t1,60,90,2
1,1,-,exon,g2,g2,g2_t1,g2_t1,30,45,2
2,1,-,exon,g2,g2,g2_t1,g2_t1,8,10,2


In [163]:
test_df.dtypes

Chromosome         object
Strand             object
Feature            object
gene_name          object
gene_id            object
transcript_id      object
transcript_name    object
Start               int64
End                 int64
feature_rank        int64
dtype: object

In [None]:
# # tests for update_gtf_ends()
# def make_gtf(c,e,s,g,t):


# # t1 - transcript that doesn't need to be merged
# n = 3
# c = ['1' for i in range(n)]
# e [[1,10], [14,20], [25,30]]
# s = ['+' for i in range(n)]
# g = 'g1'
# t = 'g1_t1'

# # t2 - rev. strand transcript that needs ends to be updated
# n = 3
# c = ['1' for i in range(n)]
# e = [[90,60], [45,30], [10,8]]
# s = ['-' for i in range(n)]
# g = 'g2'
# t = 'g1_t1'

# df = pd.DataFrame()
# df['Chromosome'] = c
# df['Start'] = [i[0] for i in e]
# df['End'] = [i[1] for i in e]
# df['Strand'] = s
# cols = ['gene_name', 'gene_id']
# for c in cols:
#     df[c] = g
# cols = []

