In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *


In [None]:
def replace_gtf_ids(gtf, h5, agg):
    """
    Replace transcript ids and names in a gtf with the triplets
    calculated from assign_triplets

    Parameters:
        gtf (str): Path to gtf file
        h5 (str): Path to h5 annotation (output from assign)
        agg (bool): Whether or not to collapse transcripts with
            duplicate triplets

    Returns:
        df (pyranges PyRanges): PyRanges gtf table with updated ids
    """

    # keep only the gene, exon, and transcript entries
    df = pr.read_gtf(gtf).df
    entry_types = ['gene', 'transcript', 'exon']
    df = df.loc[df.Feature.isin(entry_types)]

    # only the 4th item returned by read_h5 (the transcript id map) is used
    # here; the function takes no `update_ends` argument, so there is nothing
    # to branch on and no module-level variable is consulted
    _, _, _, m_df = read_h5(h5)

    # groupby transcripts that are the same; original ids / names that share
    # a new transcript id are comma-joined, then the per-original rows are
    # merged back on so that each original id maps to its new id
    gb_cols = ['gene_name', 'gene_id', 'transcript_triplet',
               'transcript_id', 'transcript_name']
    temp = m_df[['transcript_id',
                 'original_transcript_id',
                 'original_transcript_name']].copy(deep=True)
    m_df = m_df.groupby(gb_cols).agg({'original_transcript_id': ','.join,
                                      'original_transcript_name': ','.join}).reset_index()
    m_df = m_df.merge(temp, on='transcript_id', suffixes=('','_merge'))
    m_df = m_df.drop(['gene_name', 'gene_id', 'transcript_triplet'], axis=1)

    # add new transcript ids; this is pandas' default (inner) merge, so gtf
    # rows whose transcript id / name pair is absent from the map are not kept
    df = df.merge(m_df, left_on=['transcript_id', 'transcript_name'],
                  right_on=['original_transcript_id_merge',
                            'original_transcript_name_merge'],
                 suffixes=('_x', ''))

    # drop old tids
    df = df.drop(['transcript_id_x', 'transcript_name_x',
                  'original_transcript_name_merge'], axis=1)

    # remove duplicated transcripts; just keeping the first one
    if agg:
        temp = df[['transcript_id', 'original_transcript_id_merge']].drop_duplicates()
        dupe_old_tids = temp.loc[temp.transcript_id.duplicated(keep='first'), 'original_transcript_id_merge']
        df = df.loc[~df.original_transcript_id_merge.isin(dupe_old_tids)]

    # drop last column (reassign rather than drop in place, since df may be
    # a filtered slice at this point)
    df = df.drop('original_transcript_id_merge', axis=1)

    df = pr.PyRanges(df)

    return df

In [None]:
def replace_gtf_ids(gtf, h5, agg, update_ends, o):
    """
    Swap the transcript ids and names of a GTF for the triplets
    calculated from convert_transcriptome

    Parameters:
        gtf (str): Path to the GTF file that should be updated
        h5 (str): Path to the cerberus reference holding the tid map
        agg (bool): Collapse / aggregate transcripts that share the
            same triplet
        update_ends (bool): Replace transcript ends with the ends
            recorded in the h5 annotation
        o (str): Name of the output file

    Returns:
        None. The function currently has no body beyond this docstring.
    """
    # placeholder: the implementation has not been written yet
    return None

## modify / collapse transcripts in TALON abundance file

In [50]:
# Inputs used by the cells below. Both paths are absolute paths on one
# machine and must be edited before running elsewhere.
# h5: cerberus h5 file, read with read_h5 in the next cell
h5 = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/human_cerberus.h5'
# gtf: TALON GTF, read with pr.read_gtf in the next cell
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
# agg: collapse transcripts that end up with the same triplet
agg = True
# update_ends: when True the next cell also keeps the tss / tes tables from the h5
update_ends = True

# NOTE(review): the most recent definition of replace_gtf_ids in this notebook
# takes a fifth argument `o` (output file name), which this call does not pass
# replace_gtf_ids(gtf, h5, agg, update_ends)

In [51]:
# read the GTF as a pandas table and keep only the gene, transcript,
# and exon entries
entry_types = ['gene', 'transcript', 'exon']
df = pr.read_gtf(gtf).df
df = df.loc[df.Feature.isin(entry_types)]

# the transcript id map (4th item from read_h5) is always needed; the
# tss / tes tables are only bound to names when ends are to be updated
if update_ends:
    _, tss, tes, m_df = read_h5(h5)
else:
    _, _, _, m_df = read_h5(h5)


In [20]:
# preview the first rows of the transcript id map returned by read_h5
m_df.head()

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name
0,ENCODEHT000206942,1,ENCODEHG000058846_1,ENCODEHG000058846_1,1,ENCODEHG000058846_1,1,ENCODEHG000058846,ENCODEHG000058846,ENCODEHT000206942,"[1,1,1]","ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]"
1,ENCODEHT000206867,4,ENCODEHG000058837_4,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206867,"[2,4,1]","ENCODEHG000058837[2,4,1]","ENCODEHG000058837[2,4,1]"
2,ENCODEHT000206868,2,ENCODEHG000058837_2,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206868,"[2,2,1]","ENCODEHG000058837[2,2,1]","ENCODEHG000058837[2,2,1]"
3,ENCODEHT000206870,3,ENCODEHG000058837_3,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206870,"[2,3,1]","ENCODEHG000058837[2,3,1]","ENCODEHG000058837[2,3,1]"
4,ENCODEHT000206886,1,ENCODEHG000058837_1,ENCODEHG000058837_1,1,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206886,"[1,1,1]","ENCODEHG000058837[1,1,1]","ENCODEHG000058837[1,1,1]"


In [21]:
# preview the first rows of the GTF table after filtering to gene / transcript / exon entries
df.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,tag,transcript_type,transcript_support_level,havana_transcript,NNC_transcript,protein_id,ccdsid,intergenic_novel,antisense_gene,gene_antisense_to_IDs
0,SIRV1,TALON,gene,10647,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,True,,
1,SIRV1,TALON,transcript,10647,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,True,,,,,
2,SIRV1,TALON,exon,10647,10791,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,
3,SIRV1,TALON,exon,10882,11057,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,
4,SIRV1,TALON,exon,11434,11606,.,+,.,ENCODEHG000058846,ENCODEHG000058846,...,,,,,,,,,,


In [52]:
def map_gtf_transcripts(df, m_df):
    """
    Attach the cerberus transcript id and name to every gtf entry

    Parameters:
        df (pandas DataFrame): GTF table with transcript_id and
            transcript_name columns holding the original ids / names
        m_df (pandas DataFrame): Transcript id map with the columns
            gene_name, gene_id, transcript_triplet, transcript_id,
            transcript_name, original_transcript_id, and
            original_transcript_name

    Returns:
        df (pandas DataFrame): New table (inputs are not modified) in
            which transcript_id / transcript_name hold the new values,
            original_transcript_id / original_transcript_name hold the
            comma-joined originals that share that new id, and
            original_transcript_id_merge holds the single original id
            of the row. Rows of the GTF with no match in the map are
            not kept, because the merge is pandas' default inner merge.
    """
    group_keys = ['gene_name', 'gene_id', 'transcript_triplet',
                  'transcript_id', 'transcript_name']
    join_ids = {'original_transcript_id': ','.join,
                'original_transcript_name': ','.join}

    # one row per original transcript, used to get back from the
    # collapsed table to the individual original ids
    per_original = m_df[['transcript_id',
                         'original_transcript_id',
                         'original_transcript_name']].copy(deep=True)

    # one row per new transcript id with its originals comma-joined,
    # then fanned back out to one row per original transcript
    tid_map = (m_df.groupby(group_keys)
                   .agg(join_ids)
                   .reset_index()
                   .merge(per_original, on='transcript_id',
                          suffixes=('', '_merge'))
                   .drop(columns=['gene_name', 'gene_id',
                                  'transcript_triplet']))

    # match each gtf row to the map through its original id and name;
    # the gtf's own id / name columns pick up the '_x' suffix
    relabeled = df.merge(tid_map,
                         left_on=['transcript_id', 'transcript_name'],
                         right_on=['original_transcript_id_merge',
                                   'original_transcript_name_merge'],
                         suffixes=('_x', ''))

    # discard the old ids / names and the redundant name key
    return relabeled.drop(columns=['transcript_id_x', 'transcript_name_x',
                                   'original_transcript_name_merge'])

In [53]:
# apply the id mapping to the filtered GTF table using the map read from the h5
test = map_gtf_transcripts(df, m_df)

In [56]:
# transcript-level rows whose original_transcript_id contains a comma, i.e.
# where more than one original id was joined under the same new transcript id
test.loc[(test.original_transcript_id.str.contains(','))&(test.Feature == 'transcript')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,protein_id,ccdsid,intergenic_novel,antisense_gene,gene_antisense_to_IDs,transcript_id,transcript_name,original_transcript_id,original_transcript_name,original_transcript_id_merge
663,chr1,HAVANA,transcript,1232225,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENSP00000496787.1,,,,,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]","ENST00000379198.4,ENST00000647651.1","B3GALT6-201,B3GALT6-202",ENST00000647651.1
665,chr1,HAVANA,transcript,1232258,1235041,.,+,.,ENSG00000176022.5,B3GALT6,...,ENSP00000368496.2,CCDS13.1,,,,"ENSG00000176022[1,1,1]","B3GALT6[1,1,1]","ENST00000379198.4,ENST00000647651.1","B3GALT6-201,B3GALT6-202",ENST00000379198.4
369779,chr6,HAVANA,transcript,1609971,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENSP00000370256.2,CCDS4473.1,,,,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]","ENST00000380874.3,ENST00000645831.1","FOXC1-201,FOXC1-202",ENST00000380874.3
369781,chr6,HAVANA,transcript,1610066,1613897,.,+,.,ENSG00000054598.7,FOXC1,...,ENSP00000493906.1,CCDS4473.1,,,,"ENSG00000054598[1,1,1]","FOXC1[1,1,1]","ENST00000380874.3,ENST00000645831.1","FOXC1-201,FOXC1-202",ENST00000645831.1
1155612,chrX,HAVANA,transcript,83508260,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,ENSP00000362296.2,CCDS14450.1,,,,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]","ENST00000373200.5,ENST00000644024.1","POU3F4-201,POU3F4-202",ENST00000373200.5
1155614,chrX,HAVANA,transcript,83508292,83512127,.,+,.,ENSG00000196767.7,POU3F4,...,ENSP00000495996.1,CCDS14450.1,,,,"ENSG00000196767[1,1,1]","POU3F4[1,1,1]","ENST00000373200.5,ENST00000644024.1","POU3F4-201,POU3F4-202",ENST00000644024.1


In [58]:
# same selection as the previous cell, showing only the comma-joined original ids
test.loc[(test.original_transcript_id.str.contains(','))&(test.Feature == 'transcript'), 'original_transcript_id']

663        ENST00000379198.4,ENST00000647651.1
665        ENST00000379198.4,ENST00000647651.1
369779     ENST00000380874.3,ENST00000645831.1
369781     ENST00000380874.3,ENST00000645831.1
1155612    ENST00000373200.5,ENST00000644024.1
1155614    ENST00000373200.5,ENST00000644024.1
Name: original_transcript_id, dtype: object

In [57]:
# preview the new transcript ids assigned to the first GTF rows
test.transcript_id.head()

0    ENCODEHG000058846[1,1,1]
1    ENCODEHG000058846[1,1,1]
2    ENCODEHG000058846[1,1,1]
3    ENCODEHG000058846[1,1,1]
4    ENCODEHG000058837[2,4,1]
Name: transcript_id, dtype: object

In [136]:
# tests for map_gtf_transcripts
# - transcript that does not have duplicate
# - transcript that does have a duplicate

def make_exon_df(n,c,e,s,g,t):
    """
    Build the exon entries of one test transcript

    Parameters:
        n (int): Number of exons; accepted for the callers' convenience
            but not used in the body
        c (list of str): Chromosome of each exon
        e (list of [int, int]): Coordinate pair of each exon; the two
            coordinates of a pair may be given in either order
        s (list of str): Strand of each exon; the first element decides
            the sort direction
        g (str): Value written to both gene_name and gene_id
        t (str): Value written to both transcript_id and transcript_name

    Returns:
        df (pandas DataFrame): One row per exon with Start <= End,
            sorted by Start ascending for '+' and descending for '-'

    Raises:
        ValueError: If the first strand is neither '+' nor '-'
    """
    # validate up front; previously an unknown strand surfaced later as an
    # unbound local variable
    strand = s[0]
    if strand not in ('+', '-'):
        raise ValueError("strand must be '+' or '-', got {!r}".format(strand))

    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Start'] = [i[0] for i in e]
    df['End'] = [i[1] for i in e]
    df['Strand'] = s
    df['Feature'] = 'exon'
    # loop variable is named `col` so that it does not shadow the
    # chromosome argument `c`
    for col in ['gene_name', 'gene_id']:
        df[col] = g
    for col in ['transcript_id', 'transcript_name']:
        df[col] = t

    # reorder exons and starts/ stops if needed; the coordinate columns
    # are rebuilt so that Start is always the smaller value
    df['new_Start'] = df[['Start', 'End']].min(axis=1)
    df['new_End'] = df[['Start', 'End']].max(axis=1)
    df = df.drop(['Start', 'End'], axis=1)
    df = df.rename({'new_Start':'Start',
                    'new_End':'End'}, axis=1)

    df = df.sort_values(by='Start', ascending=(strand == '+'))
    return df

def make_hier_entry(df, how='t'):
    """
    Build transcript- or gene-level entries spanning a set of entries

    Parameters:
        df (pandas DataFrame): Table with Chromosome, Strand, gene_name,
            gene_id, Start, and End columns, plus transcript_id and
            transcript_name when how='t'
        how (str): {'g','t'} 'g' to make one entry per gene, 't' to
            make one entry per transcript

    Returns:
        t_df (pandas DataFrame): One row per group with Start set to
            the smallest and End to the largest coordinate of the group,
            and Feature set to 'gene' or 'transcript'. The input table
            is not modified.

    Raises:
        ValueError: If how is neither 'g' nor 't'
    """
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name']
        feature = 'transcript'
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
        feature = 'gene'
    else:
        # previously fell through and failed on an unbound local variable
        raise ValueError("how must be 'g' or 't', got {!r}".format(how))

    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}

    # work on a copy so the helper columns never leak into the caller's table
    t_df = df.copy(deep=True)
    # take min / max across both coordinate columns so the result does not
    # depend on which of Start / End holds the smaller value
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)

    t_df = t_df[gb_cols + ['min_coord', 'max_coord']]
    t_df = t_df.groupby(gb_cols).agg(agg_dict).reset_index()
    t_df = t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1)
    t_df['Feature'] = feature

    return t_df

def make_test_gtf(ts):
    """
    Assemble a GTF-like test table from per-transcript exon tables

    Parameters:
        ts (list of pandas DataFrame): Exon tables, one per transcript,
            as produced by make_exon_df

    Returns:
        df (pandas DataFrame): Exon rows plus the derived transcript and
            gene rows, with '+' strand entries first and '-' strand
            entries after them. A feature_rank column (gene=0,
            transcript=1, exon=2) is kept in the output. Rows whose
            Strand is neither '+' nor '-' are not included. An empty
            DataFrame is returned when there are no '+' or '-' rows.
    """
    df = pd.concat(ts)
    # make transcript entries
    t_df = make_hier_entry(df, how='t')
    # make gene entries
    g_df = make_hier_entry(df, how='g')

    # concat everything and rank the features (gene=0, transcript=1, exon=2)
    df = pd.concat([df, t_df, g_df])
    df['feature_rank'] = df.Feature.map({'gene':0, 'transcript':1, 'exon':2})

    # sort each strand separately: by transcript id, gene id, feature rank,
    # then Start (ascending on '+', descending on '-'). Gene rows have no
    # transcript id and are placed first via na_position='first'.
    # NOTE(review): because transcript_id is the leading key, gene rows of
    # every gene on a strand come before all transcripts of that strand;
    # confirm this is the intended order when a strand holds several genes.
    sorted_parts = []
    for strand, ascending in (('+', True), ('-', False)):
        temp = df.loc[df.Strand == strand].copy(deep=True)
        if len(temp.index) > 0:
            temp.sort_values(by=['transcript_id', 'gene_id', 'feature_rank', 'Start'],
                             ascending=[True, True, True, ascending],
                             na_position='first', inplace=True)
            sorted_parts.append(temp)

    # concatenate once instead of growing an initially empty frame inside
    # the loop, which relies on pandas ignoring empty frames when it
    # determines the result dtypes
    if not sorted_parts:
        return pd.DataFrame()
    return pd.concat(sorted_parts)

ts = []

# one (exon coordinate pairs, strand, gene, transcript) entry per test
# transcript; every transcript is placed on chromosome '1'
transcript_specs = [
    # t1 - transcript that doesn't need to be merged
    ([[1,10], [14,20], [25,30]], '+', 'g1', 'g1_t1'),
    # t2 - rev. strand transcript that needs to be merged
    ([[90,60], [45,30], [10,8]], '-', 'g2', 'g2_t1'),
    # t3 - rev. strand transcript that needs to be merged
    ([[95,60], [45,30], [10,6]], '-', 'g2', 'g2_t2'),
]

for e, strand, g, t in transcript_specs:
    n = len(e)
    c = ['1' for i in range(n)]
    s = [strand for i in range(n)]
    ts.append(make_exon_df(n,c,e,s,g,t))

df = make_test_gtf(ts)

# # make transcript entries
# t_df = make_hier_entry(df, how='t')
# # make gene entries
# g_df = make_hier_entry(df, how='g')

# # concat everything and sort by gene id, transcript id, feature rank (gene =0, t =1, exon=2), then start coords
# df = pd.concat([df, t_df, g_df])
# df['feature_rank'] = df.Feature.map({'gene':0, 'transcript':1, 'exon':2})

# rev = df.loc[df.Strand == '-'].copy(deep=True)
# fwd = df.loc[df.Strand == '+'].copy(deep=True)

# df = pd.DataFrame()
# for temp in [fwd, rev]:
#     if len(temp.index) > 0:
#         strand = temp.Strand.values.tolist()[0]
#         if strand == '+':
#             ascending = True
#         elif strand == '-':
#             ascending = False
#         temp.sort_values(by=['feature_rank', 'gene_id', 'transcript_id', 'Start'],
#                        ascending=[True, True, True, ascending], inplace=True)
#         df = pd.concat([df, temp])

In [137]:
# display the assembled test GTF table returned by make_test_gtf
df

Unnamed: 0,Chromosome,Strand,Feature,gene_name,gene_id,transcript_id,transcript_name,Start,End,feature_rank
0,1,+,gene,g1,g1,,,1,30,0
0,1,+,transcript,g1,g1,g1_t1,g1_t1,1,30,1
0,1,+,exon,g1,g1,g1_t1,g1_t1,1,10,2
1,1,+,exon,g1,g1,g1_t1,g1_t1,14,20,2
2,1,+,exon,g1,g1,g1_t1,g1_t1,25,30,2
1,1,-,gene,g2,g2,,,6,95,0
1,1,-,transcript,g2,g2,g2_t1,g2_t1,8,90,1
0,1,-,exon,g2,g2,g2_t1,g2_t1,60,90,2
1,1,-,exon,g2,g2,g2_t1,g2_t1,30,45,2
2,1,-,exon,g2,g2,g2_t1,g2_t1,8,10,2


In [None]:
# # tests for update_gtf_ends()
# def make_gtf(c,e,s,g,t):


# # t1 - transcript that doesn't need to be merged
# n = 3
# c = ['1' for i in range(n)]
# e = [[1,10], [14,20], [25,30]]
# s = ['+' for i in range(n)]
# g = 'g1'
# t = 'g1_t1'

# # t2 - rev. strand transcript that needs ends to be updated
# n = 3
# c = ['1' for i in range(n)]
# e = [[90,60], [45,30], [10,8]]
# s = ['-' for i in range(n)]
# g = 'g2'
# t = 'g1_t1'

# df = pd.DataFrame()
# df['Chromosome'] = c
# df['Start'] = [i[0] for i in e]
# df['End'] = [i[1] for i in e]
# df['Strand'] = s
# cols = ['gene_name', 'gene_id']
# for c in cols:
#     df[c] = g
# cols = []

