In [2]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *

## ddebugging agg_gtf

In [6]:
def agg_gtf(df):
    """
    Deduplicate GTF transcripts that have the same triplet id

    Parameters:
        df (pandas DataFrame): DF of gtf from `update_ends`

    Returns:
        df (pandas DataFrame): DF of gtf with deduplicated
            transcript / exon entries based on the triplet id
    """

    def collapse_non_gb_col(x):
        x = x.fillna('')
        x = x.astype(str)
        x = x.unique().tolist()
        x = ','.join(x)
        if x == '':
            x = np.nan
        return(x)

    gb_cols = ['Chromosome',
                 'Feature',
                 'Start', 'End',
                 'Score', 'Strand', 'Frame', 'gene_id', 'gene_name',
                 'gene_status', 'gene_type', 'talon_gene',
                 'ic', 'ic_id', 'tss_id', 'tss', 'tes_id', 'tes', 'transcript_id',
                 'transcript_name']
    gb_cols = list(set(df.columns)&set(gb_cols))

    # get dictionary:function mapping for aggregation
    agg_cols = list(set(df.columns)-set(gb_cols))
    agg_dict = dict()
    for c in agg_cols:
        agg_dict[c] = collapse_non_gb_col

    # get collapsed features to add to deduplicated df
    t_df = df.loc[df.Feature == 'transcript'].copy(deep=True)
    non_gb_cols = list(set(t_df.columns.tolist())-set(gb_cols))
    t_df[non_gb_cols].fillna('', inplace=True)
    t_df = t_df.groupby(gb_cols, observed=True).agg(agg_dict).reset_index()
    t_df = t_df[['transcript_id']+non_gb_cols]
    collapsed_feats = t_df.copy(deep=True)

    # deduplicate df based only on transcript id
    temp = df[['transcript_id', 'original_transcript_id']].drop_duplicates()
    dupe_old_tids = temp.loc[temp.transcript_id.duplicated(keep='first'), 'original_transcript_id']
    df = df.loc[~df.original_transcript_id.isin(dupe_old_tids)]

    # replace the non gb columns with the ones that we already grouped
    df.drop(non_gb_cols, axis=1, inplace=True)
    df = df.merge(collapsed_feats, how='left', on='transcript_id')

    return df

In [7]:
d = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/'
h5 = '{}human_cerberus.h5'.format(d)
gtf = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/lapa/smol.gtf'
update_ends = True
agg = True
o = '{}smol_replace.gtf'.format(d)

# replace_gtf_ids(h5, gstf, update_ends, agg, o)

In [15]:
print_dfs = False
# def test_agg_gtf(print_dfs=True):

# need:
# transcripts that don't need to be aggregated
# transcripts that do need to be aggregated

ts = []

# t1 - transcript that won't need to be aggregated
n = 3
c = ['1' for i in range(n)]
e = [[1,10], [14,20], [25,30]]
s = ['+' for i in range(n)]
g = 'g1'
nt = 'g1[1,1,1]'
t = 'g1_t1'
ag1='known'
ag2='p1'
df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
df['tss_id'] = 'g1_1'
df['tes_id'] = 'g1_1'
ts.append(df)

# t2 - similar transcript from g1 gene that won't need to be aggregated
n = 3
c = ['1' for i in range(n)]
e = [[1,10], [14,20], [25,50]]
s = ['+' for i in range(n)]
g = 'g1'
nt = 'g1[1,1,3]'
t = 'g1_t2'
ag1 = 'novel'
ag2 = 'p1'
df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
df['tss_id'] = 'g1_1'
df['tes_id'] = 'g1_3'
ts.append(df)

# t3 - transcript from a different gene that does need to be aggregated
n = 3
c = ['1' for i in range(n)]
e = [[90,60], [45,30], [10,8]]
s = ['-' for i in range(n)]
g = 'g2'
nt = 'g2[1,1,1]'
t = 'g2_t1'
ag1 = 'known'
ag2 = 'p3'
df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
df['tss_id'] = 'g2_1'
df['tes_id'] = 'g2_1'
ts.append(df)

# t4 - transcript that needs to be collapsed with t3
n = 3
c = ['1' for i in range(n)]
e = [[90,60], [45,30], [10,8]]
s = ['-' for i in range(n)]
g = 'g2'
nt = 'g2[1,1,1]'
t = 'g2_t2'
ag1 = 'novel'
ag2 ='p4'
df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
df['tss_id'] = 'g2_1'
df['tes_id'] = 'g2_1'
ts.append(df)

# make test gtf
test_df = make_test_gtf(ts)
test_df.rename({'transcript_id': 'original_transcript_id',
                'transcript_name': 'original_transcript_name'},
               axis=1, inplace=True)
test_df.rename({'new_transcript_id': 'transcript_id'},
               axis=1, inplace=True)
test_df['transcript_name'] = test_df['transcript_id']

# # make ctrl df
# ts = []

# # t1 - transcript that won't need to be aggregated
# n = 3
# c = ['1' for i in range(n)]
# e = [[1,10], [14,20], [25,30]]
# s = ['+' for i in range(n)]
# g = 'g1'
# t = 'g1[1,1,1]'
# ag1='known'
# ag2='p1'
# df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
# df['original_transcript_id'] = 'g1_t1'
# df['original_transcript_name'] = df['original_transcript_id']
# df['tss_id'] = 'g1_1'
# df['tes_id'] = 'g1_1'
# ts.append(df)

# # t2 - similar transcript from g1 gene that won't need to be aggregated
# n = 3
# c = ['1' for i in range(n)]
# e = [[1,10], [14,20], [25,50]]
# s = ['+' for i in range(n)]
# g = 'g1'
# t = 'g1[1,1,3]'
# ag1 = 'novel'
# ag2 = 'p1'
# df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
# df['original_transcript_id'] = 'g1_t2'
# df['original_transcript_name'] = df['original_transcript_id']
# df['tss_id'] = 'g1_1'
# df['tes_id'] = 'g1_3'
# ts.append(df)

# # t3 /t4 collapsed- transcript from a different gene that does need to be aggregated
# n = 3
# c = ['1' for i in range(n)]
# e = [[90,60], [45,30], [10,8]]
# s = ['-' for i in range(n)]
# g = 'g2'
# t = 'g2[1,1,1]'
# ag1 = 'known,novel'
# ag2 = 'p3,p4'
# df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
# df['tss_id'] = 'g2_1'
# df['tes_id'] = 'g2_1'
# df['original_transcript_id'] = 'g2_t1,g2_t2'
# df['original_transcript_name'] = df['original_transcript_id']
# ts.append(df)


# ctrl = pr.PyRanges(make_test_gtf(ts)).df
# test = pr.PyRanges(agg_gtf(test_df)).df

# ctrl.reset_index(inplace=True, drop=True)
# test.reset_index(inplace=True, drop=True)

# if print_dfs:
#     print('test')
#     print(test)
#     print(test.index)
#     print(test.dtypes)
#     print('ctrl')
#     print(ctrl)
#     print(ctrl.index)
#     print(ctrl.dtypes)

# #     pd.testing.assert_frame_equal(ctrl, test, check_like=True)

# #     assert len(ctrl.index) == len(test.index)

In [16]:
test_df.head()

Unnamed: 0,Chromosome,Strand,Feature,gene_name,gene_id,original_transcript_id,original_transcript_name,transcript_id,ag1,ag2,Start,End,tss_id,tes_id,transcript_name
0,1,+,gene,g1,g1,,,,,,1,50,,,
1,1,+,transcript,g1,g1,g1_t1,g1_t1,"g1[1,1,1]",known,p1,1,30,g1_1,g1_1,"g1[1,1,1]"
2,1,+,exon,g1,g1,g1_t1,g1_t1,"g1[1,1,1]",known,p1,1,10,g1_1,g1_1,"g1[1,1,1]"
3,1,+,exon,g1,g1,g1_t1,g1_t1,"g1[1,1,1]",known,p1,14,20,g1_1,g1_1,"g1[1,1,1]"
4,1,+,exon,g1,g1,g1_t1,g1_t1,"g1[1,1,1]",known,p1,25,30,g1_1,g1_1,"g1[1,1,1]"


In [17]:
test_df[['transcript_id', 'transcript_name', 'original_transcript_id', 'original_transcript_name']].head()

Unnamed: 0,transcript_id,transcript_name,original_transcript_id,original_transcript_name
0,,,,
1,"g1[1,1,1]","g1[1,1,1]",g1_t1,g1_t1
2,"g1[1,1,1]","g1[1,1,1]",g1_t1,g1_t1
3,"g1[1,1,1]","g1[1,1,1]",g1_t1,g1_t1
4,"g1[1,1,1]","g1[1,1,1]",g1_t1,g1_t1


In [12]:
def make_end_df(c,s,st,e,n, source,mode):
    df = pd.DataFrame()
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'Name']
    var = [c,s,st,e,n]
    for col, var in zip(cols, var):
        if type(var) == list:
            df[col] = var

    # add source
    df['source'] = source

    df = format_end_df(df)

    # get end # and gene id
    if any(df.Name.isnull()):
        df['gene_id'] = np.nan
        df[mode] = np.nan
    else:
        df['gene_id'] = df.Name.str.split('_', expand=True)[0]
        df[mode] = df.Name.str.split('_', expand=True)[1]

    # get arbitrary unique ids
    df['id'] = [i for i in range(len(df.index))]

    # get id
    df['{}_id'.format(mode)] = df.gene_id+'_'+df[mode]

    return df

def format_end_df(df):
    sort_cols = ['Chromosome', 'Start', 'End', 'Strand']
    df = df.sort_values(by=sort_cols)
    order = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'source']
    order = [o for o in order if o in df.columns]
    df = df[order]
    df.reset_index(drop=True, inplace=True)
    return df

def make_exon_df(n,c,e,s,g,t,nt=None,ag1=None,ag2=None):
    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Start'] = [i[0] for i in e]
    df['End'] = [i[1] for i in e]
    df['Strand'] = s
    df['Feature'] = 'exon'
    cols = ['gene_name', 'gene_id']
    for c in cols:
        df[c] = g
    cols = ['transcript_id', 'transcript_name']
    for c in cols:
        df[c] = t
    if nt:
        df['new_transcript_id'] = nt
    if ag1:
        df['ag1'] = ag1
    if ag2:
        df['ag2'] = ag2

    # reorder exons and starts/ stops if needed
    df['new_Start'] = df[['Start', 'End']].min(axis=1)
    df['new_End'] = df[['Start', 'End']].max(axis=1)
    df.drop(['Start', 'End'], axis=1, inplace=True)
    df.rename({'new_Start':'Start',
               'new_End':'End'}, axis=1, inplace=True)
    s = s[0]
    if s == '+':
        ascending = True
    elif s == '-':
        ascending = False
    df.sort_values(by='Start', ascending=ascending, inplace=True)
    return df

def make_hier_entry(df, how='t'):
    """
    kind {'g','t'}
    """
    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}
    t_df = df.copy(deep=True)
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name',
                   'tss_id', 'tes_id',
                   'new_transcript_id', 'original_transcript_id',
                   'original_transcript_name', 'ag1', 'ag2']
        gb_cols = list(set(gb_cols)&(set(t_df.columns)))
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']

    cols = gb_cols + ['min_coord', 'max_coord']
    t_df = t_df[cols]
    t_df = t_df.groupby(gb_cols).agg(agg_dict).reset_index()
    t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1, inplace=True)
    if how == 't':
        t_df['Feature'] = 'transcript'
    elif how == 'g':
        t_df['Feature'] = 'gene'

    return t_df

def make_test_gtf(ts):
    df = pd.concat(ts)
        # make transcript entries
    t_df = make_hier_entry(df, how='t')
    # make gene entries
    g_df = make_hier_entry(df, how='g')

    # concat everything and sort by gene id, transcript id, feature rank (gene =0, t =1, exon=2), then start coords
    df = pd.concat([df, t_df, g_df])
    df = sort_gtf(df)
    return df


## write h5 ref from input beds and ics tsv

In [70]:
ic = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/temp/talon_ic.tsv'
tes = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes.bed'
tss = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss.bed'
tss_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes_source_map.bed'
tes_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss_source_map.bed'

# df = read_ic_ref(ic)
# df.head()

write_reference(tss, tes, ic, 'test.h5')

# df = read_cerberus_ends(tss, mode='tss')
# df

df = read_cerberus_source_map(tss_map)
df.head()

Unnamed: 0,Chromosome,Start,End,Strand,source,Name
0,chr1,169804296,169804436,+,v40,ENSG00000000460_2
1,chr1,169807786,169807887,+,v40,ENSG00000000460_4
2,chr1,169821668,169821769,+,v40,ENSG00000000460_5
3,chr1,169852986,169853135,+,v40,ENSG00000000460_3
4,chr1,169854029,169854130,+,v40,ENSG00000000460_1
