In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *

## modify / collapse transcripts in TALON GTF

In [34]:
def get_stranded_gtf_dfs(df):
    """
    Split a GTF df into fwd and rev strands
    
    Parameters:
        df (pandas DataFrame): DF of gtf
    
    Returns:
        fwd (pandas DataFrame): DF of all forward-stranded entries from GTF
        rev (pandas DataFrame): DF of all reverse-stranded entries from GTF
    """
    rev = df.loc[df.Strand == '-'].copy(deep=True)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)
    
    return fwd, rev

def sort_gtf(df):
    """
    Sort a GTF into its proper ordering
    
    Parameters:
        df (pandas DataFrame): DF of GTF
    
    Returns:
        df (pandas DataFrame): DF of GTF, sorted
    """
    df['feature_rank'] = df.Feature.map({'gene':0, 'transcript':1, 'exon':2})
    df.feature_rank = df.feature_rank.astype(int)    
     
    fwd, rev = get_stranded_gtf_dfs(df)

    df = pd.DataFrame()
    for temp in [fwd, rev]:
        if len(temp.index) > 0:
            strand = temp.Strand.values.tolist()[0]
            if strand == '+':
                ascending = True
            elif strand == '-':
                ascending = False
            temp.sort_values(by=['gene_id', 'transcript_id', 'feature_rank', 'Start'], 
                             ascending=[True, True, True, ascending],
                             na_position='first', inplace=True)
            
            df = pd.concat([df, temp], ignore_index=True)
    df.drop('feature_rank', axis=1, inplace=True)
    return df

def get_update_ends_settings(strand, mode):
    """
    Returns which columns to refer to and which min/max function
    to use depending on looking at forward / rev strand or 
    tss / tes
    
    Parameters:
        strand (str): {'+', '-'}
        mode (str): {'tss', 'tes'}
    
    Returns:
        old_end (str): Name of column to modify; {'Start', 'End'}
        new_end (str): Name of column to pull new value from; {'Start_end', 'End_end'}
        gene_func (str): What function to apply to new_end; {'min', 'max'}
    """
    if mode == 'tss': 
        if strand == '+':
            old_end = 'Start'
            new_end = 'Start_end'
            gene_func = 'min'
        elif strand == '-':
            old_end = 'End'
            new_end = 'End_end'
            gene_func = 'max'
    elif mode == 'tes':
        if strand == '+':
            old_end = 'End'
            new_end = 'End_end'
            gene_func = 'max'
        elif strand == '-':
            old_end = 'Start'
            new_end  = 'Start_end'
            gene_func = 'min'
            
    return old_end, new_end, gene_func

def update_transcript_ends(df, mode, strand):
    """
    Update the ends of transcripts and the first / last exon
    in a GTF. GTF must be sorted!
    
    Parameters:
        df (pandas DataFrame): Sorted DF of GTF with 'Start_end', and 'End_end' 
            columns denoting the boundaries of each end region
        mode (str): {'tss', 'tes'}
        strand (str): {'+', '-'}
    
    Returns:
        df (pandas DataFrame): DF of GTF with transcript and 
            exon ends modified
    """
    old_col, new_col, gene_func = get_update_ends_settings(strand, mode)
    
    temp = df[['Feature', 'gene_id', 'transcript_id', 'Strand', 'Start', 'End', 'Start_end', 'End_end']].copy(deep=True)
    temp = temp.loc[temp.Feature != 'gene']
    if mode == 'tss':
        inds = temp.groupby('transcript_id').head(2).index.tolist()
    elif mode == 'tes':
        inds = temp.groupby('transcript_id').head(1).index.tolist()
        inds += temp.groupby('transcript_id').tail(1).index.tolist()

    df.loc[inds, old_col] = df.loc[inds, new_col]
    
    # convert float dtypes
    df.Start = df.Start.astype(int)
    df.End = df.End.astype(int)
    
    return df

def update_gene_ends(df, mode, strand):
    """
    Update the ends of genes in a GTF. 
    
    Parameters:
        df (pandas DataFrame): GTF dataframe 
        mode (str): {'tss', 'tes'}
        strand (str): {'+', '-'}
        
    Returns:
        df (pandas DataFrame): DataFrame of GTF with gene ends updated
    """
    # determine which ends we're updating and how we're doing so 
    old_col, new_col, gene_func = get_update_ends_settings(strand, mode)
    
    # get min or max of transcript ends depending on settings
    temp = df[['Feature', 'gene_id', old_col]].copy(deep=True)
    temp = temp.loc[temp.Feature == 'transcript']
    temp = temp.groupby(['gene_id', 'Feature'], observed=True).agg(gene_func).reset_index()
    temp.drop('Feature', axis=1, inplace=True)
    
    # add that coord to gene end
    df = df.merge(temp, on='gene_id', suffixes=('', '_gene'))
    inds = df.loc[df.Feature == 'gene'].index.tolist()
    df.loc[inds, old_col] = df.loc[inds, '{}_gene'.format(old_col)]
    df.drop('{}_gene'.format(old_col), axis=1, inplace=True)
    
    return df

def update_gtf_ends(gtf, tss, tes):
    """
    Update gene, transcript, and exon boundaries to be 
    furthest upstream or downstream entry for end used
    
    Parameters:
        gtf (pandas DataFrame): DF of GTF
        tss (pyranges PyRanges): PyRanges object of reference TSSs
        tes (pyranges PyRanges): PyRanges object of reference TESs
    
    Returns: 
        gtf (pandas DataFrame): DF of GTF with updated ends
            based on the TSSs and TESs used in the input beds
    """
    gtf = gtf.copy(deep=True)

    for mode, ends in zip(['tss', 'tes'], [tss, tes]):
        ends = ends.df
        ends = ends[['Start', 'End', '{}_id'.format(mode)]]
        gtf = gtf.merge(ends, how='left',
                        on='{}_id'.format(mode),
                        suffixes=('', '_end'))

        fwd, rev = get_stranded_gtf_dfs(gtf)
        df = pd.DataFrame()
        for strand, temp in zip(['+', '-'], [fwd, rev]):
    
            # fix exon, transcript, and gene boundaries
            temp = update_transcript_ends(temp, mode, strand)
            temp = update_gene_ends(temp, mode, strand)
            df = pd.concat([df, temp], ignore_index=True)
        
        gtf = df.copy(deep=True)
        gtf.drop(['Start_end', 'End_end'], axis=1, inplace=True)
        
    return gtf

def agg_gtf(df):
    """
    Deduplicate GTF transcripts that have the same triplet id
    
    Parameters:
        df (pandas DataFrame): DF of gtf from `update_ends` 
    
    Returns:
        df (pandas DataFrame): DF of gtf with deduplicated
            transcript / exon entries based on the triplet id
    """
    
    def collapse_non_gb_col(x):
        x = x.astype(str)
        x = x.unique().tolist()
        x = ','.join(x)
        return(x)

    gb_cols = ['Chromosome',
                 'Feature', 
                 'Start', 'End', 
                 'Score', 'Strand', 'Frame', 'gene_id', 'gene_name', 
                 'gene_status', 'gene_type', 'talon_gene', 
                 'ic', 'ic_id', 'tss_id', 'tss', 'tes_id', 'tes', 'transcript_id',
                 'transcript_name']
    gb_cols = list(set(df.columns)&set(gb_cols))

    # get collapsed features to add to deduplicated df
    t_df = df.loc[df.Feature == 'transcript'].copy(deep=True)
    non_gb_cols = list(set(t_df.columns.tolist())-set(gb_cols))
    t_df[non_gb_cols].fillna('', inplace=True)
    t_df = t_df.groupby(gb_cols, observed=True)[non_gb_cols].agg(collapse_non_gb_col).reset_index()
    t_df = t_df[['transcript_id']+non_gb_cols]
    collapsed_feats = t_df.copy(deep=True)

    # deduplicate df based only on transcript id
    temp = df[['transcript_id', 'original_transcript_id']].drop_duplicates()
    dupe_old_tids = temp.loc[temp.transcript_id.duplicated(keep='first'), 'original_transcript_id']
    df = df.loc[~df.original_transcript_id.isin(dupe_old_tids)]
    
    # replace the non gb columns with the ones that we already grouped
    df.drop(non_gb_cols, axis=1, inplace=True)
    df = df.merge(collapsed_feats, how='left', on='transcript_id')
    
    return df

In [113]:
h5 = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/cerberus/human_cerberus.h5'
# gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
gtf = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/lapa/human_swan_talon.corrected.gtf'
# gtf = 'test_dupe_genes.gtf'
# gtf = 'test_c1orf112.gtf'
agg = True
update_ends = True

a

3941882
     transcript_id_cerberus        transcript_id      Start  Start_end
12  ENSG00000000460[3,10,3]    ENCODEHT001085918  169794729  169794679
37  ENSG00000000460[3,10,3]  ENCODEHT001085918#0  169794782  169794679
62  ENSG00000000460[3,10,3]  ENCODEHT001085918#1  169794782  169794679
     transcript_id_cerberus        transcript_id      Start  Start_end
12  ENSG00000000460[3,10,3]    ENCODEHT001085918  169794679  169794679
37  ENSG00000000460[3,10,3]  ENCODEHT001085918#0  169794679  169794679
62  ENSG00000000460[3,10,3]  ENCODEHT001085918#1  169794679  169794679
Empty DataFrame
Columns: [transcript_id_cerberus, transcript_id, End, End_end]
Index: []
Empty DataFrame
Columns: [transcript_id_cerberus, transcript_id, End, End_end]
Index: []
     transcript_id_cerberus        transcript_id        End    End_end
12  ENSG00000000460[3,10,3]    ENCODEHT001085918  169853085  169853135
37  ENSG00000000460[3,10,3]  ENCODEHT001085918#0  169852909  169853135
62  ENSG00000000460[3,10,3]  ENCO

In [1]:
# df_back = df.copy(deep=True)

In [None]:
# # get test set to work with 
# df = df_back.copy(deep=True)
# temp = df.loc[df.Feature == 'transcript'].copy(deep=True)
# temp = temp.loc[temp.transcript_id.duplicated(keep=False)]
# temp.head()
# gids_1 = temp.loc[temp.Strand == '+', 'gene_id'].unique().tolist()[:5]
# gids_2 = temp.loc[temp.Strand == '-', 'gene_id'].unique().tolist()[:5]
# gids = gids_1+gids_2

# # # temp.head() 
# # temp.gene_id.head()
# gids

# df_back = df.copy(deep=True)

# df = df.loc[df.gene_id.isin(gids)]
# df = pr.PyRanges(df)
# df.to_gtf('test_dupe_genes.gtf')

In [7]:
gtf = 'test_dupe_genes.gtf'
df = pr.read_gtf(gtf).df
cols = ['Chromosome',
             'Feature', 
             'Start', 'End', 
             'Score', 'Strand', 'Frame', 'gene_id', 'gene_name', 
             'gene_status', 'gene_type', 'talon_gene', 
             'ic', 'ic_id', 'tss_id', 'tss', 'tes_id', 'tes', 'transcript_id',
             'transcript_name']
gb_cols = list(set(df.columns)&set(cols))
real_input = df.copy(deep=True)
# print(df.loc[(df.Feature == 'transcript')&(df.transcript_id == 'ENSG00000000460[3,10,3]'), gb_cols])
# df = agg_gtf(df)
# print()
# gb_cols = list(set(df.columns)&set(cols))
# print(df.loc[(df.Feature == 'transcript')&(df.transcript_id == 'ENSG00000000460[3,10,3]'), gb_cols])

In [8]:
temp = df.loc[df.Feature=='transcript']
temp.loc[temp.transcript_id.duplicated(keep=False)]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,NNC_transcript,transcript_type,transcript_support_level,havana_transcript,ISM_to_IDs,ISM_transcript,ISM-prefix_to_IDs,ISM-prefix_transcript,ISM-suffix_to_IDs,ISM-suffix_transcript
0,chr1,TALON,transcript,169794679,169853135,.,+,.,ENSG00000000460.16,C1orf112,...,,,,,,,,,,
25,chr1,TALON,transcript,169794679,169853135,.,+,.,ENSG00000000460.16,C1orf112,...,,,,,,,,,,
50,chr1,TALON,transcript,169794679,169853135,.,+,.,ENSG00000000460.16,C1orf112,...,,,,,,,,,,
100,chr1,TALON,transcript,169794989,169853135,.,+,.,ENSG00000000460.16,C1orf112,...,,,,,,,,,,
125,chr1,TALON,transcript,169794989,169853135,.,+,.,ENSG00000000460.16,C1orf112,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2353,chr20,HAVANA,transcript,50934816,50958605,.,-,.,ENSG00000000419.12,DPM1,...,,processed_transcript,1,OTTHUMT00000079717.1,,,,,,
2504,chrX,TALON,transcript,100628619,100636856,.,-,.,ENSG00000000003.14,TSPAN6,...,TRUE,,,,,,,,,
2513,chrX,TALON,transcript,100628619,100636856,.,-,.,ENSG00000000003.14,TSPAN6,...,TRUE,,,,,,,,,
2564,chrX,HAVANA,transcript,100628619,100636856,.,-,.,ENSG00000000003.14,TSPAN6,...,,protein_coding,1,OTTHUMT00000057483.1,,,,,,


In [61]:
test_agg_gtf(print_dfs=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## testing

In [56]:
def make_end_df(c,s,st,e,n, source,mode):
    df = pd.DataFrame()
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'Name']
    var = [c,s,st,e,n]
    for col, var in zip(cols, var):
        if type(var) == list:
            df[col] = var

    # add source
    df['source'] = source

    df = format_end_df(df)

    # get end # and gene id
    if any(df.Name.isnull()):
        df['gene_id'] = np.nan
        df[mode] = np.nan
    else:
        df['gene_id'] = df.Name.str.split('_', expand=True)[0]
        df[mode] = df.Name.str.split('_', expand=True)[1]

    # get arbitrary unique ids
    df['id'] = [i for i in range(len(df.index))]
    
    # get id
    df['{}_id'.format(mode)] = df.gene_id+'_'+df[mode]

    return df

def format_end_df(df):
    sort_cols = ['Chromosome', 'Start', 'End', 'Strand']
    df = df.sort_values(by=sort_cols)
    order = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'source']
    order = [o for o in order if o in df.columns]
    df = df[order]
    df.reset_index(drop=True, inplace=True)
    return df

def make_exon_df(n,c,e,s,g,t,nt=None,ag1=None,ag2=None):
    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Start'] = [i[0] for i in e]
    df['End'] = [i[1] for i in e]
    df['Strand'] = s
    df['Feature'] = 'exon'
    cols = ['gene_name', 'gene_id']
    for c in cols:
        df[c] = g
    cols = ['transcript_id', 'transcript_name']
    for c in cols:
        df[c] = t
    if nt:
        df['new_transcript_id'] = nt
    if ag1:
        df['ag1'] = ag1
    if ag2:
        df['ag2'] = ag2

    # reorder exons and starts/ stops if needed
    df['new_Start'] = df[['Start', 'End']].min(axis=1)
    df['new_End'] = df[['Start', 'End']].max(axis=1)
    df.drop(['Start', 'End'], axis=1, inplace=True)
    df.rename({'new_Start':'Start',
               'new_End':'End'}, axis=1, inplace=True)
    s = s[0]
    if s == '+':
        ascending = True
    elif s == '-':
        ascending = False
    df.sort_values(by='Start', ascending=ascending, inplace=True)
    return df

def make_hier_entry(df, how='t'):
    """
    kind {'g','t'}
    """
    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}
    t_df = df.copy(deep=True)
    t_df['min_coord'] = t_df[['Start', 'End']].min(axis=1)
    t_df['max_coord'] = t_df[['Start', 'End']].max(axis=1)
    if how == 't':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id', 'transcript_id', 'transcript_name',
                   'tss_id', 'tes_id',
                   'new_transcript_id', 'original_transcript_id', 
                   'original_transcript_name', 'ag1', 'ag2']
        gb_cols = list(set(gb_cols)&(set(t_df.columns)))
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
        
    cols = gb_cols + ['min_coord', 'max_coord']
    t_df = t_df[cols]
    t_df = t_df.groupby(gb_cols).agg(agg_dict).reset_index()
    t_df.rename({'min_coord': 'Start', 'max_coord': 'End'}, axis=1, inplace=True)
    if how == 't':
        t_df['Feature'] = 'transcript'
    elif how == 'g':
        t_df['Feature'] = 'gene'
        
    return t_df

def make_test_gtf(ts):
    df = pd.concat(ts)
        # make transcript entries
    t_df = make_hier_entry(df, how='t')
    # make gene entries
    g_df = make_hier_entry(df, how='g')

    # concat everything and sort by gene id, transcript id, feature rank (gene =0, t =1, exon=2), then start coords
    df = pd.concat([df, t_df, g_df])
    df = sort_gtf(df)
    return df

In [59]:
# tests for agg_gtf
def test_agg_gtf(print_dfs=True):
    
    # need:
    # transcripts that don't need to be aggregated
    # transcripts that do need to be aggregated

    ts = []

    # t1 - transcript that won't need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[1,10], [14,20], [25,30]]
    s = ['+' for i in range(n)]
    g = 'g1'
    nt = 'g1[1,1,1]'
    t = 'g1_t1'
    ag1='known'
    ag2='p1'
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)

    # t2 - similar transcript from g1 gene that won't need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[1,10], [14,20], [25,50]]
    s = ['+' for i in range(n)]
    g = 'g1'
    nt = 'g1[1,1,3]'
    t = 'g1_t2'
    ag1 = 'novel'
    ag2 = 'p1'
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_3'
    ts.append(df)

    # t3 - transcript from a different gene that does need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[90,60], [45,30], [10,8]]
    s = ['-' for i in range(n)]
    g = 'g2'
    nt = 'g2[1,1,1]'
    t = 'g2_t1'
    ag1 = 'known'
    ag2 = 'p3'
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = 'g2_1'
    df['tes_id'] = 'g2_1'
    ts.append(df)

    # t4 - transcript that needs to be collapsed with t3
    n = 3
    c = ['1' for i in range(n)]
    e = [[90,60], [45,30], [10,8]]
    s = ['-' for i in range(n)]
    g = 'g2'
    nt = 'g2[1,1,1]'
    t = 'g2_t2'
    ag1 = 'novel'
    ag2 ='p4'
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = 'g2_1'
    df['tes_id'] = 'g2_1'
    ts.append(df)

    # make test gtf
    test_df = make_test_gtf(ts)
    test_df.rename({'transcript_id': 'original_transcript_id',
                    'transcript_name': 'original_transcript_name'},
                   axis=1, inplace=True)
    test_df.rename({'new_transcript_id': 'transcript_id'}, 
                   axis=1, inplace=True)
    test_df['transcript_name'] = test_df['transcript_id']

    # make ctrl df
    ts = []

    # t1 - transcript that won't need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[1,10], [14,20], [25,30]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1[1,1,1]'
    ag1='known'
    ag2='p1'
    df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
    df['original_transcript_id'] = 'g1_t1'
    df['original_transcript_name'] = df['original_transcript_id']
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)

    # t2 - similar transcript from g1 gene that won't need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[1,10], [14,20], [25,50]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1[1,1,3]'
    ag1 = 'novel'
    ag2 = 'p1'
    df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
    df['original_transcript_id'] = 'g1_t2'
    df['original_transcript_name'] = df['original_transcript_id']
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_3'
    ts.append(df)

    # t3 /t4 collapsed- transcript from a different gene that does need to be aggregated
    n = 3
    c = ['1' for i in range(n)]
    e = [[90,60], [45,30], [10,8]]
    s = ['-' for i in range(n)]
    g = 'g2'
    t = 'g2[1,1,1]'
    ag1 = 'known,novel'
    ag2 = 'p3,p4'
    df = make_exon_df(n,c,e,s,g,t,ag1=ag1,ag2=ag2)
    df['tss_id'] = 'g2_1'
    df['tes_id'] = 'g2_1'
    df['original_transcript_id'] = 'g2_t1,g2_t2'
    df['original_transcript_name'] = df['original_transcript_id']
    ts.append(df)


    ctrl = pr.PyRanges(make_test_gtf(ts)).df
    test = pr.PyRanges(agg_gtf(test_df)).df

    ctrl.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)

    if print_dfs:
        print('test')
        print(test)
        print(test.index)
        print(test.dtypes)
        print('ctrl')
        print(ctrl)
        print(ctrl.index)
        print(ctrl.dtypes)

    pd.testing.assert_frame_equal(ctrl, test, check_like=True)

    assert len(ctrl.index) == len(test.index)

In [57]:
# tests for update_gtf_ends
def test_update_gtf_ends(print_dfs=True):

    ts = []
    # t1 - fwd strand transcript from gene w/ >2 transcripts
    n = 3
    c = ['1' for i in range(n)]
    e = [[1,10], [14,20], [25,30]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)
    
    # t1.5 - fwd strand transcript from gene that doesn't 
    # need boundaries updated
    n = 3
    c = ['1' for i in range(n)]
    e = [[0, 10], [14,20], [25, 35]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1_t1.5'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)

    # t2 - rev. strand transcript from gene w/ >2 transcripts
    n = 3
    c = ['1' for i in range(n)]
    e = [[90,60], [45,30], [10,8]]
    s = ['-' for i in range(n)]
    g = 'g2'
    t = 'g2_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g2_1'
    df['tes_id'] = 'g2_1'
    ts.append(df)

    # t3 - rev. strand transcript from gene w/ >2 transcripts
    n = 3
    c = ['1' for i in range(n)]
    e = [[95,60], [45,30], [10,6]]
    s = ['-' for i in range(n)]
    g = 'g2'
    t = 'g2_t2'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g2_2'
    df['tes_id'] = 'g2_2'
    ts.append(df)
    
    # t4 - fwd strand gene w/ monoexonic transcript
    n = 1
    c = ['1' for i in range(n)]
    e = [[20,30]]
    s = ['+']
    g = 'g3'
    t = 'g3_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g3_1'
    df['tes_id'] = 'g3_1'
    ts.append(df)
    
    # t5 - rev strand gene w/ monoexonic transcript
    n = 1
    c = ['1' for i in range(n)]
    e = [[50, 40]]
    s = ['-']
    g = 'g4'
    t = 'g4_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g4_1'
    df['tes_id'] = 'g4_1'
    ts.append(df)

    # tss reference
    n = 5
    mode = 'tss'
    c = ['1' for i in range(n)]
    s = ['+', '-', '-', '+', '-']
    st = [0, 85, 91, 19, 50] 
    e = [3, 93, 98, 22, 55]
    n = ['g1_1', 'g2_1', 'g2_2', 'g3_1', 'g4_1']
    source = 'v1'
    tss = make_end_df(c,s,st,e,n,source,mode)
    tss = pr.PyRanges(tss)

    # tes reference
    n = 5
    mode = 'tes'
    c = ['1' for i in range(n)]
    s = ['+', '-', '-', '+', '-']
    st = [25, 7, 4, 30, 35]
    e = [35, 9, 6, 31, 40]
    n = ['g1_1', 'g2_1', 'g2_2', 'g3_1', 'g4_1']
    source = 'v1'
    tes = make_end_df(c,s,st,e,n,source,mode)
    tes = pr.PyRanges(tes)

    test_df = make_test_gtf(ts)
    # test_df = pr.PyRanges(test_df)

    test = update_gtf_ends(test_df, tss, tes)

    # ctrl for update_gtf_ends
    # tests for update_gtf_ends
    ts = []
    # t1 - fwd strand transcript
    n = 3
    c = ['1' for i in range(n)]
    e = [[0,10], [14,20], [25,35]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)
    
    # t1.5 - fwd strand transcript from gene that doesn't 
    # need boundaries updated
    n = 3
    c = ['1' for i in range(n)]
    e = [[0, 10], [14,20], [25, 35]]
    s = ['+' for i in range(n)]
    g = 'g1'
    t = 'g1_t1.5'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g1_1'
    df['tes_id'] = 'g1_1'
    ts.append(df)

    # t2 - rev. strand transcript from gene w/ >2 transcripts
    n = 3
    c = ['1' for i in range(n)]
    e = [[93,60], [45,30], [10,7]]
    s = ['-' for i in range(n)]
    g = 'g2'
    t = 'g2_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g2_1'
    df['tes_id'] = 'g2_1'
    ts.append(df)

    # t3 - rev. strand transcript from gene w/ >2 transcripts
    n = 3
    c = ['1' for i in range(n)]
    e = [[98,60], [45,30], [10,4]]
    s = ['-' for i in range(n)]
    g = 'g2'
    t = 'g2_t2'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g2_2'
    df['tes_id'] = 'g2_2'
    ts.append(df)
    
    # t4 - fwd strand gene w/ monoexonic transcript
    n = 1
    c = ['1' for i in range(n)]
    e = [[19,31]]
    s = ['+']
    g = 'g3'
    t = 'g3_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g3_1'
    df['tes_id'] = 'g3_1'
    ts.append(df)
    
    # t4 - rev strand gene w/ monoexonic transcript
    n = 1
    c = ['1' for i in range(n)]
    e = [[55, 35]]
    s = ['-']
    g = 'g4'
    t = 'g4_t1'
    df = make_exon_df(n,c,e,s,g,t)
    df['tss_id'] = 'g4_1'
    df['tes_id'] = 'g4_1'
    ts.append(df)

    ctrl = make_test_gtf(ts)
    ctrl.reset_index(inplace=True, drop=True)
    test.reset_index(inplace=True, drop=True)

    if print_dfs:
        print('test')
        print(test)
        print(test.index)
        print(test.dtypes)
        print('ctrl')
        print(ctrl)
        print(ctrl.index)
        print(ctrl.dtypes)

    pd.testing.assert_frame_equal(ctrl, test, check_like=True)

    assert len(ctrl.index) == len(test.index)

In [232]:
test_update_gtf_ends(False)

In [62]:
# tests for map_gtf_ids
# - transcript that does not have duplicate
# - transcript that does have a duplicate

ts = []
# t1 - transcript that doesn't need to be merged
n = 3
c = ['1' for i in range(n)]
e = [[1,10], [14,20], [25,30]]
s = ['+' for i in range(n)]
g = 'g1'
t = 'g1_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t2 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[90,60], [45,30], [10,8]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t1'
ts.append(make_exon_df(n,c,e,s,g,t))

# t3 - rev. strand transcript that needs to be merged
n = 3
c = ['1' for i in range(n)]
e = [[95,60], [45,30], [10,6]]
s = ['-' for i in range(n)]
g = 'g2'
t = 'g2_t2'
ts.append(make_exon_df(n,c,e,s,g,t))

test_df = make_test_gtf(ts)

# map file
otid = ['g1_t1', 'g2_t1', 'g2_t2',]
otname = [o+'n' for o in otid]
tid = ['g1[1,1,1]', 'g2[1,1,1]', 'g2[1,1,1]']
tname = [o.split('[')[0]+'n['+o.split('[')[1] for o in tid]
gid = ['g1', 'g2', 'g2']
gname = ['g1n', 'g2n', 'g2n']
m_df = pd.DataFrame()
m_df['original_transcript_id'] = otid
m_df['original_transcript_name'] = otname
m_df['transcript_id'] = tid
m_df['transcript_name'] = tname
m_df['gene_name'] = gname
m_df['transcript_triplet'] = m_df.transcript_id.str.slice(2)
m_df['gene_id'] = gid

# control
otid = ['g1_t1', 'g2_t2']

## write h5 ref from input beds and ics tsv

In [2]:
ic = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/temp/talon_ic.tsv'
tes = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes.bed'
tss = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss.bed'
tss_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes_source_map.bed'
tes_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss_source_map.bed'

# df = read_ic_ref(ic)
# df.head()

write_reference(tss, tes, ic, 'test.h5')

# df = read_cerberus_ends(tss, mode='tss')
# df

df = read_cerberus_source_map(tss_map)
df.head()