In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from cerberus.cerberus import *

## end aggregation

In [2]:
def test_agg_2_ends_1(print_dfs=False):
    """
    Test agg_2_ends w/ and w/o end adding
    """
    
    def get_test(mode='tss'):
        # example should have 
        # - entries that overlap
        # - entries that don't overlap but are within a certain distance
        # - entries that are unique to either
        # - entries that overlap but aren't using the same gene_id / strand (these will be 
        #      equivalent situtations b/c the gene ids for things on different strands will always
        #      differ

        # later - beds that don't have strand
        # later - beds that don't have gid

        buffer = 20
        
        n = 4
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [1, 200, 100, 300]
        e = [15, 250, 110, 340]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4']
        source = 'v1'
        bed1 = make_end_df(c,s,st,e,n, source, mode)
        bed1 = pr.PyRanges(bed1)

        n = 4
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [5, 120, 500, 200]
        e = [10, 140, 550, 250]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene2_1']
        source = 'v2'
        bed2 = make_end_df(c,s,st,e,n, source, mode)
        bed2 = pr.PyRanges(bed2)

        return bed1, bed2
    
    def get_ctrl(add=True):
        
        mode = 'tss'
        
        n = 6
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [1,200,100,300, 200,500]
        e = [15,250,110,340, 250,550]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene2_1', 'gene1_5']
        source = ['v1,v2','v1','v1,v2','v1', 'v2','v2']
        df = make_end_df(c,s,st,e,n, source,mode)

        # convert a few dtypes
        df['Strand'] = df['Strand'].astype('category')
        df['Chromosome'] = df['Chromosome'].astype('category')

        # if we're not adding new ends
        if not add:
            df = df.loc[df.source != 'v2']
            
        # remove unnecessary columns
        df.drop(['gene_id', 'id', mode], axis=1, inplace=True)
        
        # fix ids 
        df.reset_index(drop=True, inplace=True)

        return df

    tests = [True, False]
    for add_ends in tests:
        
        buffer = 20
        mode = 'tss'
        sort_cols = ['Chromosome', 'Strand', 'gene_id', 'Start', 'End']
        order = ['Chromosome', 'Start', 'End', 'Strand', 'Name',
                 'gene_id', 'source', mode]
        bed1, bed2 = get_test()
        df = agg_2_ends(bed1, bed2,
                        strand=True, 
                        gid=True,
                        buffer=buffer,
                        add_ends=add_ends,
                        mode=mode)
        
        test = format_end_df(df)
        ctrl = get_ctrl(add=add_ends)
        
        if print_dfs:
            print('test')
            print(test)
            print(test.index)
            print(test.dtypes)
            print('ctrl')
            print(ctrl)
            print(ctrl.index)
            print(ctrl.dtypes)

        pd.testing.assert_frame_equal(ctrl, test, check_like=True)
        
        assert len(ctrl.index) == len(test.index)
        
def make_end_df(c,s,st,e,n, source,mode):
    df = pd.DataFrame()
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'Name']
    var = [c,s,st,e,n]
    for col, var in zip(cols, var):
        if type(var) == list:
            df[col] = var
            
    # add source
    df['source'] = source

    df = format_end_df(df)

    # get end # and gene id
    if any(df.Name.isnull()):
        df['gene_id'] = np.nan
        df[mode] = np.nan
    else:
        df['gene_id'] = df.Name.str.split('_', expand=True)[0]
        df[mode] = df.Name.str.split('_', expand=True)[1]
    
    # get arbitrary unique ids
    df['id'] = [i for i in range(len(df.index))]

    return df

def format_end_df(df):
    sort_cols = ['Chromosome', 'Start', 'End', 'Strand']
    df = df.sort_values(by=sort_cols)
    order = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'source']
    order = [o for o in order if o in df.columns]
    df = df[order]
    df.reset_index(drop=True, inplace=True)
    return df

def test_agg_2_ends_2(print_dfs=False):
    """
    Test agg_2_ends w/ and w/o end adding
    """
    
    def get_test(mode='tss'):
        
        # adding a bed file that doesn't have strand or gid info
        # TODO add an entry in bed2 that is duplicated based on 
        # - strandedness
        # - gene id
        # in bed1

        buffer = 20
        
        n = 6
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        s[-1] = '-'
        st = [1, 200, 100, 300, 260, 260]
        e = [15, 250, 110, 340, 290, 290]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene2_1', 'gene3_1']
        source = 'v1'
        bed1 = make_end_df(c,s,st,e,n, source, mode)
        bed1 = pr.PyRanges(bed1)

        n = 4
        c = ['1' for i in range(n)]
        s = [np.nan for i in range(n)]
        st = [5, 120, 500, 200]
        e = [10, 140, 550, 250]
        n = [np.nan for i in range(n)]
        source = 'v2'
        bed2 = make_end_df(c,s,st,e,n, source, mode)
        bed2 = pr.PyRanges(bed2)

        return bed1, bed2
    
    def get_ctrl(add=True):
        
        mode = 'tss'
        
        n = 6
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        s[-1] = '-'
        st = [1,200,100,300, 260,260]
        e = [15,250,110,340, 290,290]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene2_1', 'gene3_1']
        source = ['v1,v2','v1,v2','v1,v2','v1', 'v1,v2', 'v1,v2']
        df = make_end_df(c,s,st,e,n, source,mode)

        # convert a few dtypes
        df['Strand'] = df['Strand'].astype('category')
        df['Chromosome'] = df['Chromosome'].astype('category')

        # if we're not adding new ends
        if not add:
            df = df.loc[df.source != 'v2']
            
        # remove unnecessary columns
        df.drop(['gene_id', 'id', mode], axis=1, inplace=True)
        
        # fix ids 
        df.reset_index(drop=True, inplace=True)

        return df

    strand = False
    gid = False
    add_ends = False
        
    buffer = 20
    mode = 'tss'
    sort_cols = ['Chromosome', 'Strand', 'gene_id', 'Start', 'End']
    order = ['Chromosome', 'Start', 'End', 'Strand', 'Name',
             'gene_id', 'source', mode]
    bed1, bed2 = get_test()
    df = agg_2_ends(bed1, bed2,
                    strand=strand, 
                    gid=gid,
                    buffer=buffer,
                    add_ends=add_ends,
                    mode=mode)
    test = format_end_df(df)
    ctrl = get_ctrl(add=add_ends)

    if print_dfs:
        print('test')
        print(test)
        print(test.index)
        print(test.dtypes)
        print('ctrl')
        print(ctrl)
        print(ctrl.index)
        print(ctrl.dtypes)

    pd.testing.assert_frame_equal(ctrl, test, check_like=True)
        
    assert len(ctrl.index) == len(test.index)

In [3]:
test_agg_2_ends_1(print_dfs=False)
test_agg_2_ends_2(print_dfs=True)

test
  Chromosome  Start  End Strand     Name source
0          1      1   15      +  gene1_1  v1,v2
1          1    100  110      +  gene1_3  v1,v2
2          1    200  250      +  gene1_2  v1,v2
3          1    260  290      +  gene2_1  v1,v2
4          1    260  290      -  gene3_1  v1,v2
5          1    300  340      +  gene1_4     v1
RangeIndex(start=0, stop=6, step=1)
Chromosome    category
Start            int64
End              int64
Strand        category
Name            object
source          object
dtype: object
ctrl
  Chromosome  Start  End Strand     Name source
0          1      1   15      +  gene1_1  v1,v2
1          1    100  110      +  gene1_3  v1,v2
2          1    200  250      +  gene1_2  v1,v2
3          1    260  290      +  gene2_1  v1,v2
4          1    260  290      -  gene3_1  v1,v2
5          1    300  340      +  gene1_4     v1
RangeIndex(start=0, stop=6, step=1)
Chromosome    category
Start            int64
End              int64
Strand        category
Na

In [4]:
# def agg_2_ends(bed1, bed2,
#                strand,
#                gid,
#                buffer,
#                add_ends,
#                mode):
#     """
#     Parameters:
#         bed1 (pyranges PyRanges): Bed PyRanges object for existing ends
#         bed2 (pyranges PyRanges): Bed PyRanges object for new ends
#         buffer (int): Maximum allowable distance between ends in bed1 and bed2 
#             to call them the same end
#         add_ends (bool): Whether to initialize new regions from bed2
#     """
    
#     source1 = bed1.df.source.unique().tolist()[0]
#     source2 = bed2.df.source.unique().tolist()[0]
    
#     new_c = '{}_new'.format(mode)
#     max_c = '{}_max'.format(mode)
    
#     # depending on whether the new bed has strand information, 
#     # construct the join call
#     if strand:
#         temp_joined = bed1.join(bed2, 
#             strandedness='same',
#             suffix='_new',
#             slack=buffer,
#             how='left')
#     elif not strand:
#         temp_joined = bed1.join(bed2, 
#             strandedness=False,
#             suffix='_new',
#             slack=buffer,
#             how='left')
        

#     temp_joined = temp_joined.df
#     # print(temp_joined)
#     df = pd.DataFrame()

#     ### old ends ###

#     # situation 1: ends match across the datasets in coord and gene id
#     temp = temp_joined.loc[temp_joined.gene_id == temp_joined.gene_id_new].copy(deep=True)
#     temp.source = temp.source+','+temp.source_new
#     # print(temp)
#     df = pd.concat([df, temp])
#     # print(df)

#     # situation 2: ends are only in the first dataset
#     temp = temp_joined.loc[(temp_joined.gene_id_new == '-1')|(temp_joined.gene_id!=temp_joined.gene_id_new)].copy(deep=True)
#     # print('here')
#     # print(temp)
#     df = pd.concat([df, temp])
#     # print('df before add_aends')
#     # print(df)
#     # print()

#     # restrict to relevant columns
#     cols = ['Chromosome', 'Start', 'End', 'Strand',
#             'Name', 'gene_id', 'source', mode, 'id_new']
#     df = df[cols]
#     df.rename({'id_new': 'id'}, axis=1, inplace=True)

#     ### new ends, only add if we're allowing them to be independent
#     ### end support
#     if add_ends:
        
#         new_df = pd.DataFrame()
        
#         drop_cols = ['Start', 'End', 'Strand', 'gene_id', 'source', 'Name', mode]
#         m = {'Start_new': 'Start', 
#              'End_new': 'End',
#              'gene_id_new': 'gene_id',
#              'Strand_new': 'Strand',
#              'source_new': 'source',
#              'Name_new': 'Name', 
#              new_c: mode}

#         # situation 3: the ends overlapped, but the gene ids didn't match
#         temp = temp_joined.loc[(temp_joined.gene_id!=temp_joined.gene_id_new)&(temp_joined.gene_id_new!='-1')].copy(deep=True)
#         temp.drop(drop_cols, axis=1, inplace=True)
#         temp.rename(m, axis=1, inplace=True)
#         # print('hewwo')
#         # print(temp.head())
#         # df = pd.concat([df, temp])
#         # print('situation 3 new ends: ')
#         # print(temp)
#         new_df = pd.concat([new_df, temp])
#         # print(df)

#         # situation 4: the ends are brand new and didn't overlap at all in the existing ends
#         bed2 = bed2.df
#         # print(bed2.id.dtypes)
#         # print(temp.id.dtypes)
#         # print(bed2.id.tolist())
#         # print(df.id.tolist())
#         # print('id check')
#         # print('df')
#         # print(df.head())
    
#         # print('bed2')
#         # print(bed2.head())
#         inds = list(set(bed2.id.tolist())-set(df.id.tolist()))
#         # print(inds)
#         temp = bed2.loc[inds]
#         # print(temp)
#         # df = pd.concat([df, temp])
#         # print('situation 4 new ends: ')
#         # print(temp)
#         new_df = pd.concat([new_df, temp])
        
#         # print('bepis')
#         # print(df.head())
        
#         # if we added new ends, number those according to which gene they're from 
#         # n = int(df[mode].max())+1
#         # new_ends = df.loc[df.source == source2].index
#         # nums = [i for i in range(n, 
        
#         # print(df.head())
#         # temp = df.loc[df.source.str.contains(source1)].copy(deep=True)
#         # print(source1)
#         # print(temp.head())
        
#         g_maxes = get_gene_feat_max(df, mode)
#         new_df = renumber_new_feats(new_df, g_maxes, mode)
        
# #         # get max end number per gene from existing ends
# #         df[mode] = df[mode].astype(int)
# #         temp = df[['gene_id', mode]].copy(deep=True)
# #         temp = temp.groupby('gene_id').max().reset_index()
# #         temp.rename({mode: max_c}, axis=1, inplace=True)
# # #         print('new_df')
# # #         print(new_df.head())
# # #         print('temp')
# # #         print(temp.head())
        
# #         # merge with new ends that are being added
# #         new_df = new_df.merge(temp, how='left', on='gene_id')
# #         new_df[max_c].fillna(0, inplace=True)
# #         # df = df.merge(temp, how='left', on='gene_id')
# #         sort_cols = ['gene_id', mode]
        
# #         # renumber new ends 
# #         new_df[new_c] = new_df.sort_values(by=sort_cols,
# #                                         ascending=[True, True])\
# #                                         .groupby(['gene_id'])\
# #                                         .cumcount()+1
# #         new_df[new_c] = new_df[new_c].astype(int) + new_df[max_c].astype(int)
        
#         # some df formatting
#         new_df.drop([mode, max_c], axis=1, inplace=True)
#         new_df.rename({new_c: mode}, axis=1, inplace=True)
#         # print(new_df.head())
        
#         # finally, concatenate new and old df
#         df = pd.concat([df, new_df])
        


#         # print(df[['gene_id', mode]].dtypes)
#         # print(temp[['gene_id', mode]].dtypes)
# #         print(df[['Name', mode]])
# #         print(df['Name'].map(m, na_action='ignore'))
# #         df[mode] = df['Name'].map(m, na_action='ignore')
# #         print(m)
# #         print('hewwo?')                
# #         print()
# #         print()
# #         print(temp)
# #         print('break')
# #         print(df)
# #         df['Name'] = df.gene_id+'_'+df[mode]
        
    
#     # add missing columns
#     # ie mode, gene_id, Strand?

#     # drop unnecessary columns and create new names
#     # and do some extra formatting
#     df.drop('id', axis=1, inplace=True)
#     df['Name'] = df.gene_id+'_'+df[mode].astype(str)
#     df[mode] = df[mode].astype(int)
#     # keep_cols = ['Chromosome', 'Start', 'End', 'Strand', 'gene_id', 'source', mode]
#     # df = df[keep_cols]
    
#     return df

In [79]:
test_agg_2_ends_1(print_dfs=False)

In [39]:
def format_agg_2_ends_bed(bed, mode):
    cols = ['Name', 'gene_id', mode, 'Strand']
    
    gid = True
    strand = True
    for col in cols:
        if col not in bed.cols:
            bed[col] = [np.nan for i in range(len(bed.index))]
            
            # record some bools
            if col == 'gene_id':
                gid = False
            elif col == 'Strand':
                strand = False
                
    return bed, gid, strand

In [173]:
def aggregate_ends(beds, sources, add_ends, mode):
    
    df = pd.DataFrame()
    i = 0
    for bed_fname, source, add in zip(beds, sources, add_ends):
        
        bed = read_bed(bed_fname, mode)
        bed = bed.df
        bed['source'] = source
        bed = pr.PyRanges(bed)
        
        # first bed; just accept all these ends
        if len(df.index) == 0:
            
            if 'gene_id' not in bed.df.columns and 'Strand' not in bed.df.columns:
                raise Exception('First bed must contain Strand and gene_id columns')
                
            df = bed.df
        
        # more than one bed; merge and reconcile ends
        else:
            
            if 'gene_id' not in bed.df.columns or 'Strand' not in bed.df.columns:
                if add:
                    raise Exception('Cannot add new ends from {} because '+\
                                    'it does not contain gene_id information.')
            
            # add missing columns but keep track of what information we'll be 
            # able to merge on 
            bed, gid, strand = format_agg_2_ends_bed(bed, mode)
            
            # 
            df = agg_2_ends(df, bed,
                            strand, gid,
                            buffer, add, mode)
        
        # 
            i += 1


In [312]:
fname = 'tests/files/Canx_1_tss.bed'
df = read_bed(fname, mode='tss')

fname = 'tests/files/Canx_ccre_pels.bed'
# df = read_bed(fname, mode='tss')

In [313]:
df.head()

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_id,tss
0,chr11,50298268,50298369,ENSMUSG00000020368_3,.,-,ENSMUSG00000020368,3
1,chr11,50300939,50301040,ENSMUSG00000020368_2,.,-,ENSMUSG00000020368,2
2,chr11,50325622,50325723,ENSMUSG00000020368_1,.,-,ENSMUSG00000020368,1


In [None]:
# keeping entries from the old end df

# if we have strand information, enforce that
if 'Strand' in bed.columns:
    temp = df.join(bed,
                   strandedness='same',
                   suffix='_new',
                   slack=buffer,
                   how='left')
else:
    temp = df.join(bed,
                   strandedness=False,
                   suffix='_new', 
                   slack=buffer,
                   how='left')

# convert back to df
temp = temp.df
print(temp.head())

# if we have gene id information, also enforce that
if 'gene_id' in bed.columns:
    temp = temp.loc[(temp.gene_id == temp.gene_id_new)|(temp.gene_id_new == '-1')]

# reconcile sources
inds = temp.loc[temp.gene_id_new != '-1'].index
new_sources = temp.loc[inds, 'source']+','+temp.loc[inds, 'source_new']
temp.loc[inds, 'source'] = new_sources

# if we're adding in new ends from here
if add:
    

SyntaxError: unexpected EOF while parsing (<ipython-input-245-2533ef766177>, line 32)

In [3]:
# # first subset the human ccres for those that are promoters 
# # and are w/i genomic region of Canx
# #  https://api.wenglab.org/screen_v13/fdownloads/V3/GRCh38-cCREs.bed
# fname = '/Users/fairliereese/mortazavi_lab/ref/ccre_hg38/GRCh38-cCREs.bed'
# df = pr.read_bed(fname)
# df = df['chr11', 50293961:50325673]

# # ok so the toy gtf I have is definitely mouse oops
# # https://api.wenglab.org/screen_v13/fdownloads/V3/mm10-ccREs.bed
# fname = '/Users/fairliereese/mortazavi_lab/ref/ccre_vM3/mm10-ccREs.bed'
# df = pr.read_bed(fname)
# df = df['chr11', 50293961:50325673].df
# df = df.loc[df.Strand.str.contains('pELS')] 
# df.to_csv('tests/files/Canx_ccre_pels.bed', sep='\t', index=False, header=None)