In [12]:
import pyranges as pr
import pandas as pd
import numpy as np
from cerberus.cerberus import *

## end aggregation

In [34]:
# def agg_2_ends(bed1, bed2,
#                strand,
#                gid,
#                buffer,
#                add_ends,
#                mode):
#     """
#     Parameters:
#         bed1 (pyranges PyRanges): Bed PyRanges object for existing ends
#         bed2 (pyranges PyRanges): Bed PyRanges object for new ends
#         strand (bool): Whether bed2 has strand info
#         gid (bool): Whether bed2 has gene id info
#         buffer (int): Maximum allowable distance between ends in bed1 and bed2
#             to call them the same end
#         add_ends (bool): Whether to initialize new regions from bed2
#         mode (str): {'tss', 'tes'}
#     """

#     source1 = bed1.df.source.unique().tolist()[0]
#     source2 = bed2.df.source.unique().tolist()[0]

#     new_c = '{}_new'.format(mode)
#     max_c = '{}_max'.format(mode)

#     # convert into int64 
#     bed1 = pr.PyRanges(bed1.df, int64=True)
#     bed2 = pr.PyRanges(bed2.df, int64=True)
    
#     # depending on whether the new bed has strand information,
#     # construct the join call
#     if strand:
#         temp_joined = bed1.join(bed2,
#             strandedness='same',
#             suffix='_new',
#             slack=buffer,
#             how='left')
#     elif not strand:
#         bed2 = bed2.df
#         bed2.drop('Strand', axis=1, inplace=True)
#         bed2 = pr.PyRanges(bed2)
#         temp_joined = bed1.join(bed2,
#             strandedness=False,
#             suffix='_new',
#             slack=buffer,
#             how='left')

#     # format null starts as actual nans b/c of join
#     temp_joined = temp_joined.df
#     temp_joined.loc[temp_joined.Start_new == -1, 'Start_new'] = np.nan

#     # df to hold final end annotations
#     df = pd.DataFrame()

#     ### old ends ###

#     # situation 1: ends match across the datasets in coord and gene id
#     if gid:
#         temp = temp_joined.loc[temp_joined.gene_id == temp_joined.gene_id_new].copy(deep=True)
#     else:
#         temp = temp_joined.loc[~temp_joined.Start_new.isnull()].copy(deep=True)
#     temp.source = temp.source+','+temp.source_new
#     df = pd.concat([df, temp])

#     # situation 2: ends are only in the first dataset
#     if gid:
#         temp = temp_joined.loc[(temp_joined.Start_new.isnull())|(temp_joined.gene_id!=temp_joined.gene_id_new)].copy(deep=True)
#     else:
#         temp = temp_joined.loc[temp_joined.Start_new.isnull()].copy(deep=True)
#     df = pd.concat([df, temp])

#     # restrict to relevant columns
#     cols = ['Chromosome', 'Start', 'End', 'Strand',
#             'Name', 'gene_id', 'source', mode, 'id_new']
#     df = df[cols]
#     df.rename({'id_new': 'id'}, axis=1, inplace=True)

#     ### new ends, only add if we're allowing them to be independent
#     ### end support
#     if add_ends and strand and gid:

#         new_df = pd.DataFrame()

#         drop_cols = ['Start', 'End', 'Strand', 'gene_id', 'source', 'Name', mode]
#         m = {'Start_new': 'Start',
#              'End_new': 'End',
#              'gene_id_new': 'gene_id',
#              'Strand_new': 'Strand',
#              'source_new': 'source',
#              'Name_new': 'Name',
#              new_c: mode}

#         # situation 3: the ends overlapped, but the gene ids didn't match
#         temp = temp_joined.loc[(temp_joined.gene_id!=temp_joined.gene_id_new)&(temp_joined.gene_id_new!='-1')].copy(deep=True)
#         temp.drop(drop_cols, axis=1, inplace=True)
#         temp.rename(m, axis=1, inplace=True)
#         new_df = pd.concat([new_df, temp])

#         # situation 4: the ends are brand new and didn't overlap at all in the existing ends
#         bed2 = bed2.df
#         inds = list(set(bed2.id.tolist())-set(df.id.tolist()))
#         temp = bed2.loc[inds]
#         new_df = pd.concat([new_df, temp])

#         g_maxes = get_gene_feat_max(df, mode)
#         new_df = renumber_new_feats(new_df, g_maxes, mode)

#         # some df formatting
#         new_df.drop([mode, max_c], axis=1, inplace=True)
#         new_df.rename({new_c: mode}, axis=1, inplace=True)

#         # finally, concatenate new and old df
#         df = pd.concat([df, new_df])

#     # drop unnecessary columns and create new names
#     # and do some extra formatting
#     df.drop('id', axis=1, inplace=True)
#     df['Name'] = df.gene_id+'_'+df[mode].astype(str)
#     df[mode] = df[mode].astype(int)
#     df['Start'] = df.Start.astype(int)
#     keep_cols = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'gene_id', mode, 'source']
#     df = df[keep_cols]
#     df['id'] = [i for i in range(len(df.index))]

#     return df

In [35]:
def aggregate_ends(beds, sources, add_ends, buffer, mode):
    """
    Aggregate ends from more than one bed source.
    
    Parameters:
        beds (list of str): List of bed file names
        sources (list of str): List of source names for each bed
        add_ends (list of bool): List of booleans indicating whether
            to add novel ends for each bed file
        buffer (int): Allowable distance to an existing end for new
            ends to be called the same end
        mode (str): {'tss', 'tes'}
        
    Returns:
        df (pandas DataFrame): DataFrame of regions from 
            aggregated bed files
    """
    
    df = pd.DataFrame()
    i = 0
    for bed_fname, source, add in zip(beds, sources, add_ends):
        
        # read in bed file and do some formatting
        bed = read_bed(bed_fname, mode)
        bed = bed.df
        bed['source'] = source
        bed['id'] = [i for i in range(len(bed.index))]
        bed = pr.PyRanges(bed)
        
        # first bed; just accept all these ends
        if len(df.index) == 0:
            
            if not add:
                raise Exception('Must add ends from first bed file')
            
            if 'gene_id' not in bed.df.columns and 'Strand' not in bed.df.columns:
                raise Exception('First bed must contain Strand and gene_id columns')
                
            df = bed.df
        
        # more than one bed; merge and reconcile ends
        else:

            if 'gene_id' not in bed.df.columns or 'Strand' not in bed.df.columns:
                if add:
                    raise Exception('Cannot add new ends from {} because '+\
                                    'it does not contain gene_id information.')
            
            # add missing columns but keep track of what information we'll be 
            # able to merge on 
            bed, gid, strand = format_agg_2_ends_bed(bed, mode)
            df = pr.PyRanges(df)
            df = agg_2_ends(df, bed,
                            strand, gid,
                            buffer, add, mode)
            i += 1
     
    df.drop('id', axis=1, inplace=True)
    return df


In [41]:
def format_agg_2_ends_bed(bed, mode):
    """
    Format bed file for agg_2_ends
    
    Parameters:
        bed (pandas DataFrame): DataFrame of bed file
        mode (str): {'tss', 'tes'}
        
    Returns:
        bed (pandas DataFrame): DataFrame with added gene id
            and strand info
        gid (bool): Whether or not the bed file contained a gene id
        strand (bool): Whether or not the bed file contained a strand
    """
    cols = ['Name', 'gene_id', mode, 'Strand']
    
    gid = True
    strand = True
    for col in cols:
        if col not in bed.columns:
            bed[col] = [np.nan for i in range(len(bed.index))]
            
            # record some bools
            if col == 'gene_id':
                gid = False
            elif col == 'Strand':
                strand = False    
                
    return bed, gid, strand

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-41-bb5c361109af>, line 18)

In [42]:
d = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/'
v29_bed = '{}v29_tss.bed'.format(d)
v39_bed = '{}v39_tss.bed'.format(d)
talon_bed = '{}talon_tss.bed'.format(d)

# beds = [v39_bed, v29_bed, talon_bed] #problem w/ 2nd pair
# beds = [v29_bed, talon_bed, v39_bed] #problem on 1st pair
beds = [v29_bed, v39_bed, talon_bed] # problem on 1st pair
sources = ['v39', 'v29', 'talon']
add_ends = [True, True, True]

mode = 'tss'
buffer = 20

df = aggregate_ends(beds, sources, add_ends, buffer, mode)

# fname = 'tests/files/Canx_1_tss.bed'
# df = read_bed(fname, mode='tss')

# fname = 'tests/files/Canx_ccre_pels.bed'
# # df = read_bed(fname, mode='tss')

Chromosome    category
Start            int64
End              int64
Name            object
Score           object
Strand        category
gene_id         object
tss             object
source          object
id               int64
dtype: object
Chromosome    category
Start            int64
End              int64
Name            object
Score           object
Strand        category
gene_id         object
tss             object
source          object
id               int64
dtype: object

Chromosome    category
Start            int64
End              int64
Strand        category
Name            object
gene_id         object
tss              int64
source          object
id               int64
dtype: object
Chromosome    category
Start            int64
End              int64
Name            object
Score           object
Strand        category
gene_id         object
tss             object
source          object
id               int64
dtype: object



In [40]:
df.head()

Unnamed: 0,Chromosome,Start,End,Strand,Name,gene_id,tss,source
0,chr1,169794679,169794780,+,ENSG00000000460_3,ENSG00000000460,3,"v39,v29,talon"
1,chr1,169794992,169795129,+,ENSG00000000460_1,ENSG00000000460,1,"v39,v29,talon"
2,chr1,169795358,169795459,+,ENSG00000000460_2,ENSG00000000460,2,"v39,v29,talon"
3,chr1,196651994,196652106,+,ENSG00000000971_2,ENSG00000000971,2,"v39,v29,talon"
4,chr1,196676937,196677038,+,ENSG00000000971_3,ENSG00000000971,3,"v39,v29,talon"


In [None]:
# keeping entries from the old end df

# if we have strand information, enforce that
if 'Strand' in bed.columns:
    temp = df.join(bed,
                   strandedness='same',
                   suffix='_new',
                   slack=buffer,
                   how='left')
else:
    temp = df.join(bed,
                   strandedness=False,
                   suffix='_new', 
                   slack=buffer,
                   how='left')

# convert back to df
temp = temp.df
print(temp.head())

# if we have gene id information, also enforce that
if 'gene_id' in bed.columns:
    temp = temp.loc[(temp.gene_id == temp.gene_id_new)|(temp.gene_id_new == '-1')]

# reconcile sources
inds = temp.loc[temp.gene_id_new != '-1'].index
new_sources = temp.loc[inds, 'source']+','+temp.loc[inds, 'source_new']
temp.loc[inds, 'source'] = new_sources

# if we're adding in new ends from here
if add:
    

SyntaxError: unexpected EOF while parsing (<ipython-input-245-2533ef766177>, line 32)

In [3]:
# # first subset the human ccres for those that are promoters 
# # and are w/i genomic region of Canx
# #  https://api.wenglab.org/screen_v13/fdownloads/V3/GRCh38-cCREs.bed
# fname = '/Users/fairliereese/mortazavi_lab/ref/ccre_hg38/GRCh38-cCREs.bed'
# df = pr.read_bed(fname)
# df = df['chr11', 50293961:50325673]

# # ok so the toy gtf I have is definitely mouse oops
# # https://api.wenglab.org/screen_v13/fdownloads/V3/mm10-ccREs.bed
# fname = '/Users/fairliereese/mortazavi_lab/ref/ccre_vM3/mm10-ccREs.bed'
# df = pr.read_bed(fname)
# df = df['chr11', 50293961:50325673].df
# df = df.loc[df.Strand.str.contains('pELS')] 
# df.to_csv('tests/files/Canx_ccre_pels.bed', sep='\t', index=False, header=None)