In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import h5py 
import os

from cerberus.cerberus import *
from cerberus.main import *

## fix issue with agg_2_ends w/ duplicated regions

In [2]:
def make_end_df(c,s,st,e,n, source,mode):
    df = pd.DataFrame()
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'Name']
    var = [c,s,st,e,n]
    for col, var in zip(cols, var):
        if type(var) == list:
            df[col] = var

    # add source
    df['source'] = source

    df = format_end_df(df)

    # get end # and gene id
    if any(df.Name.isnull()):
        df['gene_id'] = np.nan
        df[mode] = np.nan
    else:
        df['gene_id'] = df.Name.str.split('_', expand=True)[0]
        df[mode] = df.Name.str.split('_', expand=True)[1]

    # get arbitrary unique ids
    df['id'] = [i for i in range(len(df.index))]

    return df#

def format_end_df(df):
    sort_cols = ['Chromosome', 'Start', 'End', 'Strand']
    df = df.sort_values(by=sort_cols)
    order = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'source']
    order = [o for o in order if o in df.columns]
    df = df[order]
    df.reset_index(drop=True, inplace=True)
    return df

def test_agg_2_ends_1(print_dfs=True):
    """
    Test agg_2_ends w/ and w/o end adding
    """

    def get_test(mode='tss'):
        # example has
        # - entries that overlap
        # - entries that don't overlap but are within a certain distance
        # - entries that are unique to either
        # - entries that overlap but aren't using the same gene_id / strand (these will be
        #      equivalent situtations b/c the gene ids for things on different strands will always
        #      differ
        # - entries in bed2 that are non continuous but overlap the same
        #      entry in bed1
        # - entries in bed1 that the same entry in bed2 maps to that are all the same gene
        #       bed1: gene3_1, gene3_2
        #       bed2: gene3_1
        # - entries in bed1 that the same entry in bed2 maps to that are NOT from the same gene;
        #    need to create new end
        #       bed1: gene3_1, gene3_2
        #       bed2: gene4_1
        # - entry in bed1 that multiple entries in bed2 overlap
        #      

        slack = 20

        n = 6
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [1, 200, 100, 300, 700, 760]
        e = [15, 250, 110, 340, 750, 770] 
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene3_1', 'gene3_2']
        source = 'v1'
        bed1 = make_end_df(c,s,st,e,n, source, mode)
        bed1 = pr.PyRanges(bed1)

        n = 7
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [5, 20, 120, 500, 200, 750, 750]
        e = [10, 25, 140, 550, 250, 775, 775]
        n = ['gene1_1', 'gene1_4', 'gene1_2', 'gene1_3', 'gene2_1', 'gene3_1', 'gene4_1']
        source = 'v2'
        bed2 = make_end_df(c,s,st,e,n, source, mode)
        bed2 = pr.PyRanges(bed2)

        return bed1, bed2

    def get_ctrl(add=True):

        mode = 'tss'

        n = 9
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [1,200,100,300,700,760, 200,500,750]
        e = [15,250,110,340,750,770, 250,550,775]
        n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene3_1', 'gene3_2', 'gene2_1', 'gene1_5', 'gene4_1']
        source = ['v1,v2','v1','v1,v2','v1','v1,v2','v1,v2', 'v2','v2','v2']
        df = make_end_df(c,s,st,e,n, source,mode)

        # convert a few dtypes
        df['Strand'] = df['Strand'].astype('category')
        df['Chromosome'] = df['Chromosome'].astype('category')

        # if we're not adding new ends
        if not add:
            df = df.loc[df.source != 'v2']

        # remove unnecessary columns
        df.drop(['gene_id', 'id', mode], axis=1, inplace=True)

        # fix ids
        df.reset_index(drop=True, inplace=True)

        return df

    def get_ctrl_source_map(add=True):
        # n = 9
        # c = ['1' for i in range(n)]
        # s = ['+' for i in range(n)]
        # st = [1, 200, 100, 300,  5, 20, 120, 500, 200]
        # e = [15, 250, 110, 340,  10, 25, 140, 550, 250]
        # n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4',
        #      'gene1_1', 'gene1_1', 'gene1_3', 'gene1_5', 'gene2_1']
        # source = ['v1' for i in range(4)]+['v2' for i in range(5)]
        # mode = 'tss'
        # df = make_end_df(c,s,st,e,n, source,mode)
        # df = df.loc[df.source == 'v2']
        # df.drop(['gene_id', mode, 'id'], axis=1, inplace=True)

        n = 7
        c = ['1' for i in range(n)]
        s = ['+' for i in range(n)]
        st = [5, 20, 120, 500, 200, 750, 750]
        e = [10, 25, 140, 550, 250, 775, 775]
        n = ['gene1_1', 'gene1_1', 'gene1_3', 'gene1_5', 'gene2_1', 'gene3_1', 'gene3_2']
        source = 'v2'
        mode = 'tss'
        df = make_end_df(c,s,st,e,n, source,mode)
        df = df.loc[df.source == 'v2']
        df.drop(['gene_id', mode, 'id'], axis=1, inplace=True)
        df['Strand'] = df['Strand'].astype('category')
        df['Chromosome'] = df['Chromosome'].astype('category')

        if not add:
            new_names = ['gene2_1', 'gene1_5']
            new_inds = df.loc[df.Name.isin(new_names)].index
            df.loc[new_inds, 'Name'] = np.nan

        return df

    tests = [True, False]
    for add_ends in tests:

        slack = 20
        mode = 'tss'
        sort_cols = ['Chromosome', 'Strand', 'gene_id', 'Start', 'End']
        order = ['Chromosome', 'Start', 'End', 'Strand', 'Name',
                 'gene_id', 'source', mode]
        bed1, bed2 = get_test()

        df, m_source = agg_2_ends(bed1, bed2,
                        strand=True,
                        gid=True,
                        slack=slack,
                        add_ends=add_ends,
                        mode=mode)

        test = format_end_df(df)
        test_m = format_end_df(m_source)
        ctrl = get_ctrl(add=add_ends)
        ctrl_m = get_ctrl_source_map(add=add_ends)

        if print_dfs:
            print('test')
            print(test)
            print(test.index)
            print(test.dtypes)
            print('ctrl')
            print(ctrl)
            print(ctrl.index)
            print(ctrl.dtypes)
            print('test source map')
            print(test_m)
            print(test_m.index)
            print(test_m.dtypes)
            print('ctrl source map')
            print(ctrl_m)
            print(ctrl_m.index)
            print(ctrl_m.dtypes)

        pd.testing.assert_frame_equal(ctrl, test, check_like=True)
        assert len(ctrl.index) == len(test.index)
    
        pd.testing.assert_frame_equal(ctrl_m, test_m, check_like=True)
        assert len(ctrl_m.index) == len(test_m.index)

# def test_agg_2_ends_2(print_dfs=True):
#     """
#     Test agg_2_ends w/ and w/o end adding
#     """

#     def get_test(mode='tss'):

#         # adding a bed file that doesn't have strand or gid info
#         # entries in bed2 that are duplicated based on
#         # - strandedness
#         # - gene id
#         # entries in bed2 where both of them overlap the same region
#         # in bed1

#         slack = 20

#         n = 6
#         c = ['1' for i in range(n)]
#         s = ['+' for i in range(n)]
#         s[-1] = '-'
#         st = [1, 200, 100, 300, 260, 260]
#         e = [15, 250, 110, 340, 290, 290]
#         n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene2_1', 'gene3_1']
#         source = 'v1'
#         bed1 = make_end_df(c,s,st,e,n, source, mode)
#         bed1 = pr.PyRanges(bed1)

#         n = 5
#         c = ['1' for i in range(n)]
#         s = [np.nan for i in range(n)]
#         st = [5, 11, 120, 500, 200]
#         e = [10, 21, 140, 550, 250]
#         n = [np.nan for i in range(n)]
#         source = 'v2'
#         bed2 = make_end_df(c,s,st,e,n, source, mode)
#         bed2 = pr.PyRanges(bed2)

#         return bed1, bed2

#     def get_ctrl(add=True):

#         mode = 'tss'

#         n = 6
#         c = ['1' for i in range(n)]
#         s = ['+' for i in range(n)]
#         s[-1] = '-'
#         st = [1,200,100,300, 260,260]
#         e = [15,250,110,340, 290,290]
#         n = ['gene1_1', 'gene1_2', 'gene1_3', 'gene1_4', 'gene2_1', 'gene3_1']
#         source = ['v1,v2','v1,v2','v1,v2','v1', 'v1,v2', 'v1,v2']
#         df = make_end_df(c,s,st,e,n, source,mode)

#         # convert a few dtypes
#         df['Strand'] = df['Strand'].astype('category')
#         df['Chromosome'] = df['Chromosome'].astype('category')

#         # if we're not adding new ends
#         if not add:
#             df = df.loc[df.source != 'v2']

#         # remove unnecessary columns
#         df.drop(['gene_id', 'id', mode], axis=1, inplace=True)

#         # fix ids
#         df.reset_index(drop=True, inplace=True)

#         return df

#     def get_ctrl_source_map(add=False):

#         n = 7
#         c = ['1' for i in range(n)]
#         s = ['+' for i in range(n)]
#         s[-2] = '-'
#         s[-1] = np.nan
#         st = [5, 11, 120, 200, 200, 200, 500]
#         e = [10, 21, 140, 250, 250, 250, 550]
#         n = ['gene1_1', 'gene1_1', 'gene1_3', 'gene1_2', 'gene2_1', 'gene3_1', np.nan]
#         source = 'v2'
#         mode = 'tss'
#         df = make_end_df(c,s,st,e,n, source,mode)
#         df = df.loc[df.source == 'v2']
#         df.drop(['gene_id', mode, 'id'], axis=1, inplace=True)
#         df['Strand'] = df['Strand'].astype('category')
#         df['Chromosome'] = df['Chromosome'].astype('category')

#         return df

#     strand = False
#     gid = False
#     add_ends = False

#     slack = 20
#     mode = 'tss'
#     sort_cols = ['Chromosome', 'Strand', 'gene_id', 'Start', 'End']
#     order = ['Chromosome', 'Start', 'End', 'Strand', 'Name',
#              'gene_id', 'source', mode]
#     bed1, bed2 = get_test()
#     df, m_source = agg_2_ends(bed1, bed2,
#                     strand=strand,
#                     gid=gid,
#                     slack=slack,
#                     add_ends=add_ends,
#                     mode=mode)
#     test = format_end_df(df)
#     test_m = format_end_df(m_source)
#     ctrl = get_ctrl(add=add_ends)
#     ctrl_m = get_ctrl_source_map()

#     if print_dfs:
#         print('test')
#         print(test)
#         print(test.index)
#         print(test.dtypes)
#         print('ctrl')
#         print(ctrl)
#         print(ctrl.index)
#         print(ctrl.dtypes)
#         print('test source map')
#         print(test_m)
#         print(test_m.index)
#         print(test_m.dtypes)
#         print('ctrl source map')
#         print(ctrl_m)
#         print(ctrl_m.index)
#         print(ctrl_m.dtypes)

#     pd.testing.assert_frame_equal(ctrl, test, check_like=True)
#     assert len(ctrl.index) == len(test.index)

#     pd.testing.assert_frame_equal(ctrl_m, test_m, check_like=True)
#     assert len(ctrl_m.index) == len(test_m.index)

In [None]:
test_agg_2_ends_1(print_dfs=True)

> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(428)[0;36magg_2_ends[0;34m()[0m
[0;32m    426 [0;31m[0;34m[0m[0m
[0m[0;32m    427 [0;31m    [0;31m# situation 1: ends match across the datasets in coord and gene id[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 428 [0;31m    [0;32mif[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    429 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mtemp_joined[0m[0;34m.[0m[0mgene_id[0m [0;34m==[0m [0mtemp_joined[0m[0;34m.[0m[0mgene_id_new[0m[0;34m][0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mdeep[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    430 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  temp_joined.head()


  Chromosome  Start  End Strand     Name source gene_id tss  id  Start_new  \
0          1      1   15      +  gene1_1     v1   gene1   1   0        5.0   
1          1      1   15      +  gene1_1     v1   gene1   1   0       20.0   
2          1    100  110      +  gene1_3     v1   gene1   3   1      120.0   
3          1    200  250      +  gene1_2     v1   gene1   2   2      200.0   
8          1    300  340      +  gene1_4     v1   gene1   4   3        NaN   

   End_new Strand_new Name_new source_new gene_id_new tss_new  id_new  
0       10          +  gene1_1         v2       gene1       1       0  
1       25          +  gene1_4         v2       gene1       4       1  
2      140          +  gene1_2         v2       gene1       2       2  
3      250          +  gene2_1         v2       gene2       1       3  
8       -1          +       -1         -1          -1      -1      -1  


ipdb>  temp_joined


  Chromosome  Start  End Strand     Name source gene_id tss  id  Start_new  \
0          1      1   15      +  gene1_1     v1   gene1   1   0        5.0   
1          1      1   15      +  gene1_1     v1   gene1   1   0       20.0   
2          1    100  110      +  gene1_3     v1   gene1   3   1      120.0   
3          1    200  250      +  gene1_2     v1   gene1   2   2      200.0   
8          1    300  340      +  gene1_4     v1   gene1   4   3        NaN   
4          1    700  750      +  gene3_1     v1   gene3   1   4      750.0   
5          1    700  750      +  gene3_1     v1   gene3   1   4      750.0   
6          1    760  770      +  gene3_2     v1   gene3   2   5      750.0   
7          1    760  770      +  gene3_2     v1   gene3   2   5      750.0   

   End_new Strand_new Name_new source_new gene_id_new tss_new  id_new  
0       10          +  gene1_1         v2       gene1       1       0  
1       25          +  gene1_4         v2       gene1       4       1  
2  

ipdb>  temp_joined[['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']]


   Start  End gene_id  Start_new  End_new gene_id_new
0      1   15   gene1        5.0       10       gene1
1      1   15   gene1       20.0       25       gene1
2    100  110   gene1      120.0      140       gene1
3    200  250   gene1      200.0      250       gene2
8    300  340   gene1        NaN       -1          -1
4    700  750   gene3      750.0      775       gene3
5    700  750   gene3      750.0      775       gene4
6    760  770   gene3      750.0      775       gene3
7    760  770   gene3      750.0      775       gene4


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(429)[0;36magg_2_ends[0;34m()[0m
[0;32m    427 [0;31m    [0;31m# situation 1: ends match across the datasets in coord and gene id[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    428 [0;31m    [0;32mif[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 429 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mtemp_joined[0m[0;34m.[0m[0mgene_id[0m [0;34m==[0m [0mtemp_joined[0m[0;34m.[0m[0mgene_id_new[0m[0;34m][0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mdeep[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    430 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    431 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0;34m~[0m[0mtemp_joined[0m[0;34m.[0m[0mStart_new[0m[0;34m.[0m[0mis

ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(432)[0;36magg_2_ends[0;34m()[0m
[0;32m    430 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    431 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0;34m~[0m[0mtemp_joined[0m[0;34m.[0m[0mStart_new[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mdeep[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 432 [0;31m    [0mtemp[0m[0;34m.[0m[0msource[0m [0;34m=[0m [0mtemp[0m[0;34m.[0m[0msource[0m[0;34m+[0m[0;34m','[0m[0;34m+[0m[0mtemp[0m[0;34m.[0m[0msource_new[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    433 [0;31m    [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m

ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(433)[0;36magg_2_ends[0;34m()[0m
[0;32m    431 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0;34m~[0m[0mtemp_joined[0m[0;34m.[0m[0mStart_new[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mdeep[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    432 [0;31m    [0mtemp[0m[0;34m.[0m[0msource[0m [0;34m=[0m [0mtemp[0m[0;34m.[0m[0msource[0m[0;34m+[0m[0;34m','[0m[0;34m+[0m[0mtemp[0m[0;34m.[0m[0msource_new[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 433 [0;31m    [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    434 [0;31m[0;34m[0m[0m
[0m[0;32m    435 [0;31m    [0;31m# create s

ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(436)[0;36magg_2_ends[0;34m()[0m
[0;32m    434 [0;31m[0;34m[0m[0m
[0m[0;32m    435 [0;31m    [0;31m# create source map for ends from this source[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 436 [0;31m    [0mm_source[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mdf[0m[0;34m.[0m[0msource_new[0m [0;34m==[0m [0msource2[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    437 [0;31m    m_source = m_source[['Chromosome', 'Start_new', 'End_new', 'Strand',
[0m[0;32m    438 [0;31m                   'source_new', 'Name']].copy(deep=True)
[0m


ipdb>  df[['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']]


   Start  End gene_id  Start_new  End_new gene_id_new
0      1   15   gene1        5.0       10       gene1
1      1   15   gene1       20.0       25       gene1
2    100  110   gene1      120.0      140       gene1
4    700  750   gene3      750.0      775       gene3
6    760  770   gene3      750.0      775       gene3


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(437)[0;36magg_2_ends[0;34m()[0m
[0;32m    435 [0;31m    [0;31m# create source map for ends from this source[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    436 [0;31m    [0mm_source[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mdf[0m[0;34m.[0m[0msource_new[0m [0;34m==[0m [0msource2[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 437 [0;31m    m_source = m_source[['Chromosome', 'Start_new', 'End_new', 'Strand',
[0m[0;32m    438 [0;31m                   'source_new', 'Name']].copy(deep=True)
[0m[0;32m    439 [0;31m    m_source.rename({'Start_new': 'Start',
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(438)[0;36magg_2_ends[0;34m()[0m
[0;32m    436 [0;31m    [0mm_source[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mdf[0m[0;34m.[0m[0msource_new[0m [0;34m==[0m [0msource2[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    437 [0;31m    m_source = m_source[['Chromosome', 'Start_new', 'End_new', 'Strand',
[0m[0;32m--> 438 [0;31m                   'source_new', 'Name']].copy(deep=True)
[0m[0;32m    439 [0;31m    m_source.rename({'Start_new': 'Start',
[0m[0;32m    440 [0;31m                     [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(439)[0;36magg_2_ends[0;34m()[0m
[0;32m    437 [0;31m    m_source = m_source[['Chromosome', 'Start_new', 'End_new', 'Strand',
[0m[0;32m    438 [0;31m                   'source_new', 'Name']].copy(deep=True)
[0m[0;32m--> 439 [0;31m    m_source.rename({'Start_new': 'Start',
[0m[0;32m    440 [0;31m                     [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    441 [0;31m                     'source_new': 'source'},
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(440)[0;36magg_2_ends[0;34m()[0m
[0;32m    438 [0;31m                   'source_new', 'Name']].copy(deep=True)
[0m[0;32m    439 [0;31m    m_source.rename({'Start_new': 'Start',
[0m[0;32m--> 440 [0;31m                     [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    441 [0;31m                     'source_new': 'source'},
[0m[0;32m    442 [0;31m                     axis=1, inplace=True)
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(441)[0;36magg_2_ends[0;34m()[0m
[0;32m    439 [0;31m    m_source.rename({'Start_new': 'Start',
[0m[0;32m    440 [0;31m                     [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 441 [0;31m                     'source_new': 'source'},
[0m[0;32m    442 [0;31m                     axis=1, inplace=True)
[0m[0;32m    443 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(442)[0;36magg_2_ends[0;34m()[0m
[0;32m    440 [0;31m                     [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    441 [0;31m                     'source_new': 'source'},
[0m[0;32m--> 442 [0;31m                     axis=1, inplace=True)
[0m[0;32m    443 [0;31m[0;34m[0m[0m
[0m[0;32m    444 [0;31m    [0;31m# situation 2: ends are only in the first dataset[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(445)[0;36magg_2_ends[0;34m()[0m
[0;32m    443 [0;31m[0;34m[0m[0m
[0m[0;32m    444 [0;31m    [0;31m# situation 2: ends are only in the first dataset[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 445 [0;31m    [0;32mif[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    446 [0;31m        [0;31m# end either didn't match something or matched wrong gid[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    447 [0;31m        temp = temp_joined.loc[(temp_joined.Start_new.isnull())|\
[0m


ipdb>  m_source


  Chromosome  Start  End Strand source     Name
0          1    5.0   10      +     v2  gene1_1
1          1   20.0   25      +     v2  gene1_1
2          1  120.0  140      +     v2  gene1_3
4          1  750.0  775      +     v2  gene3_1
6          1  750.0  775      +     v2  gene3_2


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(447)[0;36magg_2_ends[0;34m()[0m
[0;32m    445 [0;31m    [0;32mif[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    446 [0;31m        [0;31m# end either didn't match something or matched wrong gid[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 447 [0;31m        temp = temp_joined.loc[(temp_joined.Start_new.isnull())|\
[0m[0;32m    448 [0;31m                               (temp_joined.gene_id!=temp_joined.gene_id_new)].copy(deep=True)
[0m[0;32m    449 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(448)[0;36magg_2_ends[0;34m()[0m
[0;32m    446 [0;31m        [0;31m# end either didn't match something or matched wrong gid[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    447 [0;31m        temp = temp_joined.loc[(temp_joined.Start_new.isnull())|\
[0m[0;32m--> 448 [0;31m                               (temp_joined.gene_id!=temp_joined.gene_id_new)].copy(deep=True)
[0m[0;32m    449 [0;31m[0;34m[0m[0m
[0m[0;32m    450 [0;31m        [0;31m# restrict to ends that haven't been added to our main df[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(451)[0;36magg_2_ends[0;34m()[0m
[0;32m    449 [0;31m[0;34m[0m[0m
[0m[0;32m    450 [0;31m        [0;31m# restrict to ends that haven't been added to our main df[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 451 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0;34m~[0m[0mtemp[0m[0;34m.[0m[0mName[0m[0;34m.[0m[0misin[0m[0;34m([0m[0mdf[0m[0;34m.[0m[0mName[0m[0;34m.[0m[0mtolist[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    452 [0;31m[0;34m[0m[0m
[0m[0;32m    453 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(455)[0;36magg_2_ends[0;34m()[0m
[0;32m    453 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    454 [0;31m        [0mtemp[0m [0;34m=[0m [0mtemp_joined[0m[0;34m.[0m[0mloc[0m[0;34m[[0m[0mtemp_joined[0m[0;34m.[0m[0mStart_new[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mdeep[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 455 [0;31m    [0mdf[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mdf[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    456 [0;31m[0;34m[0m[0m
[0m[0;32m    457 [0;31m    [0;31m# pdb.set_trace()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  temp[['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']]


   Start  End gene_id  Start_new  End_new gene_id_new
3    200  250   gene1      200.0      250       gene2
8    300  340   gene1        NaN       -1          -1


ipdb>  df['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']


*** KeyError: ('Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new')


ipdb>  df[['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']]


   Start  End gene_id  Start_new  End_new gene_id_new
0      1   15   gene1        5.0       10       gene1
1      1   15   gene1       20.0       25       gene1
2    100  110   gene1      120.0      140       gene1
4    700  750   gene3      750.0      775       gene3
6    760  770   gene3      750.0      775       gene3


ipdb>  temp[['Start', 'End', 'gene_id', 'Start_new', 'End_new', 'gene_id_new']]


   Start  End gene_id  Start_new  End_new gene_id_new
3    200  250   gene1      200.0      250       gene2
8    300  340   gene1        NaN       -1          -1


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(461)[0;36magg_2_ends[0;34m()[0m
[0;32m    459 [0;31m[0;34m[0m[0m
[0m[0;32m    460 [0;31m    [0;31m# restrict to relevant columns[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 461 [0;31m    cols = ['Chromosome', 'Start', 'End', 'Strand',
[0m[0;32m    462 [0;31m            'Name', 'gene_id', 'source', mode, 'id_new']
[0m[0;32m    463 [0;31m    [0mdf[0m [0;34m=[0m [0mdf[0m[0;34m[[0m[0mcols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(462)[0;36magg_2_ends[0;34m()[0m
[0;32m    460 [0;31m    [0;31m# restrict to relevant columns[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    461 [0;31m    cols = ['Chromosome', 'Start', 'End', 'Strand',
[0m[0;32m--> 462 [0;31m            'Name', 'gene_id', 'source', mode, 'id_new']
[0m[0;32m    463 [0;31m    [0mdf[0m [0;34m=[0m [0mdf[0m[0;34m[[0m[0mcols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    464 [0;31m    [0mdf[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0;34m{[0m[0;34m'id_new'[0m[0;34m:[0m [0;34m'id'[0m[0;34m}[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(463)[0;36magg_2_ends[0;34m()[0m
[0;32m    461 [0;31m    cols = ['Chromosome', 'Start', 'End', 'Strand',
[0m[0;32m    462 [0;31m            'Name', 'gene_id', 'source', mode, 'id_new']
[0m[0;32m--> 463 [0;31m    [0mdf[0m [0;34m=[0m [0mdf[0m[0;34m[[0m[0mcols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    464 [0;31m    [0mdf[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0;34m{[0m[0;34m'id_new'[0m[0;34m:[0m [0;34m'id'[0m[0;34m}[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    465 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(464)[0;36magg_2_ends[0;34m()[0m
[0;32m    462 [0;31m            'Name', 'gene_id', 'source', mode, 'id_new']
[0m[0;32m    463 [0;31m    [0mdf[0m [0;34m=[0m [0mdf[0m[0;34m[[0m[0mcols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 464 [0;31m    [0mdf[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0;34m{[0m[0;34m'id_new'[0m[0;34m:[0m [0;34m'id'[0m[0;34m}[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    465 [0;31m[0;34m[0m[0m
[0m[0;32m    466 [0;31m    [0;31m### new ends, only add if we're allowing them to be independent[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(468)[0;36magg_2_ends[0;34m()[0m
[0;32m    466 [0;31m    [0;31m### new ends, only add if we're allowing them to be independent[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    467 [0;31m    [0;31m### end support[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 468 [0;31m    [0;32mif[0m [0madd_ends[0m [0;32mand[0m [0mstrand[0m [0;32mand[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    469 [0;31m[0;34m[0m[0m
[0m[0;32m    470 [0;31m        [0mnew_df[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(470)[0;36magg_2_ends[0;34m()[0m
[0;32m    468 [0;31m    [0;32mif[0m [0madd_ends[0m [0;32mand[0m [0mstrand[0m [0;32mand[0m [0mgid[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    469 [0;31m[0;34m[0m[0m
[0m[0;32m--> 470 [0;31m        [0mnew_df[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    471 [0;31m[0;34m[0m[0m
[0m[0;32m    472 [0;31m        [0mdrop_cols[0m [0;34m=[0m [0;34m[[0m[0;34m'Start'[0m[0;34m,[0m [0;34m'End'[0m[0;34m,[0m [0;34m'Strand'[0m[0;34m,[0m [0;34m'gene_id'[0m[0;34m,[0m [0;34m'source'[0m[0;34m,[0m [0;34m'Name'[0m[0;34m,[0m [0mmode[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(472)[0;36magg_2_ends[0;34m()[0m
[0;32m    470 [0;31m        [0mnew_df[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    471 [0;31m[0;34m[0m[0m
[0m[0;32m--> 472 [0;31m        [0mdrop_cols[0m [0;34m=[0m [0;34m[[0m[0;34m'Start'[0m[0;34m,[0m [0;34m'End'[0m[0;34m,[0m [0;34m'Strand'[0m[0;34m,[0m [0;34m'gene_id'[0m[0;34m,[0m [0;34m'source'[0m[0;34m,[0m [0;34m'Name'[0m[0;34m,[0m [0mmode[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    473 [0;31m        m = {'Start_new': 'Start',
[0m[0;32m    474 [0;31m             [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(473)[0;36magg_2_ends[0;34m()[0m
[0;32m    471 [0;31m[0;34m[0m[0m
[0m[0;32m    472 [0;31m        [0mdrop_cols[0m [0;34m=[0m [0;34m[[0m[0;34m'Start'[0m[0;34m,[0m [0;34m'End'[0m[0;34m,[0m [0;34m'Strand'[0m[0;34m,[0m [0;34m'gene_id'[0m[0;34m,[0m [0;34m'source'[0m[0;34m,[0m [0;34m'Name'[0m[0;34m,[0m [0mmode[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 473 [0;31m        m = {'Start_new': 'Start',
[0m[0;32m    474 [0;31m             [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    475 [0;31m             [0;34m'gene_id_new'[0m[0;34m:[0m [0;34m'gene_id'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(474)[0;36magg_2_ends[0;34m()[0m
[0;32m    472 [0;31m        [0mdrop_cols[0m [0;34m=[0m [0;34m[[0m[0;34m'Start'[0m[0;34m,[0m [0;34m'End'[0m[0;34m,[0m [0;34m'Strand'[0m[0;34m,[0m [0;34m'gene_id'[0m[0;34m,[0m [0;34m'source'[0m[0;34m,[0m [0;34m'Name'[0m[0;34m,[0m [0mmode[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    473 [0;31m        m = {'Start_new': 'Start',
[0m[0;32m--> 474 [0;31m             [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    475 [0;31m             [0;34m'gene_id_new'[0m[0;34m:[0m [0;34m'gene_id'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    476 [0;31m             [0;34m'Strand_new'[0m[0;34m:[0m [0;34m'Strand'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(475)[0;36magg_2_ends[0;34m()[0m
[0;32m    473 [0;31m        m = {'Start_new': 'Start',
[0m[0;32m    474 [0;31m             [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 475 [0;31m             [0;34m'gene_id_new'[0m[0;34m:[0m [0;34m'gene_id'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    476 [0;31m             [0;34m'Strand_new'[0m[0;34m:[0m [0;34m'Strand'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    477 [0;31m             [0;34m'source_new'[0m[0;34m:[0m [0;34m'source'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(476)[0;36magg_2_ends[0;34m()[0m
[0;32m    474 [0;31m             [0;34m'End_new'[0m[0;34m:[0m [0;34m'End'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    475 [0;31m             [0;34m'gene_id_new'[0m[0;34m:[0m [0;34m'gene_id'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 476 [0;31m             [0;34m'Strand_new'[0m[0;34m:[0m [0;34m'Strand'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    477 [0;31m             [0;34m'source_new'[0m[0;34m:[0m [0;34m'source'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    478 [0;31m             [0;34m'Name_new'[0m[0;34m:[0m [0;34m'Name'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(477)[0;36magg_2_ends[0;34m()[0m
[0;32m    475 [0;31m             [0;34m'gene_id_new'[0m[0;34m:[0m [0;34m'gene_id'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    476 [0;31m             [0;34m'Strand_new'[0m[0;34m:[0m [0;34m'Strand'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 477 [0;31m             [0;34m'source_new'[0m[0;34m:[0m [0;34m'source'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    478 [0;31m             [0;34m'Name_new'[0m[0;34m:[0m [0;34m'Name'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    479 [0;31m             new_c: mode}
[0m


ipdb>  


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(478)[0;36magg_2_ends[0;34m()[0m
[0;32m    476 [0;31m             [0;34m'Strand_new'[0m[0;34m:[0m [0;34m'Strand'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    477 [0;31m             [0;34m'source_new'[0m[0;34m:[0m [0;34m'source'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 478 [0;31m             [0;34m'Name_new'[0m[0;34m:[0m [0;34m'Name'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    479 [0;31m             new_c: mode}
[0m[0;32m    480 [0;31m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(479)[0;36magg_2_ends[0;34m()[0m
[0;32m    477 [0;31m             [0;34m'source_new'[0m[0;34m:[0m [0;34m'source'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    478 [0;31m             [0;34m'Name_new'[0m[0;34m:[0m [0;34m'Name'[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 479 [0;31m             new_c: mode}
[0m[0;32m    480 [0;31m[0;34m[0m[0m
[0m[0;32m    481 [0;31m        [0;31m# situation 3: the ends overlapped, but the gene ids didn't match[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(482)[0;36magg_2_ends[0;34m()[0m
[0;32m    480 [0;31m[0;34m[0m[0m
[0m[0;32m    481 [0;31m        [0;31m# situation 3: the ends overlapped, but the gene ids didn't match[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 482 [0;31m        temp = temp_joined.loc[(temp_joined.gene_id!=temp_joined.gene_id_new)&\
[0m[0;32m    483 [0;31m                               (temp_joined.gene_id_new!='-1')].copy(deep=True)
[0m[0;32m    484 [0;31m        [0;31m# pdb.set_trace()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(483)[0;36magg_2_ends[0;34m()[0m
[0;32m    481 [0;31m        [0;31m# situation 3: the ends overlapped, but the gene ids didn't match[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    482 [0;31m        temp = temp_joined.loc[(temp_joined.gene_id!=temp_joined.gene_id_new)&\
[0m[0;32m--> 483 [0;31m                               (temp_joined.gene_id_new!='-1')].copy(deep=True)
[0m[0;32m    484 [0;31m        [0;31m# pdb.set_trace()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    485 [0;31m        [0mtemp[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mdrop_cols[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(485)[0;36magg_2_ends[0;34m()[0m
[0;32m    483 [0;31m                               (temp_joined.gene_id_new!='-1')].copy(deep=True)
[0m[0;32m    484 [0;31m        [0;31m# pdb.set_trace()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 485 [0;31m        [0mtemp[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mdrop_cols[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    486 [0;31m        [0mtemp[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0mm[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    487 [0;31m        [0mnew_df[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mnew_df[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m)

ipdb>  n


> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/cerberus/cerberus/cerberus.py[0m(486)[0;36magg_2_ends[0;34m()[0m
[0;32m    484 [0;31m        [0;31m# pdb.set_trace()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    485 [0;31m        [0mtemp[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mdrop_cols[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 486 [0;31m        [0mtemp[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0mm[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    487 [0;31m        [0mnew_df[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mnew_df[0m[0;34m,[0m [0mtemp[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    488 [0;31m[0;34m[0m[0m
[0m


ipdb>  nn


*** NameError: name 'nn' is not defined


## write h5 ref from input beds and ics tsv

In [70]:
ic = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/temp/talon_ic.tsv'
tes = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes.bed'
tss = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss.bed'
tss_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tes_source_map.bed'
tes_map = '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/lr_bulk/cerberus/test_tss_source_map.bed'

# df = read_ic_ref(ic)
# df.head()

write_reference(tss, tes, ic, 'test.h5')

# df = read_cerberus_ends(tss, mode='tss')
# df

df = read_cerberus_source_map(tss_map)
df.head()

Unnamed: 0,Chromosome,Start,End,Strand,source,Name
0,chr1,169804296,169804436,+,v40,ENSG00000000460_2
1,chr1,169807786,169807887,+,v40,ENSG00000000460_4
2,chr1,169821668,169821769,+,v40,ENSG00000000460_5
3,chr1,169852986,169853135,+,v40,ENSG00000000460_3
4,chr1,169854029,169854130,+,v40,ENSG00000000460_1


In [32]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
n = 5
test_1 = [np.nan for i in range(n)]
test_1[-1] = 'b'
df['test_1'] = test_1
df['test_1'] = df.test_1.astype('category')
df['test_2'] = ['a' for i in range(n)]

o = 'test.h5'
df.to_hdf(o, 'test', mode='w', format='table')

df = pd.read_hdf(o, key='test')

In [33]:
df.head()

Unnamed: 0,test_1,test_2
0,,a
1,,a
2,,a
3,,a
4,b,a
