In [1]:
import cooler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

# get annotated bins
- new liftover results for gene TSS as well as enhancer liftover

### lifted over bed

In [2]:
tss = pd.read_csv('tss_lifted.bed', sep='\t', header=None)
enhs = pd.read_csv('enhancers_lifted.bed', sep='\t', header=None)

### load cooler

In [3]:
hic_path = '/cndd2/Public_Datasets/m3c_Lee_etal_2019/'

In [4]:
mcool_files = !ls {hic_path}

In [5]:
mcool_files

['L2_3_all_brain.txt_1kb_contacts.mcool',
 'L4_all_brain.txt_1kb_contacts.mcool',
 'L5_all_brain.txt_1kb_contacts.mcool',
 'L6_all_brain.txt_1kb_contacts.mcool',
 'Pvalb_all_brain.txt_1kb_contacts.mcool',
 'Sst_all_brain.txt_1kb_contacts.mcool',
 'Vip_all_brain.txt_1kb_contacts.mcool']

In [6]:
l23_file = hic_path+mcool_files[0]

In [7]:
l23_file

'/cndd2/Public_Datasets/m3c_Lee_etal_2019/L2_3_all_brain.txt_1kb_contacts.mcool'

In [8]:
l23_file = hic_path+mcool_files[0]
f = h5py.File(l23_file)

### Use lowest resolution as coolers can easily aggregate to coarser bins

In [9]:
cool = cooler.Cooler(f['resolutions']['1000'])

In [10]:
bins = cool.bins()[:]

In [11]:
bins.head()

Unnamed: 0,chrom,start,end,weight
0,chr1,0,1000,
1,chr1,1000,2000,
2,chr1,2000,3000,
3,chr1,3000,4000,
4,chr1,4000,5000,


In [12]:
bins.shape

(3095706, 4)

In [13]:
bins = bins.drop('weight', axis=1)
bins['idx'] = bins.index

In [79]:
f.close()

# Getting intersections of bins and liftover using bedtools
### run intermeidate steps in a temp directory to conserve space and increase speed

In [14]:
tmp_dir = '/scratch/ethan/hic/'

In [15]:
bins.to_csv(tmp_dir+'hi_c_bins.bed', sep='\t', header=None, index=None)

In [16]:
tss_bed = 'tss_lifted.bed'
enhancer_bed = 'enhancers_lifted.bed'

In [17]:
# get intersect with enhancers

In [18]:
!bedtools intersect -wb -a {enhancer_bed} -b {tmp_dir+'hi_c_bins.bed'} > {tmp_dir+'enhancer_bins.bed'}

In [19]:
# get intersect with tss

In [20]:
!bedtools intersect -wb -a {tss_bed} -b {tmp_dir+'hi_c_bins.bed'} > {tmp_dir+'tss_bins.bed'}

# annotate bins with overlaps

In [21]:
enh_over = pd.read_csv(tmp_dir+'enhancer_bins.bed', header=None, sep='\t')

In [22]:
tss_over = pd.read_csv(tmp_dir+'tss_bins.bed', header=None, sep='\t')

In [23]:
columns = ['chr_overlap', 'start_overlap', 'end_overlap', 'enh_idx', 'chr_b', 'start_bin', 'end_bin', 'bin_idx']

In [24]:
enh_over.columns = columns

In [25]:
enh_over.head()

Unnamed: 0,chr_overlap,start_overlap,end_overlap,enh_idx,chr_b,start_bin,end_bin,bin_idx
0,chr8,55592914,55593000,5,chr8,55592000,55593000,1448392
1,chr8,55593000,55593008,5,chr8,55593000,55594000,1448393
2,chr8,55587811,55587820,7,chr8,55587000,55588000,1448387
3,chr8,55528337,55528612,10,chr8,55528000,55529000,1448328
4,chr8,55482940,55483000,17,chr8,55482000,55483000,1448282


In [26]:
columns = ['chr_overlap', 'start_overlap', 'end_overlap', 'mm_gene_id', 'chr_b', 'start_bin', 'end_bin', 'bin_idx']

In [27]:
tss_over.columns = columns

In [28]:
tss_over.head()

Unnamed: 0,chr_overlap,start_overlap,end_overlap,mm_gene_id,chr_b,start_bin,end_bin,bin_idx
0,chr8,55102323,55102324,ENSMUSG00000051951,chr8,55102000,55103000,1447902
1,chr4,83273518,83273519,ENSMUSG00000102851,chr4,83273000,83274000,773747
2,chr8,55409874,55409875,ENSMUSG00000103377,chr8,55409000,55410000,1448209
3,chr8,55288183,55288184,ENSMUSG00000089699,chr8,55288000,55289000,1448088
4,chr8,55230212,55230213,ENSMUSG00000103201,chr8,55230000,55231000,1448030


In [29]:
bins.head()

Unnamed: 0,chrom,start,end,idx
0,chr1,0,1000,0
1,chr1,1000,2000,1
2,chr1,2000,3000,2
3,chr1,3000,4000,3
4,chr1,4000,5000,4


In [30]:
tss_over = tss_over.set_index("bin_idx")

In [31]:
enh_over = enh_over.set_index("bin_idx")

In [32]:
tss_over.head()

Unnamed: 0_level_0,chr_overlap,start_overlap,end_overlap,mm_gene_id,chr_b,start_bin,end_bin
bin_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1447902,chr8,55102323,55102324,ENSMUSG00000051951,chr8,55102000,55103000
773747,chr4,83273518,83273519,ENSMUSG00000102851,chr4,83273000,83274000
1448209,chr8,55409874,55409875,ENSMUSG00000103377,chr8,55409000,55410000
1448088,chr8,55288183,55288184,ENSMUSG00000089699,chr8,55288000,55289000
1448030,chr8,55230212,55230213,ENSMUSG00000103201,chr8,55230000,55231000


In [33]:
tss_over.loc[tss_over.index.duplicated(keep=False)].shape

(12363, 7)

In [34]:
dupe = tss_over.loc[tss_over.index.duplicated(keep=False)]

In [35]:
dupe.head()

Unnamed: 0_level_0,chr_overlap,start_overlap,end_overlap,mm_gene_id,chr_b,start_bin,end_bin
bin_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019667,chr12,68746378,68746379,ENSMUSG00000103147,chr12,68746000,68747000
1447359,chr8,54559208,54559209,ENSMUSG00000025900,chr8,54559000,54560000
1447359,chr8,54559302,54559303,ENSMUSG00000109048,chr8,54559000,54560000
112674,chr1,112674914,112674915,ENSMUSG00000102269,chr1,112674000,112675000
2083746,chr12,132825706,132825707,ENSMUSG00000103003,chr12,132825000,132826000


In [36]:
dupe.shape

(12363, 7)

In [37]:
dupe.index.unique().shape

(3559,)

In [38]:
bin_names_tss = dupe.index.unique()
tss_names = []
for i in dupe.index.unique():
    tss_names.append(dupe.loc[i, 'mm_gene_id'].values)

In [39]:
len(tss_names)

3559

In [40]:
dupe = enh_over.loc[enh_over.index.duplicated(keep=False)]
dupe.head()

Unnamed: 0_level_0,chr_overlap,start_overlap,end_overlap,enh_idx,chr_b,start_bin,end_bin
bin_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1448217,chr8,55417801,55418000,28,chr8,55417000,55418000
1448217,chr8,55417241,55417460,29,chr8,55417000,55418000
1445396,chr8,52596329,52596587,164,chr8,52596000,52597000
1445396,chr8,52596000,52596151,165,chr8,52596000,52597000
1444316,chr8,51516569,51516902,205,chr8,51516000,51517000


In [41]:
dupe.shape

(36652, 7)

In [42]:
bin_names_enh = dupe.index.unique()
enh_names = []
for i in dupe.index.unique():
    enh_names.append(dupe.loc[i, 'enh_idx'].values)

In [43]:
len(enh_names)

17562

## fix duplicates

In [44]:
annotate_enh = enh_over.enh_idx

In [45]:
annotate_tss = tss_over.mm_gene_id

In [46]:
annotate_enh = annotate_enh.drop(bin_names_enh)

In [47]:
annotate_tss = annotate_tss.drop(bin_names_tss)

In [48]:
annotate_enh.shape, annotate_tss.shape

((137283,), (21260,))

In [49]:
multi_enh = pd.Series(index=bin_names_enh, data=enh_names)

In [50]:
multi_enh

bin_idx
1448217            [28, 29]
1445396          [164, 165]
1444316          [205, 206]
1442959          [243, 244]
1442715          [267, 268]
                 ...       
2893165    [233423, 233424]
2893060    [233432, 233433]
2892208    [233479, 233480]
2892143    [233487, 233488]
2892142    [233488, 233489]
Length: 17562, dtype: object

In [51]:
multi_gene = pd.Series(index=bin_names_tss, data=tss_names)

In [52]:
multi_gene

bin_idx
2019667             [ENSMUSG00000103147, ENSMUSG00000106067]
1447359             [ENSMUSG00000025900, ENSMUSG00000109048]
112674              [ENSMUSG00000102269, ENSMUSG00000002227]
2083746    [ENSMUSG00000103003, ENSMUSG00000103379, ENSMU...
220253     [ENSMUSG00000102735, ENSMUSG00000104171, ENSMU...
                                 ...                        
2891539             [ENSMUSG00000095134, ENSMUSG00000099871]
3095689             [ENSMUSG00000064336, ENSMUSG00000064337]
3095690             [ENSMUSG00000064338, ENSMUSG00000064339]
3095692             [ENSMUSG00000064340, ENSMUSG00000064341]
3095701    [ENSMUSG00000064364, ENSMUSG00000064365, ENSMU...
Length: 3559, dtype: object

In [53]:
annotate_enh = pd.concat([annotate_enh, multi_enh])

In [54]:
annotate_enh

bin_idx
1448392                   5
1448393                   5
1448387                   7
1448328                  10
1448282                  17
                 ...       
2893165    [233423, 233424]
2893060    [233432, 233433]
2892208    [233479, 233480]
2892143    [233487, 233488]
2892142    [233488, 233489]
Length: 154845, dtype: object

In [55]:
annotate_tss = pd.concat([annotate_tss, multi_gene])

In [56]:
annotate_tss

bin_idx
1447902                                   ENSMUSG00000051951
773747                                    ENSMUSG00000102851
1448209                                   ENSMUSG00000103377
1448088                                   ENSMUSG00000089699
1448030                                   ENSMUSG00000103201
                                 ...                        
2891539             [ENSMUSG00000095134, ENSMUSG00000099871]
3095689             [ENSMUSG00000064336, ENSMUSG00000064337]
3095690             [ENSMUSG00000064338, ENSMUSG00000064339]
3095692             [ENSMUSG00000064340, ENSMUSG00000064341]
3095701    [ENSMUSG00000064364, ENSMUSG00000064365, ENSMU...
Length: 24819, dtype: object

In [57]:
annotate_enh.index.isin(annotate_tss.index).shape

(154845,)

In [58]:
annotate_enh.index.duplicated().sum()

0

In [59]:
annotate_tss.index.duplicated().sum()

0

In [60]:
both_vals = annotate_enh.loc[annotate_enh.index.isin(annotate_tss.index)].index

In [61]:
both_tss = annotate_tss.loc[both_vals]

In [62]:
both_tss.index.duplicated(keep=False).sum()

0

In [81]:
both_enh.index.duplicated(keep=False).sum()

0

In [83]:
annotate_enh.index.duplicated(keep=False).sum()

0

In [85]:
annotate_tss.index.duplicated(keep=False).sum()

0

In [87]:
annotate_enh.index.isin(both_enh.index).sum()

212

In [88]:
annotate_tss.index.isin(both_enh.index).sum()

212

In [63]:
both_enh = annotate_enh.loc[both_vals]

In [64]:
both_tss = pd.Series(index=both_tss.index, 
                      data= [[i] if type(i) != np.ndarray else i.tolist() for i in both_tss.values])

In [65]:
both_enh = pd.Series(index=both_enh.index, 
                      data= [[i] if type(i) != np.ndarray else i.tolist() for i in both_enh.values])

In [66]:
both = both_tss + both_enh.loc[both_tss.index]

In [67]:
both

bin_idx
3057222    [ENSMUSG00000084238, ENSMUSG00000100468, ENSMU...
1112638                            [ENSMUSG00000101738, 972]
1142919                           [ENSMUSG00000099694, 3521]
594027     [ENSMUSG00000083786, ENSMUSG00000083985, ENSMU...
455062                            [ENSMUSG00000100426, 5194]
                                 ...                        
2672348    [ENSMUSG00000084151, ENSMUSG00000099801, ENSMU...
1552138    [ENSMUSG00000109692, ENSMUSG00000094530, ENSMU...
2777478                 [ENSMUSG00000103878, 138577, 138578]
1573890    [ENSMUSG00000084010, ENSMUSG00000082414, 15551...
1949946                 [ENSMUSG00000100409, 218145, 218146]
Length: 212, dtype: object

In [107]:
annotate_tss = annotate_tss.drop(both.index)

KeyError: '[3057222 1112638 1142919  594027  455062  460427  373276  373384  206528\n  944406 2895288 1918209  423971  220253 1709278  197328  166276  166277\n 2112394  209267 1054604 1752652 2981954  164356 1171401  958005   54089\n 2852793 1799251 2038722 2026244  287732  287733 1062377 2717132 2717130\n  171593  182335  182336 1300084 1341111 2955420  321235 2587838 2478302\n 1935213 1069304 2732710  334157  515708 2220420   40333  715132 2224552\n 2134784 2164676 1299732  937468  553181 1609382 1343621 1648001  258522\n 2005105 1928149 2851396 2723875  924750 1849151 1065435 2389665  182942\n  990222  592970  154558  259694  259695  259696  171591 1096246 1105908\n 1101867 1509566 2590558 2545214 2269828 1397587  706730  153634  153636\n  153638 2004216 2004220 1087521  914791  176616 2696355 1006438 1672185\n 1012618 2991390  631533  525203  525204 1423612 1874354 1611313  480836\n 1780725 2275539 2235205 2981955  429065  347078 1749679 2397927 1096808\n 2751631   77129 2757485 2764361 2671918 2988396 2984400  663105  673693\n 2281189 1545913 1637772 1647791 1656345 1657248 1553151 1553369 1037120\n   53291   46613 2739749 1326378 2549055  734372 1068517  772394 2762235\n 1349242 1278128 1279658 3032037 2679579 1371382 1375949  766057  997680\n  997681 1961767 2701081 2418981 2904881  442981 2401326 2672097 1827267\n  309986 1816228  593908 2234512 2083285 2497093 1558365 2460932  234357\n 1947167 1927992 2515884 2378330 1473358 1473359 2366049  150903 2221177\n 2907267 1772186 2968748 2070880 2183988 1890660 1819806 1075059 2265956\n  878444  425452 2996762 2996763 2546902 3035861 1496568 1832888 2025459\n 2025458 1999862  146298 1332047  676951 2255293 2435857  770553  514833\n 2672348 1552138 2777478 1573890 1949946] not found in axis'

In [106]:
annotate_enh = annotate_enh.drop(both.index)

KeyError: '[3057222 1112638 1142919  594027  455062  460427  373276  373384  206528\n  944406 2895288 1918209  423971  220253 1709278  197328  166276  166277\n 2112394  209267 1054604 1752652 2981954  164356 1171401  958005   54089\n 2852793 1799251 2038722 2026244  287732  287733 1062377 2717132 2717130\n  171593  182335  182336 1300084 1341111 2955420  321235 2587838 2478302\n 1935213 1069304 2732710  334157  515708 2220420   40333  715132 2224552\n 2134784 2164676 1299732  937468  553181 1609382 1343621 1648001  258522\n 2005105 1928149 2851396 2723875  924750 1849151 1065435 2389665  182942\n  990222  592970  154558  259694  259695  259696  171591 1096246 1105908\n 1101867 1509566 2590558 2545214 2269828 1397587  706730  153634  153636\n  153638 2004216 2004220 1087521  914791  176616 2696355 1006438 1672185\n 1012618 2991390  631533  525203  525204 1423612 1874354 1611313  480836\n 1780725 2275539 2235205 2981955  429065  347078 1749679 2397927 1096808\n 2751631   77129 2757485 2764361 2671918 2988396 2984400  663105  673693\n 2281189 1545913 1637772 1647791 1656345 1657248 1553151 1553369 1037120\n   53291   46613 2739749 1326378 2549055  734372 1068517  772394 2762235\n 1349242 1278128 1279658 3032037 2679579 1371382 1375949  766057  997680\n  997681 1961767 2701081 2418981 2904881  442981 2401326 2672097 1827267\n  309986 1816228  593908 2234512 2083285 2497093 1558365 2460932  234357\n 1947167 1927992 2515884 2378330 1473358 1473359 2366049  150903 2221177\n 2907267 1772186 2968748 2070880 2183988 1890660 1819806 1075059 2265956\n  878444  425452 2996762 2996763 2546902 3035861 1496568 1832888 2025459\n 2025458 1999862  146298 1332047  676951 2255293 2435857  770553  514833\n 2672348 1552138 2777478 1573890 1949946] not found in axis'

In [108]:
annotate_bins = pd.concat([annotate_tss, annotate_enh, both])

In [109]:
both.shape

(212,)

In [110]:
bins.head()

Unnamed: 0,chrom,start,end,idx
0,chr1,0,1000,0
1,chr1,1000,2000,1
2,chr1,2000,3000,2
3,chr1,3000,4000,3
4,chr1,4000,5000,4


In [111]:
annotate_bins.head()

bin_idx
1447902    ENSMUSG00000051951
773747     ENSMUSG00000102851
1448209    ENSMUSG00000103377
1448088    ENSMUSG00000089699
1448030    ENSMUSG00000103201
dtype: object

In [112]:
bins.shape

(3095706, 4)

In [113]:
annotate_bins.loc[bins.idx].shape

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(3095706,)

In [114]:
annotate_bins.index.isin(bins.index).sum()

179452

In [115]:
annotate_bins.index.duplicated().sum()

0

In [116]:
annotate_bins

bin_idx
1447902                                   ENSMUSG00000051951
773747                                    ENSMUSG00000102851
1448209                                   ENSMUSG00000103377
1448088                                   ENSMUSG00000089699
1448030                                   ENSMUSG00000103201
                                 ...                        
2672348    [ENSMUSG00000084151, ENSMUSG00000099801, ENSMU...
1552138    [ENSMUSG00000109692, ENSMUSG00000094530, ENSMU...
2777478                 [ENSMUSG00000103878, 138577, 138578]
1573890    [ENSMUSG00000084010, ENSMUSG00000082414, 15551...
1949946                 [ENSMUSG00000100409, 218145, 218146]
Length: 179452, dtype: object

In [117]:
bins.idx.shape

(3095706,)

In [118]:
annotate_bins.shape

(179452,)

In [119]:
annotate_bins.loc[bins.index].shape

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


(3095706,)

In [120]:
bins.index.isin(annotate_bins.index).sum()

179452

In [121]:
annotate_bins.shape

(179452,)

In [122]:
bins.head()

Unnamed: 0,chrom,start,end,idx
0,chr1,0,1000,0
1,chr1,1000,2000,1
2,chr1,2000,3000,2
3,chr1,3000,4000,3
4,chr1,4000,5000,4


In [123]:
bins['annotation'] = annotate_bins.loc[bins.index].values

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [124]:
bins.head()

Unnamed: 0,chrom,start,end,idx,annotation
0,chr1,0,1000,0,
1,chr1,1000,2000,1,
2,chr1,2000,3000,2,
3,chr1,3000,4000,3,
4,chr1,4000,5000,4,


In [126]:
bins.annotation.notna().sum()

179452

In [127]:
import pickle

In [133]:
pickle.dump(bins, open('data/annotated_bins.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)