In [3]:
import cooler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

In [4]:
use_genes = pd.read_csv('use_gene_bed', sep='\t', header=None)

In [5]:
enhancers = pd.read_csv('enhancers_lifted.bed', sep='\t', header=None)

In [6]:
use_genes.head()

Unnamed: 0,0,1,2,3,4
0,chr1,1018123,1022123,ENSG00000188157,ENSMUSG00000041936
1,chr1,1271885,1275885,ENSG00000160087,ENSMUSG00000023286
2,chr1,1307609,1311609,ENSG00000131584,ENSMUSG00000029033
3,chr1,1322691,1326691,ENSG00000127054,ENSMUSG00000029034
4,chr1,1322756,1326756,ENSG00000224051,ENSMUSG00000029073


In [7]:
enhancers.head()

Unnamed: 0,0,1,2,3
0,chr8,55587811,55587820,7
1,chr8,55528337,55528612,10
2,chr8,55473255,55473498,19
3,chr8,55468084,55468527,20
4,chr8,55447899,55447923,23


# annnotate bins containing enhancers and genes

In [15]:
mcool_files = !ls | grep mcool

In [17]:
mcool_files

['L2_3_all_brain.txt_1kb_contacts.mcool',
 'Pvalb_all_brain.txt_1kb_contacts.mcool']

In [19]:
l23_file = mcool_files[0]
f = h5py.File(l23_file)

In [120]:
f.close()

In [20]:
l23_cool = cooler.Cooler(f['resolutions']['2000'])

In [52]:
bins = l23_cool.bins()[:]

In [53]:
bins.head()

Unnamed: 0,chrom,start,end,weight
0,chr1,0,2000,
1,chr1,2000,4000,
2,chr1,4000,6000,
3,chr1,6000,8000,
4,chr1,8000,10000,


In [54]:
bins = bins.drop('weight', axis=1)
bins['idx'] = bins.index

In [55]:
bins.head()

Unnamed: 0,chrom,start,end,idx
0,chr1,0,2000,0
1,chr1,2000,4000,1
2,chr1,4000,6000,2
3,chr1,6000,8000,3
4,chr1,8000,10000,4


In [56]:
bins.to_csv('hi_c_bins.bed', sep='\t', header=None, index=None)

Use bedtools to get intersections with enhancers

In [57]:
bedfiles = !ls | grep bed

In [58]:
bedfiles

['enhancer_bed.bed',
 'enhancer_bins.bed',
 'enhancers_chromsort_slop1kb.bed',
 'enhancers_lifted.bed',
 'enhancers_unlifted.bed',
 'hi_c_bins.bed',
 'promoters_lifted.bed',
 'promoter_sort.bed',
 'promoters_unlifted.bed',
 'tss_bins.bed',
 'use_gene_bed']

In [64]:
! bedtools intersect -wb -a enhancers_lifted.bed -b hi_c_bins.bed > enhancer_bins.bed

In [65]:
! head enhancer_bins.bed
! wc -l enhancer_bins.bed
!wc -l enhancers_lifted.bed

chr8	55587811	55587820	7	chr8	55586000	55588000	724195
chr8	55528337	55528612	10	chr8	55528000	55530000	724166
chr8	55473255	55473498	19	chr8	55472000	55474000	724138
chr8	55468084	55468527	20	chr8	55468000	55470000	724136
chr8	55447899	55447923	23	chr8	55446000	55448000	724125
chr8	55436441	55436592	24	chr8	55436000	55438000	724120
chr8	55417801	55418000	28	chr8	55416000	55418000	724110
chr8	55418000	55418057	28	chr8	55418000	55420000	724111
chr8	55417241	55417460	29	chr8	55416000	55418000	724110
chr8	55416481	55416673	30	chr8	55416000	55418000	724110
94659 enhancer_bins.bed
81483 enhancers_lifted.bed


In [66]:
# intersect TSS with bins
! bedtools intersect -wb -a use_gene_bed -b hi_c_bins.bed > tss_bins.bed

In [67]:
!wc -l tss_bins.bed

23132 tss_bins.bed


In [68]:
!wc -l use_gene_bed

7763 use_gene_bed


# get annotations for bins

In [71]:
enhancer_bins = pd.read_csv('enhancer_bins.bed', sep='\t', header=None)

In [72]:
enhancer_bins.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,chr8,55587811,55587820,7,chr8,55586000,55588000,724195
1,chr8,55528337,55528612,10,chr8,55528000,55530000,724166
2,chr8,55473255,55473498,19,chr8,55472000,55474000,724138
3,chr8,55468084,55468527,20,chr8,55468000,55470000,724136
4,chr8,55447899,55447923,23,chr8,55446000,55448000,724125


In [74]:
columns = ['chr_overlap', 'start_overlap', 'end_overlap', 'enh_idx', 'chr_b', 'start_bin', 'end_bin', 'bin_idx']

In [75]:
enhancer_bins.columns = columns

In [76]:
enhancer_bins.head()

Unnamed: 0,chr_overlap,start_overlap,end_overlap,enh_idx,chr_b,start_bin,end_bin,bin_idx
0,chr8,55587811,55587820,7,chr8,55586000,55588000,724195
1,chr8,55528337,55528612,10,chr8,55528000,55530000,724166
2,chr8,55473255,55473498,19,chr8,55472000,55474000,724138
3,chr8,55468084,55468527,20,chr8,55468000,55470000,724136
4,chr8,55447899,55447923,23,chr8,55446000,55448000,724125


In [93]:
bins = pd.read_csv('hi_c_bins.bed', sep='\t', header=None)
bins.columns = ['chrom', 'start', 'end', 'idx']

In [94]:
bins.head()

Unnamed: 0,chrom,start,end,idx
0,chr1,0,2000,0
1,chr1,2000,4000,1
2,chr1,4000,6000,2
3,chr1,6000,8000,3
4,chr1,8000,10000,4


In [79]:
bin_enh = enhancer_bins[['enh_idx', 'bin_idx']]

In [80]:
bin_enh.shape

(94659, 2)

In [81]:
bin_enh.head()

Unnamed: 0,enh_idx,bin_idx
0,7,724195
1,10,724166
2,19,724138
3,20,724136
4,23,724125


In [82]:
bin_enh = bin_enh.set_index('bin_idx')

In [83]:
bin_enh.head()

Unnamed: 0_level_0,enh_idx
bin_idx,Unnamed: 1_level_1
724195,7
724166,10
724138,19
724136,20
724125,23


In [95]:
bins = bins.join(bin_enh.loc[bins.index])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [96]:
bins.head()

Unnamed: 0,chrom,start,end,idx,enh_idx
0,chr1,0,2000,0,
1,chr1,2000,4000,1,
2,chr1,4000,6000,2,
3,chr1,6000,8000,3,
4,chr1,8000,10000,4,


In [97]:
bins.enh_idx.notna().sum()

94659

# add gene bins

In [98]:
!ls | grep tss

tss_bins.bed


In [99]:
tss_bins = pd.read_csv('tss_bins.bed', header=None, sep='\t')

In [100]:
tss_bins.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,1018123,1020000,ENSG00000188157,ENSMUSG00000041936,chr1,1018000,1020000,509
1,chr1,1020000,1022000,ENSG00000188157,ENSMUSG00000041936,chr1,1020000,1022000,510
2,chr1,1022000,1022123,ENSG00000188157,ENSMUSG00000041936,chr1,1022000,1024000,511
3,chr1,1271885,1272000,ENSG00000160087,ENSMUSG00000023286,chr1,1270000,1272000,635
4,chr1,1272000,1274000,ENSG00000160087,ENSMUSG00000023286,chr1,1272000,1274000,636


In [107]:
columns = ['chr_overlap', 'start_overlap', 'end_overlap', 'gene_name_mouse', 'gene_name_human', 'chr_b', 'start_bin', 'end_bin', 'bin_idx']

In [108]:
tss_bins.columns = columns

In [109]:
tss_bins.head()

Unnamed: 0,chr_overlap,start_overlap,end_overlap,gene_name_mouse,gene_name_human,chr_b,start_bin,end_bin,bin_idx
0,chr1,1018123,1020000,ENSG00000188157,ENSMUSG00000041936,chr1,1018000,1020000,509
1,chr1,1020000,1022000,ENSG00000188157,ENSMUSG00000041936,chr1,1020000,1022000,510
2,chr1,1022000,1022123,ENSG00000188157,ENSMUSG00000041936,chr1,1022000,1024000,511
3,chr1,1271885,1272000,ENSG00000160087,ENSMUSG00000023286,chr1,1270000,1272000,635
4,chr1,1272000,1274000,ENSG00000160087,ENSMUSG00000023286,chr1,1272000,1274000,636


In [110]:
genes_to_bins = tss_bins[['gene_name_mouse', 'gene_name_human', 'bin_idx']]

In [111]:
genes_to_bins.head()

Unnamed: 0,gene_name_mouse,gene_name_human,bin_idx
0,ENSG00000188157,ENSMUSG00000041936,509
1,ENSG00000188157,ENSMUSG00000041936,510
2,ENSG00000188157,ENSMUSG00000041936,511
3,ENSG00000160087,ENSMUSG00000023286,635
4,ENSG00000160087,ENSMUSG00000023286,636


In [112]:
bins.head()

Unnamed: 0,chrom,start,end,idx,enh_idx
0,chr1,0,2000,0,
1,chr1,2000,4000,1,
2,chr1,4000,6000,2,
3,chr1,6000,8000,3,
4,chr1,8000,10000,4,


In [114]:
genes_to_bins = genes_to_bins.set_index('bin_idx')

In [115]:
bins = bins.join(genes_to_bins.loc[bins.index])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [116]:
bins.head()

Unnamed: 0,chrom,start,end,idx,enh_idx,gene_name_mouse,gene_name_human
0,chr1,0,2000,0,,,
1,chr1,2000,4000,1,,,
2,chr1,4000,6000,2,,,
3,chr1,6000,8000,3,,,
4,chr1,8000,10000,4,,,


In [117]:
bins.gene_name_mouse.notna().sum()

23320

In [118]:
bins.head()

Unnamed: 0,chrom,start,end,idx,enh_idx,gene_name_mouse,gene_name_human
0,chr1,0,2000,0,,,
1,chr1,2000,4000,1,,,
2,chr1,4000,6000,2,,,
3,chr1,6000,8000,3,,,
4,chr1,8000,10000,4,,,


In [119]:
bins.to_csv('annotated_bins.tsc', sep='\t')