# Determine Enhancer-Enhancer Pairs

This notebook determines the enhancer pairs present in the Gasperini et al. (2019) at-scale screen: it first finds all enhancers within 1 Mb of each gene, and then generates all pairwise combinations of those enhancers for that gene.

Author: Karthik Guruvayurappan

In [2]:
import pandas as pd
import numpy as np
import itertools

In [3]:
# load the at-scale gene / gRNA-group pair table (tab-separated text file)
pair_table_path = '/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_gene_gRNAgroup_pair_table.at_scale.txt'
enhancer_gene_pairs = pd.read_csv(pair_table_path, sep='\t')

In [4]:
enhancer_gene_pairs.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup,general_group,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name,strand.targetgene,pairs
0,NTC,NTC,NTC,bassik_mch,NTC,chr10,28034777,28034778,ENSG00000150051,MKX,-,MKX:bassik_mch
1,NTC,NTC,NTC,bassik_mch,NTC,chr10,28287976,28287977,ENSG00000169126,ARMC4,-,ARMC4:bassik_mch
2,NTC,NTC,NTC,bassik_mch,NTC,chr10,28571017,28571018,ENSG00000150054,MPP7,-,MPP7:bassik_mch
3,NTC,NTC,NTC,bassik_mch,NTC,chr10,28821422,28821423,ENSG00000095787,WAC,+,WAC:bassik_mch
4,NTC,NTC,NTC,bassik_mch,NTC,chr10,28966271,28966272,ENSG00000095739,BAMBI,+,BAMBI:bassik_mch


In [5]:
enhancer_gene_pairs.groupby('general_group').count()

Unnamed: 0_level_0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name,strand.targetgene,pairs
general_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NTC,937839,937839,937839,937839,937839,937839,937839,937839,937839,937839,937839
TSS,9938,9938,9938,9938,9938,9938,9938,9938,9938,9938,9938
not_model_selected_K1119hit-hit,374,374,374,374,374,374,374,374,374,374,374
positive_ctrl,730,730,730,730,730,730,730,730,730,730,730
submodular_1k-untested,23712,23712,23712,23712,23712,23712,23712,23712,23712,23712,23712
test_5k-K1119,26971,26971,26971,26971,26971,26971,26971,26971,26971,26971,26971
test_5k-hit,5620,5620,5620,5620,5620,5620,5620,5620,5620,5620,5620
test_5k-untested,81380,81380,81380,81380,81380,81380,81380,81380,81380,81380,81380


In [6]:
enhancer_gene_pairs.shape

(1086564, 12)

In [7]:
# build the table of distinct candidate enhancers: take the gRNA-group columns,
# collapse repeated rows, and leave out the non-enhancer groups
enhancer_columns = ['gRNAgroup', 'gRNAgroup.chr', 'gRNAgroup.start', 'gRNAgroup.stop', 'general_group']
excluded_groups = ['NTC', 'positive_ctrl', 'TSS']

enhancers = (
    enhancer_gene_pairs[enhancer_columns]
    .drop_duplicates()
    .loc[lambda table: ~table['general_group'].isin(excluded_groups)]
    .reset_index(drop=True)
)
enhancers.head()

Unnamed: 0,gRNAgroup,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,general_group
0,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested
1,chr10.1018_top_two,chr10,29111303,29112331,test_5k-untested
2,chr10.1019_top_two,chr10,29115307,29115488,test_5k-untested
3,chr10.1020_top_two,chr10,29128454,29128983,test_5k-untested
4,chr10.1030_top_two,chr10,29275275,29275635,test_5k-untested


In [8]:
enhancers.dtypes

gRNAgroup          object
gRNAgroup.chr      object
gRNAgroup.start    object
gRNAgroup.stop     object
general_group      object
dtype: object

In [9]:
# the coordinate columns are stored as generic objects; cast both to integers
for coordinate in ('gRNAgroup.start', 'gRNAgroup.stop'):
    enhancers[coordinate] = enhancers[coordinate].astype(int)

# enhancer position = midpoint of the start/stop interval
interval_sum = enhancers['gRNAgroup.start'] + enhancers['gRNAgroup.stop']
enhancers['gRNAgroup.position'] = interval_sum / 2

In [10]:
enhancers.shape

(6144, 6)

In [11]:
# de-duplicate again now that the coordinates are integers: rows that were not recognised
# as identical before the cast collapse here (the table shrinks by one row)
# NOTE(review): presumably the raw column mixed str and int values — confirm
enhancers = enhancers.drop_duplicates()

In [12]:
enhancers.shape

(6143, 6)

In [13]:
# one row per distinct gene record (ID, chromosome, start, stop)
gene_columns = ['ENSG.targetgene', 'chr.targetgene', 'start.targetgene', 'stop.targetgene']
genes = enhancer_gene_pairs[gene_columns].drop_duplicates()

# gene position = midpoint of the recorded start/stop coordinates
genes = genes.assign(**{
    'position.targetgene': (genes['start.targetgene'] + genes['stop.targetgene']) / 2,
})
genes.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene
0,ENSG00000150051,chr10,28034777,28034778,28034777.5
1,ENSG00000169126,chr10,28287976,28287977,28287976.5
2,ENSG00000150054,chr10,28571017,28571018,28571017.5
3,ENSG00000095787,chr10,28821422,28821423,28821422.5
4,ENSG00000095739,chr10,28966271,28966272,28966271.5


In [14]:
genes.drop_duplicates().shape

(18389, 5)

In [15]:
genes.shape

(18389, 5)

In [16]:
genes['ENSG.targetgene'].nunique()

18026

In [17]:
# find all enhancers within 1MB of each gene 
def find_proximal_enhancers(gene, max_distance=1000000, enhancer_table=None):
    '''finds enhancers on the same chromosome as a gene and within a distance window

    gene: one gene record (e.g. a row passed by DataFrame.apply with axis=1); the
        'chr.targetgene' and 'position.targetgene' entries are read
    max_distance: size of the window in base pairs; an enhancer matches when the
        absolute difference between its position and the gene position is strictly
        smaller than this value (default 1,000,000, i.e. 1 Mb)
    enhancer_table: table with 'gRNAgroup', 'gRNAgroup.chr' and 'gRNAgroup.position'
        columns; when omitted, the notebook-level `enhancers` table is used

    returns a numpy array with the 'gRNAgroup' names of the matching enhancers
    (an empty array when nothing matches)
    '''

    if enhancer_table is None:
        # default to the table of unique enhancers built earlier in the notebook
        enhancer_table = enhancers

    gene_chrom = gene['chr.targetgene']
    gene_position = gene['position.targetgene']

    # both conditions are row-wise boolean masks over the enhancer table
    match_chrom = enhancer_table['gRNAgroup.chr'] == gene_chrom
    match_position = np.abs(gene_position - enhancer_table['gRNAgroup.position']) < max_distance

    match_enhancers = np.array(enhancer_table.loc[match_chrom & match_position, 'gRNAgroup'])
    return match_enhancers

# run the lookup once per gene row; each cell of the new column holds an array of enhancer names
genes['proximal_enhancers'] = genes.apply(find_proximal_enhancers, axis = 1)
genes.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,proximal_enhancers
0,ENSG00000150051,chr10,28034777,28034778,28034777.5,"[chr10.1007_top_two, chr10.995_top_two]"
1,ENSG00000169126,chr10,28287976,28287977,28287976.5,"[chr10.1007_top_two, chr10.1018_top_two, chr10..."
2,ENSG00000150054,chr10,28571017,28571018,28571017.5,"[chr10.1007_top_two, chr10.1018_top_two, chr10..."
3,ENSG00000095787,chr10,28821422,28821423,28821422.5,"[chr10.1007_top_two, chr10.1018_top_two, chr10..."
4,ENSG00000095739,chr10,28966271,28966272,28966271.5,"[chr10.1007_top_two, chr10.1018_top_two, chr10..."


In [23]:
# expand the per-gene arrays so that every gene-enhancer combination gets its own row;
# a gene whose array is empty is kept as a single row with a missing enhancer value
enhancer_gene = genes.explode('proximal_enhancers', ignore_index = True)
enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,proximal_enhancers
0,ENSG00000150051,chr10,28034777,28034778,28034777.5,chr10.1007_top_two
1,ENSG00000150051,chr10,28034777,28034778,28034777.5,chr10.995_top_two
2,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1007_top_two
3,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1018_top_two
4,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1019_top_two


In [24]:
enhancer_gene.shape

(140678, 6)

In [25]:
enhancer_gene['proximal_enhancers'].isin(enhancers['gRNAgroup'])

0          True
1          True
2          True
3          True
4          True
          ...  
140673     True
140674     True
140675     True
140676     True
140677    False
Name: proximal_enhancers, Length: 140678, dtype: bool

In [26]:
enhancer_gene.tail()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,proximal_enhancers
140673,ENSG00000132681,chr1,160147201,160147202,160147201.5,chr1.9416_top_two
140674,ENSG00000132681,chr1,160147201,160147202,160147201.5,chr1.9428_top_two
140675,ENSG00000132681,chr1,160147201,160147202,160147201.5,chr1.9429_top_two
140676,ENSG00000132681,chr1,160147201,160147202,160147201.5,chr1.9450_top_two
140677,ENSG00000198673,chr12,62221013,62221014,62221013.5,


In [20]:
enhancer_gene[enhancer_gene.duplicated()]

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,proximal_enhancers


In [27]:
# attach the enhancer coordinates to every gene-enhancer row; because this is an inner
# join, rows whose enhancer name has no match in the enhancer table (such as the
# missing-value rows of genes without any proximal enhancer) are dropped
enhancer_gene = pd.merge(
    enhancer_gene,
    enhancers,
    how='inner',
    left_on='proximal_enhancers',
    right_on='gRNAgroup',
)
enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,proximal_enhancers,gRNAgroup,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,general_group,gRNAgroup.position
0,ENSG00000150051,chr10,28034777,28034778,28034777.5,chr10.1007_top_two,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested,29024189.0
1,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1007_top_two,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested,29024189.0
2,ENSG00000150054,chr10,28571017,28571018,28571017.5,chr10.1007_top_two,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested,29024189.0
3,ENSG00000095787,chr10,28821422,28821423,28821422.5,chr10.1007_top_two,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested,29024189.0
4,ENSG00000095739,chr10,28966271,28966272,28966271.5,chr10.1007_top_two,chr10.1007_top_two,chr10,29023920,29024458,test_5k-untested,29024189.0


In [28]:
enhancer_gene.shape

(140635, 12)

In [29]:
# 'proximal_enhancers' repeats 'gRNAgroup' after the merge, and 'general_group' is no longer needed
redundant_columns = ['proximal_enhancers', 'general_group']
enhancer_gene = enhancer_gene.drop(columns=redundant_columns)
enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,gRNAgroup,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup.position
0,ENSG00000150051,chr10,28034777,28034778,28034777.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0
1,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0
2,ENSG00000150054,chr10,28571017,28571018,28571017.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0
3,ENSG00000095787,chr10,28821422,28821423,28821422.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0
4,ENSG00000095739,chr10,28966271,28966272,28966271.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0


In [30]:
enhancer_gene.shape

(140635, 10)

In [31]:
# absolute distance between the gene position and the enhancer midpoint
position_offset = enhancer_gene['position.targetgene'] - enhancer_gene['gRNAgroup.position']
enhancer_gene['enhancer_gene_distance'] = position_offset.abs()
enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,gRNAgroup,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup.position,enhancer_gene_distance
0,ENSG00000150051,chr10,28034777,28034778,28034777.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0,989411.5
1,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0,736212.5
2,ENSG00000150054,chr10,28571017,28571018,28571017.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0,453171.5
3,ENSG00000095787,chr10,28821422,28821423,28821422.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0,202766.5
4,ENSG00000095739,chr10,28966271,28966272,28966271.5,chr10.1007_top_two,chr10,29023920,29024458,29024189.0,57917.5


In [32]:
enhancer_gene.shape

(140635, 11)

In [34]:
enhancer_gene.drop_duplicates().shape

(140635, 11)

In [37]:
enhancer_gene[enhancer_gene['gRNAgroup'] == 'chr10.1018_top_two']

Unnamed: 0,ENSG.targetgene,chr.targetgene,start.targetgene,stop.targetgene,position.targetgene,gRNAgroup,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup.position,enhancer_gene_distance
14,ENSG00000169126,chr10,28287976,28287977,28287976.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,823840.5
15,ENSG00000150054,chr10,28571017,28571018,28571017.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,540799.5
16,ENSG00000095787,chr10,28821422,28821423,28821422.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,290394.5
17,ENSG00000095739,chr10,28966271,28966272,28966271.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,145545.5
18,ENSG00000120563,chr10,29577990,29577991,29577990.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,466173.5
19,ENSG00000197321,chr10,29923900,29923901,29923900.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,812083.5
20,ENSG00000197321,chr10,30024732,30024733,30024732.5,chr10.1018_top_two,chr10,29111303,29112331,29111817.0,912915.5


In [38]:
# keep only the two identifiers needed to build enhancer pairs
pair_key_columns = ['ENSG.targetgene', 'gRNAgroup']
enhancer_gene = enhancer_gene.loc[:, pair_key_columns]
enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,gRNAgroup
0,ENSG00000150051,chr10.1007_top_two
1,ENSG00000169126,chr10.1007_top_two
2,ENSG00000150054,chr10.1007_top_two
3,ENSG00000095787,chr10.1007_top_two
4,ENSG00000095739,chr10.1007_top_two


In [39]:
enhancer_gene.shape

(140635, 2)

In [40]:
enhancer_gene.drop_duplicates().shape

(138012, 2)

In [41]:
# a gene ID can appear with more than one start/stop record, so the same gene-enhancer
# combination may be listed several times once the coordinates are dropped; keep one copy
enhancer_gene = enhancer_gene.drop_duplicates()

In [42]:
def get_pairwise_enhancers(gene_enhancers):
    '''lists every unordered pair of enhancers linked to one gene

    gene_enhancers: rows of the gene-enhancer table belonging to a single gene;
        only the 'gRNAgroup' column is read

    returns a list of 2-tuples of enhancer names in itertools.combinations order
    (empty when fewer than two enhancers are present)
    '''
    enhancer_names = gene_enhancers['gRNAgroup']
    pair_iterator = itertools.combinations(enhancer_names, 2)
    return [pair for pair in pair_iterator]

# build the pair list separately for every gene; the result has one row per gene
# (gene ID as the index) and a single column named 0 holding that gene's list of pairs
enhancer_enhancer_gene = pd.DataFrame(enhancer_gene.groupby('ENSG.targetgene').apply(get_pairwise_enhancers))
enhancer_enhancer_gene.head()

Unnamed: 0_level_0,0
ENSG.targetgene,Unnamed: 1_level_1
ENSG00000000003,[]
ENSG00000000005,[]
ENSG00000000419,"[(chr20.2275_top_two, chr20.2276_top_two), (ch..."
ENSG00000000457,[]
ENSG00000000460,[]


In [43]:
# move the gene ID out of the index into a regular column
enhancer_enhancer_gene = enhancer_enhancer_gene.reset_index()
enhancer_enhancer_gene.head()

Unnamed: 0,ENSG.targetgene,0
0,ENSG00000000003,[]
1,ENSG00000000005,[]
2,ENSG00000000419,"[(chr20.2275_top_two, chr20.2276_top_two), (ch..."
3,ENSG00000000457,[]
4,ENSG00000000460,[]


In [44]:
# give the two columns readable names (gene ID, list of enhancer pairs)
pair_table_columns = ['gene', 'enhancer_pair']
enhancer_enhancer_gene = enhancer_enhancer_gene.set_axis(pair_table_columns, axis='columns')
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_pair
0,ENSG00000000003,[]
1,ENSG00000000005,[]
2,ENSG00000000419,"[(chr20.2275_top_two, chr20.2276_top_two), (ch..."
3,ENSG00000000457,[]
4,ENSG00000000460,[]


In [45]:
# one row per (gene, enhancer pair); a gene with an empty pair list is kept as a
# single row whose 'enhancer_pair' value is missing
enhancer_enhancer_gene = enhancer_enhancer_gene.explode('enhancer_pair', ignore_index = True)
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_pair
0,ENSG00000000003,
1,ENSG00000000005,
2,ENSG00000000419,"(chr20.2275_top_two, chr20.2276_top_two)"
3,ENSG00000000419,"(chr20.2275_top_two, chr20.2279_top_two)"
4,ENSG00000000419,"(chr20.2275_top_two, chr20.2320_top_two)"


In [46]:
# remove the placeholder rows of genes that produced no enhancer pair
# (note: the original row labels are kept, so the index is no longer contiguous)
enhancer_enhancer_gene = enhancer_enhancer_gene.dropna()
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_pair
2,ENSG00000000419,"(chr20.2275_top_two, chr20.2276_top_two)"
3,ENSG00000000419,"(chr20.2275_top_two, chr20.2279_top_two)"
4,ENSG00000000419,"(chr20.2275_top_two, chr20.2320_top_two)"
5,ENSG00000000419,"(chr20.2275_top_two, chr20.2321_top_two)"
6,ENSG00000000419,"(chr20.2275_top_two, chr20.2355_top_two)"


In [47]:
enhancer_enhancer_gene.shape

(905614, 2)

In [48]:
# unpack each (first, second) tuple into two separate name columns
pair_tuples = enhancer_enhancer_gene['enhancer_pair']
enhancer_enhancer_gene['enhancer_1'] = [pair[0] for pair in pair_tuples]
enhancer_enhancer_gene['enhancer_2'] = [pair[1] for pair in pair_tuples]
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_pair,enhancer_1,enhancer_2
2,ENSG00000000419,"(chr20.2275_top_two, chr20.2276_top_two)",chr20.2275_top_two,chr20.2276_top_two
3,ENSG00000000419,"(chr20.2275_top_two, chr20.2279_top_two)",chr20.2275_top_two,chr20.2279_top_two
4,ENSG00000000419,"(chr20.2275_top_two, chr20.2320_top_two)",chr20.2275_top_two,chr20.2320_top_two
5,ENSG00000000419,"(chr20.2275_top_two, chr20.2321_top_two)",chr20.2275_top_two,chr20.2321_top_two
6,ENSG00000000419,"(chr20.2275_top_two, chr20.2355_top_two)",chr20.2275_top_two,chr20.2355_top_two


In [49]:
# the tuple column is redundant now that both members have their own column
enhancer_enhancer_gene = enhancer_enhancer_gene.drop(columns=['enhancer_pair'])
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_1,enhancer_2
2,ENSG00000000419,chr20.2275_top_two,chr20.2276_top_two
3,ENSG00000000419,chr20.2275_top_two,chr20.2279_top_two
4,ENSG00000000419,chr20.2275_top_two,chr20.2320_top_two
5,ENSG00000000419,chr20.2275_top_two,chr20.2321_top_two
6,ENSG00000000419,chr20.2275_top_two,chr20.2355_top_two


In [50]:
enhancer_enhancer_gene.shape

(905614, 3)

In [51]:
enhancer_enhancer_gene.drop_duplicates().shape

(905614, 3)

In [53]:
(enhancer_enhancer_gene['enhancer_1'] == enhancer_enhancer_gene['enhancer_2']).sum()

0

In [54]:
# write the gene / enhancer-pair table to disk (the row index is not written)
pairs_output_path = '/iblm/netapp/data1/external/Gasperini2019/processed/at_scale_enhancer_enhancer_pairs.csv'
enhancer_enhancer_gene.to_csv(pairs_output_path, index=False)


In [88]:
# read the saved gene / enhancer-pair table back from disk; since the file was written
# without an index, the rows are renumbered from 0
enhancer_enhancer_gene = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/processed/at_scale_enhancer_enhancer_pairs.csv')
enhancer_enhancer_gene.shape

(905614, 3)

In [89]:
enhancer_enhancer_gene[enhancer_enhancer_gene['enhancer_1'].str.startswith('chr20.2275')].head(30)

Unnamed: 0,gene,enhancer_1,enhancer_2
0,ENSG00000000419,chr20.2275_top_two,chr20.2276_top_two
1,ENSG00000000419,chr20.2275_top_two,chr20.2279_top_two
2,ENSG00000000419,chr20.2275_top_two,chr20.2320_top_two
3,ENSG00000000419,chr20.2275_top_two,chr20.2321_top_two
4,ENSG00000000419,chr20.2275_top_two,chr20.2355_top_two
5,ENSG00000000419,chr20.2275_top_two,chr20.2371_top_two
6,ENSG00000000419,chr20.2275_top_two,chr20.2372_top_two
7,ENSG00000000419,chr20.2275_top_two,chr20.2373_top_two
8,ENSG00000000419,chr20.2275_top_two,chr20.2379_top_two
9,ENSG00000000419,chr20.2275_top_two,chr20.2380_top_two


In [90]:
def clean_enhancer_name(enhancer):
    '''shortens an enhancer name to the part before its first underscore

    only names beginning with 'chr' are shortened
    (e.g. 'chr20.2275_top_two' -> 'chr20.2275'); any other name is returned unchanged
    '''
    if not enhancer.startswith('chr'):
        return enhancer
    return enhancer.partition('_')[0]

# shorten the names in both pair columns
for pair_column in ('enhancer_1', 'enhancer_2'):
    enhancer_enhancer_gene[pair_column] = enhancer_enhancer_gene[pair_column].map(clean_enhancer_name)
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_1,enhancer_2
0,ENSG00000000419,chr20.2275,chr20.2276
1,ENSG00000000419,chr20.2275,chr20.2279
2,ENSG00000000419,chr20.2275,chr20.2320
3,ENSG00000000419,chr20.2275,chr20.2321
4,ENSG00000000419,chr20.2275,chr20.2355


In [67]:
enhancer_enhancer_gene[enhancer_enhancer_gene['enhancer_1'] == 'chr20.2275'].head(30)

Unnamed: 0,gene,enhancer_1,enhancer_2
0,ENSG00000000419,chr20.2275,chr20.2276
1,ENSG00000000419,chr20.2275,chr20.2279
2,ENSG00000000419,chr20.2275,chr20.2320
3,ENSG00000000419,chr20.2275,chr20.2321
4,ENSG00000000419,chr20.2275,chr20.2355
5,ENSG00000000419,chr20.2275,chr20.2371
6,ENSG00000000419,chr20.2275,chr20.2372
7,ENSG00000000419,chr20.2275,chr20.2373
8,ENSG00000000419,chr20.2275,chr20.2379
9,ENSG00000000419,chr20.2275,chr20.2380


In [91]:
enhancer_enhancer_gene[enhancer_enhancer_gene.duplicated()]

Unnamed: 0,gene,enhancer_1,enhancer_2
12,ENSG00000000419,chr20.2275,chr20.2382
14,ENSG00000000419,chr20.2275,chr20.2384
37,ENSG00000000419,chr20.2276,chr20.2382
39,ENSG00000000419,chr20.2276,chr20.2384
61,ENSG00000000419,chr20.2279,chr20.2382
...,...,...,...
905594,ENSG00000284554,chr22.2017,chr22.2018
905604,ENSG00000284554,chr22.2018,chr22.2053
905605,ENSG00000284554,chr22.2018,chr22.2054
905606,ENSG00000284554,chr22.2018,chr22.2074


In [92]:
# rows that differed only in the part of the enhancer name that was just removed are
# now identical; keep a single copy of each (gene, enhancer_1, enhancer_2) row
enhancer_enhancer_gene = enhancer_enhancer_gene.drop_duplicates()
enhancer_enhancer_gene.shape

(804710, 3)

In [99]:
# discard pairs whose two members are the same enhancer after name cleaning
is_distinct_pair = enhancer_enhancer_gene['enhancer_1'] != enhancer_enhancer_gene['enhancer_2']
enhancer_enhancer_gene = enhancer_enhancer_gene[is_distinct_pair]
enhancer_enhancer_gene.shape

(795616, 3)

In [100]:
# load the per-pair count table (columns: enhancer.1.list, enhancer.2.list, count.list)
# NOTE(review): judging by the file name the count is presumably the number of cells
# carrying both enhancers of the pair — confirm against the script that produced it
enhancer_pair_counts = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/processed/at_scale_enhancer_enhancer_pairs_both_cells_count.csv')
enhancer_pair_counts.head()

Unnamed: 0,enhancer.1.list,enhancer.2.list,count.list
0,chr20.2275,chr20.2276,10
1,chr20.2275,chr20.2279,9
2,chr20.2275,chr20.2320,14
3,chr20.2275,chr20.2321,8
4,chr20.2275,chr20.2355,6


In [101]:
sum(enhancer_pair_counts['enhancer.1.list'] == enhancer_pair_counts['enhancer.2.list'])

13019

In [102]:
# rename the count-table columns so the enhancer columns match the pair table
count_table_columns = ['enhancer_1', 'enhancer_2', 'count']
enhancer_pair_counts = enhancer_pair_counts.set_axis(count_table_columns, axis='columns')
enhancer_pair_counts.head()

Unnamed: 0,enhancer_1,enhancer_2,count
0,chr20.2275,chr20.2276,10
1,chr20.2275,chr20.2279,9
2,chr20.2275,chr20.2320,14
3,chr20.2275,chr20.2321,8
4,chr20.2275,chr20.2355,6


In [103]:
enhancer_enhancer_gene

Unnamed: 0,gene,enhancer_1,enhancer_2
0,ENSG00000000419,chr20.2275,chr20.2276
1,ENSG00000000419,chr20.2275,chr20.2279
2,ENSG00000000419,chr20.2275,chr20.2320
3,ENSG00000000419,chr20.2275,chr20.2321
4,ENSG00000000419,chr20.2275,chr20.2355
...,...,...,...
905609,ENSG00000284554,chr22.2053,chr22.2074
905610,ENSG00000284554,chr22.2053,chr22.2098
905611,ENSG00000284554,chr22.2054,chr22.2074
905612,ENSG00000284554,chr22.2054,chr22.2098


In [104]:
# attach the count of every enhancer pair; pairs without an entry in the count table are
# kept with a missing count (left join). The join keys are named explicitly so the merge
# cannot silently switch to a different set of shared columns, and any exact duplicate
# rows created by repeated entries in the count table are removed afterwards.
enhancer_enhancer_gene = enhancer_enhancer_gene.merge(
    enhancer_pair_counts,
    how='left',
    on=['enhancer_1', 'enhancer_2'],
).drop_duplicates()

In [114]:
(enhancer_enhancer_gene['count'] > 50).sum()

25

In [115]:
# write the de-duplicated pair table with counts to disk (the row index is not written)
counted_pairs_output_path = '/iblm/netapp/data1/external/Gasperini2019/processed/at_scale_enhancer_enhancer_pairs_both_cells_count_nodups.csv'
enhancer_enhancer_gene.to_csv(counted_pairs_output_path, index=False)

In [3]:
# read the saved pair table with counts back from disk (columns: gene, enhancer_1, enhancer_2, count)
enhancer_enhancer_gene = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/processed/at_scale_enhancer_enhancer_pairs_both_cells_count_nodups.csv')
enhancer_enhancer_gene.head()

Unnamed: 0,gene,enhancer_1,enhancer_2,count
0,ENSG00000000419,chr20.2275,chr20.2276,10
1,ENSG00000000419,chr20.2275,chr20.2279,9
2,ENSG00000000419,chr20.2275,chr20.2320,14
3,ENSG00000000419,chr20.2275,chr20.2321,8
4,ENSG00000000419,chr20.2275,chr20.2355,6


In [5]:
# number of rows whose pair count is greater than 20
exceeds_threshold = enhancer_enhancer_gene['count'] > 20
exceeds_threshold.sum()

7178