# Find Candidate Enhancer Pairs

This code looks through the 664 previously published enhancer-gene pairs from the Gasperini et al. (2019) dataset. Using these pairs, we identified 113 unique genes that had multiple enhancers affecting them. Then, we generated pairwise combinations to create a list of candidate enhancer pairs to be tested for interaction effects.

Author: Karthik Guruvayurappan

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read in table of enhancer-gene pairs
enhancer_gene_pairs = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/gasperini_enhancer_gene_pairs_suppl_table_2.csv')
enhancer_gene_pairs.head()

Unnamed: 0,Target_Site,ENSG,target_gene_short,Diff_expression_test_raw_pval,Diff_expression_test_fold_change,Diff_expression_test_Empirical_pval,Diff_expression_test_Empirical_adjusted_pval,high_confidence_subset,chr.candidate_enhancer,start.candidate_enhancer,stop.candidate_enhancer
0,chr2.2482,ENSG00000115977,AAK1,0.001451572,0.756542,0.002719,0.098652,True,chr2,69056234,69056865
1,chrX.2695,ENSG00000101986,ABCD1,0.000735184,0.669369,0.001825,0.073014,True,chrX,153250743,153251468
2,chr10.2252,ENSG00000138316,ADAMTS14,2.14e-09,0.447355,0.000449,0.032498,False,chr10,72426863,72427518
3,chr1.8461,ENSG00000143382,ADAMTSL4,0.000115726,0.667658,0.001044,0.050414,True,chr1,150517877,150518596
4,chr11.1006,ENSG00000148926,ADM,0.000588113,0.309175,0.001651,0.067336,True,chr11,9573345,9573973


In [3]:
# get list of genes that have multiple enhancers
enhancer_counts = enhancer_gene_pairs.groupby('ENSG').count()
enhancer_counts = enhancer_counts[enhancer_counts['Target_Site'] > 1]
multiple_enhancer_genes = pd.Series(enhancer_counts.index)
multiple_enhancer_genes.head()

0    ENSG00000005249
1    ENSG00000006042
2    ENSG00000013306
3    ENSG00000023516
4    ENSG00000034510
Name: ENSG, dtype: object

In [4]:
enhancer_pairs = []

# get all enhancer pair combinations
grouped_genes = enhancer_gene_pairs.groupby('ENSG')

for gene in multiple_enhancer_genes:
    
    # get all enhancer-gene pairs for given gene
    gene_enhancers = np.array(grouped_genes.get_group(gene)['Target_Site'])

    for i in np.arange(len(gene_enhancers)):

        enhancer_one = gene_enhancers[i]

        for j in np.arange(1, len(gene_enhancers)):
            
            enhancer_two = gene_enhancers[j]
            enhancer_pairs.append([gene, enhancer_one, enhancer_two])

enhancer_pairs = pd.DataFrame(enhancer_pairs)
enhancer_pairs.columns = ['gene', 'enhancer_1', 'enhancer_2']
enhancer_pairs.head()


Unnamed: 0,gene,enhancer_1,enhancer_2
0,ENSG00000005249,chr7.4040,chr7.4045
1,ENSG00000005249,chr7.4040,chr7.4046
2,ENSG00000005249,chr7.4040,chr7.4041
3,ENSG00000005249,chr7.4040,chr7.4042
4,ENSG00000005249,chr7.4040,chr7.4048


In [5]:
enhancer_pairs.to_csv('/iblm/netapp/data1/external/Gasperini2019/processed/enhancer_pairs_suppl_table_2.csv', index = False)

In [6]:
# get number of enhancer pairs
enhancer_pairs.shape

(660, 3)

In [7]:
# get number of unique genes
enhancer_pairs['gene'].nunique()

113