05/14/2020

# 1. reproduce max's P-P, E-P, and E-E #'s 

Max's paper: "Promoter/Enhancer Annotations of HiChIP Loops. Promoters were defined as 1 kb regions centered at the TSS, and enhancers were identified as chromHMM enhancers not overlapping with promoters in any cell type. We annotated loop anchors as ‘other’ if the anchor did not contain a promoter or enhancer as defined above."

In [1]:
import pandas as pd
import numpy as np
import subprocess
import seaborn as sns
import pybedtools

# 1A. make promoter bed

In [9]:
# Build one promoter window per annotated TSS: [loc - 1000, end + 1000], floored at 0.
# NOTE(review): this flank is 1000 bp on each side, whereas the definition quoted
# at the top of the notebook says "1 kb regions centered at the TSS" -- confirm
# which window size is intended.
tss_file = '../data/external/TSS_annon_hg19.csv'
tss_raw = pd.read_csv(tss_file, index_col=0)
tss_df = tss_raw.assign(
    start=tss_raw['loc'].apply(lambda pos: max(0, pos - 1000)),
    stop=tss_raw['end'].apply(lambda pos: pos + 1000),
)[['chr', 'start', 'stop', 'TSS']]

In [13]:
# Wrap the promoter windows in a BedTool and preview the first five records.
tss_bed = pybedtools.BedTool.from_dataframe(tss_df)
tss_bed.to_dataframe().head(5)

Unnamed: 0,chrom,start,end,name
0,chr1,1845266,1847266,CALML6
1,chr1,231375933,231377933,C1orf131
2,chr1,241682061,241684061,FH
3,chr1,120201421,120203421,PHGDH
4,chr1,204134465,204136465,REN


# 1B. get enhancer bed

E116 - GM12878
E123 - K562

In [14]:
# chromHMM 15-state "dense" BED tracks: E116 for GM12878, E123 for K562.
gm12878_chromHMM_bed, K562_chromHMM_bed = (
    pybedtools.BedTool(bed_path)
    for bed_path in (
        '../data/external/chromHMM/E116_15_coreMarks_dense.bed',
        '../data/external/chromHMM/E123_15_coreMarks_dense.bed',
    )
)


K562_chromHMM_bed.to_dataframe()

# 1C. get loops/anchors

In [159]:
# Load the H3K27ac loop calls for both cell lines from the same supplementary workbook.
gm_loops, k562_loops = (
    pd.read_excel('../data/external/mumbach2017/supp_table_2.xlsx', sheet_name=sheet)
    for sheet in ('GM12878 H3K27ac Loops', 'K562 H3K27ac Loops')
)
print(gm_loops.shape, k562_loops.shape)

(6396, 6) (1553, 6)


In [160]:
# Preview the first five GM12878 loops.
gm_loops.head(5)

Unnamed: 0,chr_x,start_x,stop_x,chr_y,start_y,stop_y
0,10,101190000,101195000,10,101375000,101380000
1,10,101375000,101380000,10,101470000,101475000
2,10,101605000,101610000,10,101805000,101810000
3,10,101730000,101735000,10,101765000,101770000
4,10,101730000,101735000,10,101805000,101810000


In [161]:
# Label both ends of every loop as 'chr<chrom>_<start>_<stop>':
# 'source' is built from the *_x columns, 'target' from the *_y columns.
for _loops in (gm_loops, k562_loops):
    for _label, _side in (('source', 'x'), ('target', 'y')):
        _loops[_label] = (
            'chr' + _loops['chr_' + _side].map(str)
            + '_' + _loops['start_' + _side].map(str)
            + '_' + _loops['stop_' + _side].map(str)
        )

In [162]:
def _unique_anchors(loops):
    """Return the de-duplicated loop anchors of ``loops``.

    Both loop ends (the *_x and *_y columns) are stacked into one table with
    columns chr/start/stop, exact duplicates are dropped, the chromosome gets a
    'chr' prefix, and an 'anchor_name' column 'chr<chrom>_<start>_<stop>' is added.
    """
    ends = [
        loops[['chr_x', 'start_x', 'stop_x']].rename(
            columns={'chr_x': 'chr', 'start_x': 'start', 'stop_x': 'stop'}),
        loops[['chr_y', 'start_y', 'stop_y']].rename(
            columns={'chr_y': 'chr', 'start_y': 'start', 'stop_y': 'stop'}),
    ]
    anchors = pd.concat(ends, axis=0).drop_duplicates()
    anchors['chr'] = 'chr' + anchors['chr'].map(str)
    anchors['anchor_name'] = (
        anchors['chr'] + '_' + anchors['start'].map(str) + '_' + anchors['stop'].map(str)
    )
    return anchors


gm_anchors = _unique_anchors(gm_loops)
k562_anchors = _unique_anchors(k562_loops)

print(gm_anchors.shape, k562_anchors.shape)

(10714, 4) (2955, 4)


In [163]:
# BedTool views of the anchor tables, used for the interval overlaps below.
gm_anchors_bed, k562_anchors_bed = (
    pybedtools.BedTool.from_dataframe(anchor_table)
    for anchor_table in (gm_anchors, k562_anchors)
)

In [164]:
# Show the GM12878 anchor BedTool as a DataFrame to check the round trip through pybedtools.
gm_anchors_bed.to_dataframe()

Unnamed: 0,chrom,start,end,name
0,chr10,101190000,101195000,chr10_101190000_101195000
1,chr10,101375000,101380000,chr10_101375000_101380000
2,chr10,101605000,101610000,chr10_101605000_101610000
3,chr10,101730000,101735000,chr10_101730000_101735000
4,chr10,101765000,101770000,chr10_101765000_101770000
...,...,...,...,...
10709,chrX,9640000,9650000,chrX_9640000_9650000
10710,chrX,94325000,94330000,chrX_94325000_94330000
10711,chrX,9640000,9645000,chrX_9640000_9645000
10712,chrX,9960000,9970000,chrX_9960000_9970000


### 1Ci for gms

In [70]:
# GM12878 anchors overlapping any promoter window; wa/wb keeps both the anchor
# and the promoter record for every overlap, so anchor names can repeat.
anchor_bed_prom = gm_anchors_bed.intersect(tss_bed, wa=True, wb=True)
prom_hits = anchor_bed_prom.to_dataframe()
promoter_anchors = prom_hits['name'].unique()
print(promoter_anchors.shape)

(2162,)


In [69]:
# Overlap GM12878 anchors with the chromHMM track. header=None yields positional
# column labels; below, column 3 is read as the anchor name and column 7 as the
# chromHMM state label.
anchor_bed_chromHMM = (
    gm_anchors_bed
    .intersect(gm12878_chromHMM_bed, wa=True, wb=True)
    .to_dataframe(header=None)
)
in_enh_state = anchor_bed_chromHMM[7].isin(['7_Enh', '6_EnhG'])
anchor_bed_chromHMM_enh = anchor_bed_chromHMM[in_enh_state]
enhancer_anchors = anchor_bed_chromHMM_enh[3].unique()
print(enhancer_anchors.shape)  # enhancer anchors before removing promoter anchors
# An anchor that also overlaps a promoter counts as a promoter, not an enhancer.
enhancer_anchors = np.array(list(set(enhancer_anchors) - set(promoter_anchors)))
print(enhancer_anchors.shape)  # enhancer anchors after removing promoter anchors


(7107,)
(5680,)


['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 13 fields; you can supply custom names with the `names` kwarg
  "`names` kwarg" % (self.file_type, _names, self.field_count())


In [82]:
# Share of GM12878 anchors annotated as promoter (P) and/or enhancer (E).
# The labels say '%' but the printed values are fractions in [0, 1].
# NOTE(review): promoter_anchors / enhancer_anchors are reassigned in the K562
# section. The output recorded for this cell equals 902/10714 and 1439/10714,
# i.e. the K562 set sizes over the GM12878 anchor count, so it looks like it was
# produced by an out-of-order run -- re-run the notebook top to bottom.
n_anchors = gm_anchors.shape[0]
n_prom = promoter_anchors.shape[0]
n_enh = enhancer_anchors.shape[0]
print('num anchors', n_anchors)
print('% anchors captured by P or E: ', (n_enh + n_prom) / n_anchors)
print('% anchors captured by P: ', n_prom / n_anchors)
print('% anchors captured by E: ', n_enh / n_anchors)


num anchors 10714
% anchors captured by P or E:  0.21849915997759942
% anchors captured by P:  0.08418891170431211
% anchors captured by E:  0.13431024827328727


In [165]:
# Classify GM12878 loops by the annotation of their two anchors.
# loop_df is an alias of gm_loops (not a copy): the flag columns are added to gm_loops.
# NOTE(review): the output recorded for this cell (139 classified loops) disagrees
# with the GM12878 numbers quoted in the notes further down (3737). Since
# promoter_anchors / enhancer_anchors are reassigned by the K562 section, this may
# come from an out-of-order run -- confirm by re-running top to bottom.
loop_df = gm_loops
for _tag, _anchor_set in (('P', promoter_anchors), ('E', enhancer_anchors)):
    for _end in ('source', 'target'):
        loop_df[_end + '_' + _tag] = loop_df[_end].isin(_anchor_set)

src_P, tgt_P = loop_df['source_P'], loop_df['target_P']
src_E, tgt_E = loop_df['source_E'], loop_df['target_E']
loop_df['is_P_P'] = src_P & tgt_P
loop_df['is_P_E'] = (src_P & tgt_E) | (src_E & tgt_P)
loop_df['is_E_E'] = src_E & tgt_E

n_pp = loop_df['is_P_P'].sum()
n_pe = loop_df['is_P_E'].sum()
n_ee = loop_df['is_E_E'].sum()
print('num loops', loop_df.shape[0])
print('num P-P: ', n_pp)
print('num P-E: ', n_pe)
print('num E-E: ', n_ee)
# Loops whose two anchors are both annotated (P or E); the denominator below.
sum_loops = n_pp + n_pe + n_ee
print('num loops to focus on', sum_loops)
print('% P-P: ', n_pp / sum_loops)
print('% P-E: ', n_pe / sum_loops)
print('% E-E: ', n_ee / sum_loops)


num loops 6396
num P-P:  28
num P-E:  68
num E-E:  43
num loops to focus on 139
% P-P:  0.2014388489208633
% P-E:  0.4892086330935252
% E-E:  0.30935251798561153


### 1Cii for K562

In [77]:
# K562 anchors overlapping any promoter window (wa/wb keeps both records per overlap).
# NOTE: this rebinds promoter_anchors, which held the GM12878 set above.
anchor_bed_prom = k562_anchors_bed.intersect(tss_bed, wa=True, wb=True)
prom_hits = anchor_bed_prom.to_dataframe()
promoter_anchors = prom_hits['name'].unique()
print(promoter_anchors.shape)

(902,)


In [79]:
# Overlap K562 anchors with the K562 chromHMM track. header=None yields positional
# column labels; column 3 is read as the anchor name, column 7 as the state label.
# NOTE: this rebinds enhancer_anchors, which held the GM12878 set above.
anchor_bed_chromHMM = (
    k562_anchors_bed
    .intersect(K562_chromHMM_bed, wa=True, wb=True)
    .to_dataframe(header=None)
)
in_enh_state = anchor_bed_chromHMM[7].isin(['7_Enh', '6_EnhG'])
anchor_bed_chromHMM_enh = anchor_bed_chromHMM[in_enh_state]
enhancer_anchors = anchor_bed_chromHMM_enh[3].unique()
print(enhancer_anchors.shape)  # enhancer anchors before removing promoter anchors
# An anchor that also overlaps a promoter counts as a promoter, not an enhancer.
enhancer_anchors = np.array(list(set(enhancer_anchors) - set(promoter_anchors)))
print(enhancer_anchors.shape)  # enhancer anchors after removing promoter anchors


(2191,)
(1439,)


['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 13 fields; you can supply custom names with the `names` kwarg
  "`names` kwarg" % (self.file_type, _names, self.field_count())


In [83]:
# Share of K562 anchors annotated as promoter (P) and/or enhancer (E).
# The labels say '%' but the printed values are fractions in [0, 1].
n_anchors = k562_anchors.shape[0]
n_prom = promoter_anchors.shape[0]
n_enh = enhancer_anchors.shape[0]
print('num anchors', n_anchors)
print('% anchors captured by P or E: ', (n_enh + n_prom) / n_anchors)
print('% anchors captured by P: ', n_prom / n_anchors)
print('% anchors captured by E: ', n_enh / n_anchors)


num anchors 2955
% anchors captured by P or E:  0.7922165820642978
% anchors captured by P:  0.30524534686971233
% anchors captured by E:  0.48697123519458546


In [81]:
# Classify K562 loops by the annotation of their two anchors.
# loop_df is an alias of k562_loops (not a copy): the flag columns are added to k562_loops.
loop_df = k562_loops
for _tag, _anchor_set in (('P', promoter_anchors), ('E', enhancer_anchors)):
    for _end in ('source', 'target'):
        loop_df[_end + '_' + _tag] = loop_df[_end].isin(_anchor_set)

src_P, tgt_P = loop_df['source_P'], loop_df['target_P']
src_E, tgt_E = loop_df['source_E'], loop_df['target_E']
loop_df['is_P_P'] = src_P & tgt_P
loop_df['is_P_E'] = (src_P & tgt_E) | (src_E & tgt_P)
loop_df['is_E_E'] = src_E & tgt_E

n_pp = loop_df['is_P_P'].sum()
n_pe = loop_df['is_P_E'].sum()
n_ee = loop_df['is_E_E'].sum()
print('num loops', loop_df.shape[0])
print('num P-P: ', n_pp)
print('num P-E: ', n_pe)
print('num E-E: ', n_ee)
# Loops whose two anchors are both annotated (P or E); the denominator below.
sum_loops = n_pp + n_pe + n_ee
print('num loops to focus on', sum_loops)
print('% P-P: ', n_pp / sum_loops)
print('% P-E: ', n_pe / sum_loops)
print('% E-E: ', n_ee / sum_loops)


num loops 1553
num P-P:  118
num P-E:  459
num E-E:  394
num loops to focus on 971
% P-P:  0.121524201853759
% P-E:  0.4727085478887745
% E-E:  0.4057672502574665


```
kk i compared to max's results





9:21
this is max's GM12878 data percentage:
num anchors 10714
% anchors captured by P or E:  0.21849915997759942
% anchors captured by P:  0.08418891170431211
% anchors captured by E:  0.13431024827328727
num loops 6396
num P-P:  338
num P-E:  1263
num E-E:  2136
num loops to focus on 3737
% P-P:  0.09044688252609044
% P-E:  0.337971635001338
% E-E:  0.5715814824725716
9:21
i modified our stuff
9:22
P-P (don't care about accessibility here
9:22
P-E = P-PIRacc (so a promoter region connecting to an accessible region that is not a promoter)
9:22
E-E = PIRacc-PIRacc
9:22
and then you get these percentages
9:22
perc_is_P_P_adj 0.08977712805519222
perc_is_P_PIRacc_adj 0.3681875304557462
perc_is_PIRacc_PIRacc_adj 0.5420353414890615
9:27
this is max's K562 data
num anchors 2955
% anchors captured by P or E:  0.7922165820642978
% anchors captured by P:  0.30524534686971233
% anchors captured by E:  0.48697123519458546
num loops 1553
num P-P:  118
num P-E:  459
num E-E:  394
num loops to focus on 971
% P-P:  0.121524201853759
% P-E:  0.4727085478887745
% E-E:  0.4057672502574665
```

# 2. compare our GM loops/anchors to max's loops/anchors

# 2A anchors

In [166]:
tissue = 'GM12878'

# Paths to our own merged loop / anchor tables for this tissue.
loop_file = '../data/interim/merged/loops/' + tissue + '.loops.csv'
anchor_file = '../data/interim/merged/anchors/' + tissue + '.anchors.csv'

# NOTE: loop_df is rebound here to our loop table (the sections above used the
# same name as an alias for Max's tables).
loop_df = pd.read_csv(loop_file, index_col=0)

# Put the coordinate columns first so the table can be handed to pybedtools.
anchor_df = pd.read_csv(anchor_file, index_col=0).loc[:, ['chr', 'start', 'end', 'anchors', 'sample']]
anchor_bed = pybedtools.BedTool.from_dataframe(anchor_df)

# anchor_annon_file = os.path.join(anchor_annon_dir, tissue+'_annon.bed')
# anchor_annon_df = pd.read_csv(anchor_annon_file, names=['chr', 'start', 'stop', 'name', 'atac_chr', 'atac_start', 'atac_stop'], sep='\t')
# anchor_to_count_dict = anchor_annon_df.groupby('name').name.count().to_dict()
# anchors_acc = anchor_to_count_dict.keys()

# print(anchor_df.shape)

# promoter_anchor_file = os.path.join(promoter_annon_dir, 'promoter_'+'GM12878'+'_annon.bed')
# promoter_anchor_df = pd.read_csv(promoter_anchor_file, names=['chr', 'start', 'stop', 'tss', 'anchor_chr', 'anchor_start', 
#                                                          'anchor_stop', 'anchor', 'overlap'], sep='\t')
# promoter_anchors = promoter_anchor_df.anchor.unique()
# print(promoter_anchors.shape)

In [167]:
# Overlap-based comparison of Max's GM12878 anchors with ours.
# With pybedtools, `a + b` keeps the features of `a` that overlap `b`
# (intersect -u) and `a - b` keeps those that do not (intersect -v), so
# "shared" counts Max's anchors that overlap at least one of our anchors.
shared_anchors = gm_anchors_bed + anchor_bed

# union_bed used to be printed below while its definition was commented out, so
# the cell raised NameError on a fresh kernel. Build it here: stack both anchor
# sets, keep the coordinates, then sort and merge overlapping intervals.
# NOTE(review): the previously recorded union count came from kernel state that
# is no longer in the notebook; the value may differ after re-running.
union_anchors_df = pd.concat(
    [gm_anchors_bed.to_dataframe(), anchor_bed.to_dataframe()]
)[['chrom', 'start', 'end']]
union_bed = pybedtools.BedTool.from_dataframe(union_anchors_df).sort().merge()

print('# anchors we have: ',anchor_bed.count()) 
print('# anchors Max has: ', gm_anchors_bed.count())
print('# shared anchors: ', shared_anchors.count())
print('# anchors Max had we didnt: ',(gm_anchors_bed-anchor_bed).count())
print('# anchors we had Max didnt: ',(anchor_bed-gm_anchors_bed).count())
print('# total union set of anchors: ', union_bed.count())


# anchors we have:  83619
# anchors Max has:  10714
# shared anchors:  8727
# anchors Max had we didnt:  1987
# anchors we had Max didnt:  73831
# total union set of anchors:  35088


In [168]:
# Fraction of Max's anchors that overlap at least one of our anchors.
shared_anchors.count()/gm_anchors_bed.count()

0.814541721112563

In [169]:
# Ratio of our anchor count to Max's anchor count.
anchor_bed.count()/gm_anchors_bed.count()

7.804648123949972

We capture 81.5% of the anchors that Max predicted (sensitivity); we also have almost 8x as many anchor points.

# 2B loops

In [170]:
def name_loop_row(row):
    """Return the loop identifier '<first>::<second>' for one loop row.

    ``row.source`` is placed first when ``row.x_before_y`` is truthy;
    otherwise ``row.target`` is placed first.
    """
    ordered = (row.source, row.target) if row.x_before_y else (row.target, row.source)
    return '::'.join(ordered)


In [172]:
# Keep only intra-chromosomal loops and give each loop a name in which the anchor
# with the smaller start comes first, so the same loop gets the same name in both
# tables regardless of which end was recorded as source/target.
# .copy() makes each filtered table an independent frame, so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning.
gm_loops = gm_loops[gm_loops.chr_x == gm_loops.chr_y].copy()
gm_loops['x_before_y'] = gm_loops.start_y > gm_loops.start_x
gm_loops['loop_name'] = gm_loops.apply(name_loop_row, axis=1)

# Our table only carries the anchor ids, so recover chr/start/stop from them.
# Split from the right with n=2 so that a chromosome/contig name that itself
# contains '_' still produces exactly three fields.
loop_df[['chr_x','start_x','stop_x']] = loop_df.source.str.rsplit('_', n=2, expand=True)
loop_df[['chr_y','start_y','stop_y']] = loop_df.target.str.rsplit('_', n=2, expand=True)
loop_df = loop_df[loop_df.chr_x == loop_df.chr_y].copy()
# The split fields are strings; compare starts numerically.
loop_df['x_before_y'] = loop_df.start_y.map(int) > loop_df.start_x.map(int)
loop_df['loop_name'] = loop_df.apply(name_loop_row, axis=1)

In [173]:
# Unique loop names per call set, used for exact-match comparison.
our_loops_set, max_loops_set = set(loop_df['loop_name']), set(gm_loops['loop_name'])


In [174]:
# Loops count as shared only when both anchor ids match exactly.
n_shared_loops = len(our_loops_set & max_loops_set)
print('num our loops: ', len(our_loops_set))
print('num max loops: ', len(max_loops_set))
print('shared loops: ', n_shared_loops)

num our loops:  167347
num max loops:  6396
shared loops:  3003


In [175]:
# Fraction of Max's loops that we recover with identical anchors.
len(our_loops_set & max_loops_set) / len(max_loops_set)

0.4695121951219512

In [176]:
# Ratio of our loop count to Max's loop count.
len(our_loops_set)/len(max_loops_set)

26.164321450906815

We capture 47.0% of the loops that Max predicted (sensitivity); we also have about 26x as many loops.