In [1]:
import tfcomb.annotation
import pandas as pd
# Raise the column display limit so the wide tables shown below are not truncated.
pd.options.display.max_columns = 50

# Annotate binding sites to genes

We will start by loading a `CombObj` containing previously selected co-occurrence rules:

In [2]:
# Pickled CombObj to load (relative to this notebook's directory).
selected_rules_pickle = "../data/GM12878_selected.pkl"

# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
C = tfcomb.CombObj().from_pickle(selected_rules_pickle)

In [3]:
# Preview the loaded rules: one row per TF pair, with the pair count,
# the individual TF counts, and the cosine / zscore columns.
C.rules.head(20)

Unnamed: 0,TF1,TF2,TF1_TF2_count,TF1_count,TF2_count,cosine,zscore
CTCF-RAD21,CTCF,RAD21,1751,2432,2241,0.750038,18.643056
RAD21-SMC3,RAD21,SMC3,1376,2241,1638,0.718192,20.314026
CTCF-SMC3,CTCF,SMC3,1361,2432,1638,0.681898,20.245177
IKZF1-IKZF2,IKZF1,IKZF2,1726,2922,2324,0.662343,11.21596
SMC3-ZNF143,SMC3,ZNF143,1060,1638,1652,0.644383,21.838431
FOS-NFYA,FOS,NFYA,19,45,21,0.61807,58.099184
BATF-JUNB,BATF,JUNB,1135,1854,1866,0.610218,15.351205
RAD21-ZNF143,RAD21,ZNF143,1136,2241,1652,0.590408,15.16818
CTCF-ZNF143,CTCF,ZNF143,1170,2432,1652,0.583713,15.459923
CREB1-CREM,CREB1,CREM,717,1114,1392,0.57578,26.936982


We then use _get_pair_locations_ to get the locations of the co-occurring sites:

In [4]:
# TF pair to look up; the resulting table lists pairs in both orders
# (BATF followed by JUNB, and JUNB followed by BATF).
tf_pair = ("JUNB", "BATF")
locations = C.get_pair_locations(tf_pair)

INFO: Setting up binding sites for counting


In [5]:
# The locations are returned as a dedicated pair-list object, not as a table.
type(locations)

tfcomb.utils.TFBSPairList

In [6]:
# Number of co-occurring site pairs found; this matches TF1_TF2_count
# for the BATF-JUNB rule in C.rules above.
len(locations)

1135

We can show these locations as a table using `.as_table()`:

In [7]:
# Convert the pair list into a table with one row per site pair
# (site1_* / site2_* coordinates plus the distance between the two sites).
location_table = locations.as_table()
location_table

Unnamed: 0,site1_chrom,site1_start,site1_end,site1_name,site1_score,site1_strand,site2_chrom,site2_start,site2_end,site2_name,site2_score,site2_strand,site_distance,site_orientation
0,chr4,699544,699545,BATF,1000,.,chr4,699546,699547,JUNB,791,.,1,
1,chr4,799162,799163,BATF,1000,.,chr4,799177,799178,JUNB,1000,.,14,
2,chr4,924255,924256,JUNB,1000,.,chr4,924307,924308,BATF,1000,.,51,
3,chr4,1218967,1218968,BATF,1000,.,chr4,1218986,1218987,JUNB,1000,.,18,
4,chr4,1710533,1710534,BATF,1000,.,chr4,1710546,1710547,JUNB,718,.,12,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,chr4,185788438,185788439,BATF,1000,.,chr4,185788469,185788470,JUNB,1000,.,30,
1131,chr4,186134462,186134463,BATF,1000,.,chr4,186134479,186134480,JUNB,1000,.,16,
1132,chr4,186839676,186839677,JUNB,1000,.,chr4,186839701,186839702,BATF,1000,.,24,
1133,chr4,189503100,189503101,JUNB,1000,.,chr4,189503109,189503110,BATF,744,.,8,


## Annotate regions

We can now use _annotate_regions_ to annotate these locations to genes:

In [8]:
# Gene models (GTF) used as the annotation source.
genes_gtf = "../data/chr4_genes.gtf"

# The result carries the original pair columns plus annotation columns
# (feature, distance, relative_location, gene_id, gene_name, ...).
annotated = tfcomb.annotation.annotate_regions(
    location_table,
    gtf=genes_gtf,
)

In [9]:
# Show the annotated table; the annotation columns are empty for pairs
# that were not assigned to any gene.
annotated

Unnamed: 0,site1_chrom,site1_start,site1_end,site1_name,site1_score,site1_strand,site2_chrom,site2_start,site2_end,site2_name,site2_score,site2_strand,site_distance,site_orientation,feature,feat_strand,feat_start,feat_end,query_name,distance,feat_anchor,feat_ovl_peak,peak_ovl_feat,relative_location,gene_id,gene_version,gene_name,gene_source,gene_biotype
0,chr4,699544,699545,BATF,1000,.,chr4,699546,699547,JUNB,791,.,1,,gene,+,705747.0,770640.0,query_1,6203.0,start,0.0,0.0,Upstream,ENSG00000185619,18,PCGF3,ensembl_havana,protein_coding
1,chr4,799162,799163,BATF,1000,.,chr4,799177,799178,JUNB,1000,.,14,,,,,,,,,,,,,,,,
2,chr4,924255,924256,JUNB,1000,.,chr4,924307,924308,BATF,1000,.,51,,gene,+,932386.0,958656.0,query_1,8131.0,start,0.0,0.0,Upstream,ENSG00000127419,17,TMEM175,ensembl_havana,protein_coding
3,chr4,1218967,1218968,BATF,1000,.,chr4,1218986,1218987,JUNB,1000,.,18,,,,,,,,,,,,,,,,
4,chr4,1710533,1710534,BATF,1000,.,chr4,1710546,1710547,JUNB,718,.,12,,gene,+,1712857.0,1745171.0,query_1,2324.0,start,0.0,0.0,Upstream,ENSG00000013810,21,TACC3,ensembl_havana,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,chr4,185788438,185788439,BATF,1000,.,chr4,185788469,185788470,JUNB,1000,.,30,,,,,,,,,,,,,,,,
1131,chr4,186134462,186134463,BATF,1000,.,chr4,186134479,186134480,JUNB,1000,.,16,,,,,,,,,,,,,,,,
1132,chr4,186839676,186839677,JUNB,1000,.,chr4,186839701,186839702,BATF,1000,.,24,,,,,,,,,,,,,,,,
1133,chr4,189503100,189503101,JUNB,1000,.,chr4,189503109,189503110,BATF,744,.,8,,,,,,,,,,,,,,,,


By subsetting to the sites with a non-missing `gene_id`, we can highlight the pairs that were annotated to the promoter of a gene:

In [10]:
# Keep only the pairs for which a gene was assigned (gene_id is not missing).
has_gene = annotated["gene_id"].notna()
annotated[has_gene]

Unnamed: 0,site1_chrom,site1_start,site1_end,site1_name,site1_score,site1_strand,site2_chrom,site2_start,site2_end,site2_name,site2_score,site2_strand,site_distance,site_orientation,feature,feat_strand,feat_start,feat_end,query_name,distance,feat_anchor,feat_ovl_peak,peak_ovl_feat,relative_location,gene_id,gene_version,gene_name,gene_source,gene_biotype
0,chr4,699544,699545,BATF,1000,.,chr4,699546,699547,JUNB,791,.,1,,gene,+,705747.0,770640.0,query_1,6203.0,start,0.0,0.0,Upstream,ENSG00000185619,18,PCGF3,ensembl_havana,protein_coding
2,chr4,924255,924256,JUNB,1000,.,chr4,924307,924308,BATF,1000,.,51,,gene,+,932386.0,958656.0,query_1,8131.0,start,0.0,0.0,Upstream,ENSG00000127419,17,TMEM175,ensembl_havana,protein_coding
4,chr4,1710533,1710534,BATF,1000,.,chr4,1710546,1710547,JUNB,718,.,12,,gene,+,1712857.0,1745171.0,query_1,2324.0,start,0.0,0.0,Upstream,ENSG00000013810,21,TACC3,ensembl_havana,protein_coding
13,chr4,2761381,2761382,JUNB,855,.,chr4,2761397,2761398,BATF,1000,.,15,,gene,-,2741647.0,2756342.0,query_1,5039.0,start,0.0,0.0,Upstream,ENSG00000168884,15,TNIP2,ensembl_havana,protein_coding
15,chr4,2787533,2787534,BATF,1000,.,chr4,2787566,2787567,JUNB,954,.,32,,gene,+,2793070.0,2841098.0,query_1,5537.0,start,0.0,0.0,Upstream,ENSG00000087266,17,SH3BP2,ensembl_havana,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,chr4,184734256,184734257,BATF,1000,.,chr4,184734285,184734286,JUNB,580,.,28,,gene,-,184694084.0,184734130.0,query_1,126.0,start,0.0,0.0,Upstream,ENSG00000151725,12,CENPU,ensembl_havana,protein_coding
1117,chr4,185205229,185205230,BATF,607,.,chr4,185205254,185205255,JUNB,878,.,24,,gene,+,185204236.0,185370185.0,query_1,993.0,start,1.0,0.000006,PeakInsideFeature,ENSG00000109762,16,SNX25,ensembl_havana,protein_coding
1124,chr4,185399224,185399225,JUNB,1000,.,chr4,185399267,185399268,BATF,1000,.,42,,gene,-,185363871.0,185395924.0,query_1,3300.0,start,0.0,0.0,Upstream,ENSG00000109771,16,LRP2BP,ensembl_havana,protein_coding
1125,chr4,185405783,185405784,JUNB,1000,.,chr4,185405801,185405802,BATF,1000,.,17,,gene,-,185363871.0,185395924.0,query_1,9859.0,start,0.0,0.0,Upstream,ENSG00000109771,16,LRP2BP,ensembl_havana,protein_coding
