In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyBigWig as bw
import torch
import h5py
from multiprocessing import Pool
from functools import partial

# Genome assembly name; interpolated into the input/output paths built below.
genome = 'mm10'
# Window size in kb; appears as the `pm{window_kb}kb` component of file names.
window_kb = 5

In [2]:
def get_factor_peak_matrix(Peak_tensor,j):
    """Slice a sparse 3-D COO tensor at index `j` of its middle dimension.

    Parameters
    ----------
    Peak_tensor : torch.Tensor
        Sparse COO tensor with three dimensions.
    j : int
        Index along dimension 1 to extract.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of shape ``(Peak_tensor.size(0), Peak_tensor.size(2))``
        containing the entries of `Peak_tensor` whose dimension-1 index is `j`.
    """
    # Coalesce once and reuse the result: each coalesce() call on an
    # uncoalesced tensor redoes the sort/merge of duplicate indices.
    coalesced = Peak_tensor.coalesce()
    indices = coalesced.indices()
    values = coalesced.values()

    # Keep only the non-zeros that sit in slice j, then drop the middle index row.
    keep = indices[1] == j
    out_indices = indices[:, keep][[0, 2], :]
    out_size = (Peak_tensor.size(0), Peak_tensor.size(2))

    return torch.sparse_coo_tensor(out_indices, values[keep], size=out_size)

In [3]:
# Regulatory regions: BED-style table plus the matching .npy array.
infile = f'../results/{genome}/regulatory_regions_pm{window_kb}kb.bed'
Regulatory_regions_bed = pd.read_csv(infile, sep='\t')
infile = f'../results/{genome}/regulatory_regions_pm{window_kb}kb.npy'
Regulatory_regions = np.load(infile)

# Sparse peak tensor.
# NOTE(review): torch.load unpickles the file -- only load files you trust.
infile = f'../results/{genome}/Peak_tensors/Window_pm{window_kb}kb/Sparse_peak_tensor_prom_tf_position.pt'
Peak_tensor = torch.load(infile)

# ChIP experiment table reduced to its 'antigen' column (indexed by the
# file's first column); TFs holds the distinct antigens.
infile = f'../resources/experimentList_v3_{genome}_TFs_only_QC_filtered.tab'
chip_experiment = pd.read_csv(infile, sep='\t',index_col=0)['antigen']
TFs = chip_experiment.unique()

# Motif convolution file, opened read-only and left open so that later cells
# can slice Jaspar['convolution'] directly from disk.
infile = f"../../Jaspar/results/{genome}/Window_pm{window_kb}kb/convolution_PromSeq_PWM.hdf5"
Jaspar = h5py.File(infile, 'r')

# Promoterome table.
infile = f"~/Promoterome/results/{genome}/promoterome_pm{window_kb}kb_filtered_clustered_sorted.bed"
Promoterome = pd.read_csv(infile, sep='\t')

# Lookup tables keyed by promoter id.
promoter_ids = Promoterome.id.values
prom2idx = dict(zip(promoter_ids, Promoterome.index.values))
prom2gene = dict(zip(promoter_ids, Promoterome.gene.values))
prom2prom_start = dict(zip(promoter_ids, Promoterome.start.values))
prom2prom_end = dict(zip(promoter_ids, Promoterome.end.values))

# Annotate each regulatory region (via its 'name' column) with the promoter
# row index, gene name and promoter start/end.
region_names = Regulatory_regions_bed['name']
for column, lookup in (
    ('prom_idx', prom2idx),
    ('gene', prom2gene),
    ('prom_start', prom2prom_start),
    ('prom_end', prom2prom_end),
):
    Regulatory_regions_bed[column] = region_names.map(lookup)

# Dimensions used to size the count matrices in later cells.
N_reg_regions = len(Regulatory_regions_bed)
N_TFs = len(TFs)
N_motifs = Jaspar['convolution'].shape[1]

In [266]:
# Serial computation of N_sf for the first `n_s` regulatory regions.
# N_sf[row, f] is the summed integer value of the bigBed entries overlapping
# the region, averaged over the experiments of TF f that contain the region's
# chromosome and return at least one entries() result.
n_s = 5
N_sf = np.zeros((len(Regulatory_regions_bed[:n_s]),len(TFs)))

# The experiments belonging to each TF do not depend on the region: look them up once.
tf_experiments = [chip_experiment[chip_experiment == tf].index for tf in TFs]

# `row` is the position in N_sf; `s` is the DataFrame index label. Keeping the
# two apart avoids mis-indexing N_sf if the index is not 0..n-1.
for row, s in enumerate(Regulatory_regions_bed.index[:n_s]):
    # `chrom` rather than `chr`, so the builtin chr() is not shadowed.
    chrom = Regulatory_regions_bed.loc[s,'chr']
    start = Regulatory_regions_bed.loc[s,'start']
    end = Regulatory_regions_bed.loc[s,'end']

    for f, experiments in enumerate(tf_experiments):
        n = 0
        for exp in experiments:
            # NOTE(review): the track directory is hard-coded to mm10 rather than `genome`.
            bbfile = bw.open(f'../resources/tracks/mm10/{exp}.05.bb')
            try:
                if chrom not in bbfile.chroms():
                    continue

                val = bbfile.entries(chrom, start, end)
                if val is not None:
                    # The third element of each entry tuple is parsed as an integer.
                    N_sf[row,f] += np.sum([int(v[2]) for v in val])
                    n += 1
            finally:
                # Always release the handle, including on the `continue` path above.
                bbfile.close()
        if n > 0:
            N_sf[row,f] /= n


In [267]:
def get_N_s_chip(TFs,chip_experiment,coord):
    """Average ChIP signal per TF over one genomic interval.

    Parameters
    ----------
    TFs : sequence
        TF (antigen) names; defines the order and length of the result.
    chip_experiment : pandas.Series
        Antigen per experiment; the index values are the experiment ids used
        to build the track path ``../resources/tracks/mm10/{exp}.05.bb``.
    coord : tuple
        ``(chromosome, start, end)`` of the interval to query.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(TFs)``. For each TF: the sum of the integer
        values found in the third field of the overlapping entries, averaged
        over the experiments that contain the chromosome and for which
        ``entries()`` did not return None. TFs with no such experiment stay 0.
    """
    # `chrom` rather than `chr`, so the builtin chr() is not shadowed.
    chrom,start,end = coord
    N_s = np.zeros(len(TFs))
    for f,tf in enumerate(TFs):
        n = 0
        for exp in chip_experiment[chip_experiment == tf].index:
            bbfile = bw.open(f'../resources/tracks/mm10/{exp}.05.bb')
            try:
                if chrom not in bbfile.chroms():
                    continue
                val = bbfile.entries(chrom, start, end)
                if val is not None:
                    N_s[f] += np.sum([int(v[2]) for v in val])
                    n += 1
            finally:
                # Close on every path; the early `continue` previously leaked the handle.
                bbfile.close()
        if n > 0:
            N_s[f] /= n

    return N_s

# Number of worker processes handed to multiprocessing.Pool.
threads = 46

# One (chr, start, end) tuple per region; only the first 10 regions are used here.
first_regions = Regulatory_regions_bed.index[:10]
COORD = [
    (
        Regulatory_regions_bed.loc[region,'chr'],
        Regulatory_regions_bed.loc[region,'start'],
        Regulatory_regions_bed.loc[region,'end'],
    )
    for region in first_regions
]

# Bind the TF list and experiment table, then map the per-region worker over COORD.
region_worker = partial(get_N_s_chip,TFs,chip_experiment)
with Pool(processes=threads) as pool:
    OUT = pool.map(region_worker, COORD)

# Stack the per-region vectors: one row per region, one column per TF.
N_sf = np.array(OUT)

#N_sf = np.load(f'../results/{genome}/N_sf_pm{window_kb}kb.npy')


In [331]:
# Thresholded motif score summed over each regulatory region.
th = 0.5
N_sm = np.zeros((N_reg_regions,N_motifs))
convolution = Jaspar['convolution']

for s in Regulatory_regions_bed.index:
    # progress report every 100 regions
    if s%100 == 0:
        print(s/N_reg_regions*100, '%')

    # promoter row in the convolution dataset, and the region's
    # start/end expressed relative to that promoter's start
    prom_row = Regulatory_regions_bed.loc[s,'prom_idx']
    prom_offset = Regulatory_regions_bed.loc[s,'prom_start']
    rel_start = Regulatory_regions_bed.loc[s,'start'] - prom_offset
    rel_end = Regulatory_regions_bed.loc[s,'end'] - prom_offset

    # read the region's slab, zero everything below the threshold,
    # then sum along the last axis to get one value per motif
    scores = convolution[prom_row,:,rel_start:rel_end]
    scores[scores < th] = 0
    N_sm[s] = scores.sum(axis=1)

np.save(f'../results/{genome}/N_sm_pm{window_kb}kb.npy',N_sm)

0.0 %
0.054211413671034304 %
0.10842282734206861 %
0.1626342410131029 %
0.21684565468413722 %
0.2710570683551715 %
0.3252684820262058 %
0.3794798956972401 %
0.43369130936827444 %
0.48790272303930865 %
0.542114136710343 %
0.5963255503813772 %
0.6505369640524116 %
0.7047483777234459 %
0.7589597913944802 %
0.8131712050655144 %
0.8673826187365489 %
0.9215940324075831 %
0.9758054460786173 %
1.0300168597496517 %
1.084228273420686 %
1.1384396870917202 %
1.1926511007627545 %
1.246862514433789 %
1.3010739281048231 %
1.3552853417758575 %
1.4094967554468918 %
1.4637081691179261 %
1.5179195827889604 %
1.5721309964599948 %
1.6263424101310289 %
1.6805538238020632 %
1.7347652374730977 %
1.7889766511441318 %
1.8431880648151662 %
1.8973994784862007 %
1.9516108921572346 %
2.005822305828269 %
2.0600337194993035 %
2.114245133170338 %
2.168456546841372 %
2.222667960512406 %
2.2768793741834403 %
2.331090787854475 %
2.385302201525509 %
2.4395136151965433 %
2.493725028867578 %
2.547936442538612 %
2.6021478562

In [4]:
# Show the HDF5 dataset's repr (shape and dtype) as the cell output.
Jaspar['convolution']

<HDF5 dataset "convolution": shape (24424, 137, 10000), type "<f4">

In [333]:
# Open the `jaspar.bb` file for the selected genome with pyBigWig;
# the resulting handle `bb` is queried with .entries() in a later cell.
infile = f'../resources/{genome}/jaspar.bb'
bb = bw.open(infile,'r')

chr                             chr1
start                        4858290
end                          4858392
name          chr1_+_4857760_4857814
score                            102
strand                             +
prom_idx                           5
gene                           Tcea1
prom_start                   4852787
prom_end                     4862787
Name: 42, dtype: object


In [363]:
# Collect the entries of `bb` overlapping one example regulatory region into a table.
s = 42
print(Regulatory_regions_bed.loc[s])
# `chrom` rather than `chr`, so the builtin chr() is not shadowed.
chrom = Regulatory_regions_bed.loc[s,'chr']
Match = bb.entries(chrom,Regulatory_regions_bed.loc[s,'start'],Regulatory_regions_bed.loc[s,'end'])
if Match is None:
    # No overlapping entries: fall through and build an empty table
    # instead of failing on len(None).
    Match = []

# Each entry is (start, end, rest); `rest` is tab-separated and its first
# three fields are read as name, integer score and strand. Split it once per entry.
records = []
for match_start, match_end, rest in Match:
    fields = rest.split('\t')
    records.append((chrom, match_start, match_end, fields[0], int(fields[1]), fields[2]))

df = pd.DataFrame(records, columns=['chr','start','end','name','score','strand'])


chr                             chr1
start                        4858290
end                          4858392
name          chr1_+_4857760_4857814
score                            102
strand                             +
prom_idx                           5
gene                           Tcea1
prom_start                   4852787
prom_end                     4862787
Name: 42, dtype: object


In [365]:
# Order the matches from highest to lowest score.
df = df.sort_values(by='score', ascending=False)

In [366]:
# Display the match table (sorted by descending score in the previous cell).
df

Unnamed: 0,chr,start,end,name,score,strand
83,chr1,4858319,4858336,ERF::NHLH1,576,-
193,chr1,4858350,4858365,MAFA,552,+
192,chr1,4858350,4858365,MAFA,539,-
136,chr1,4858326,4858342,ZNF93,516,-
191,chr1,4858350,4858365,MAF,511,+
...,...,...,...,...,...,...
137,chr1,4858327,4858333,Foxn1,174,-
158,chr1,4858330,4858336,Foxn1,174,-
306,chr1,4858387,4858395,VAX2,168,+
179,chr1,4858341,4858349,NR2C2,167,+
