Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #75 from p-lambda/dev
WILDS v1.2
- Loading branch information
Showing
46 changed files
with
3,017 additions
and
222 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
__pycache__ | ||
build | ||
dist | ||
venv | ||
wilds.egg-info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
## ENCODE feature generation and preprocessing | ||
|
||
#### Requirements | ||
- pyBigWig | ||
|
||
#### Instructions to create Codalab bundle | ||
|
||
Here are instructions to reproduce the Codalab bundle, in a directory path `BUNDLE_ROOT_DIRECTORY`. | ||
|
||
1. Download the human genome sequence (hg19 assembly) in FASTA format from http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/hg19.fa.gz and extract it into `SEQUENCE_PATH`. | ||
|
||
2. Run `python prep_sequence.py --seq_path SEQUENCE_PATH --output_dir OUTPUT_DIR` to write the fasta file found in `SEQUENCE_PATH` to a numpy array archive in `OUTPUT_DIR`. (The dataset loader assumes the archive to be at `<bundle root directory>/sequence.npz`.) | ||
|
||
3. Download the DNase accessibility data. This consists of whole-genome DNase files in bigwig format from https://guanfiles.dcmb.med.umich.edu/Leopard/dnase_bigwig/. Save each file as `<bundle root directory>/DNASE.<celltype>.fc.signal.bigwig`, which is the path the code expects. | ||
|
||
4. Run `python prep_accessibility.py`. This writes samples of each bigwig file to `<bundle root directory>/qn.<celltype>.npy`. These are used at runtime when the dataset loader is initialized, to perform quantile normalization on the DNase accessibility signals. | ||
|
||
5. Download the labels from the challenge into a label directory `<bundle root directory>/labels/` created for this purpose: | ||
- The training chromosome labels for the challenge's training cell types from https://www.synapse.org/#!Synapse:syn7413983 for the relevant transcription factor ( https://www.synapse.org/#!Synapse:syn7415202 for the TF MAX, downloaded as MAX.train.labels.tsv.gz ). | ||
- The training chromosome labels for the challenge's evaluation cell type (liver) from https://www.synapse.org/#!Synapse:syn8077511 for the relevant transcription factor ( https://www.synapse.org/#!Synapse:syn8077648 for the TF MAX, downloaded as MAX.train_wc.labels.tsv.gz ). | ||
- The validation chromosome labels for the challenge's training cell types from https://www.synapse.org/#!Synapse:syn8441154 for the relevant transcription factor ( https://www.synapse.org/#!Synapse:syn8442103 for the TF MAX, downloaded as MAX.val.labels.tsv.gz ). | ||
- The validation chromosome labels for the challenge's evaluation cell type (liver) from https://www.synapse.org/#!Synapse:syn8442975 for the relevant transcription factor ( https://www.synapse.org/#!Synapse:syn8443021 for the TF MAX, downloaded as MAX.test.labels.tsv.gz ). | ||
|
||
6. Run `python prep_metadata_labels.py`. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Adapted from https://github.com/GuanLab/Leopard/blob/master/data/quantile_normalize_bigwig.py | ||
|
||
import argparse, time | ||
import numpy as np | ||
import pyBigWig | ||
|
||
# Human chromosomes in the hg19 assembly, and their sizes in bp.
# Used both to enumerate valid chromosome names and to size the per-chromosome subsamples.
chrom_sizes = {'chr1': 249250621, 'chr10': 135534747, 'chr11': 135006516, 'chr12': 133851895, 'chr13': 115169878, 'chr14': 107349540, 'chr15': 102531392, 'chr16': 90354753, 'chr17': 81195210, 'chr18': 78077248, 'chr19': 59128983, 'chr2': 243199373, 'chr20': 63025520, 'chr21': 48129895, 'chr22': 51304566, 'chr3': 198022430, 'chr4': 191154276, 'chr5': 180915260, 'chr6': 171115067, 'chr7': 159138663, 'chr8': 146364022, 'chr9': 141213431, 'chrX': 155270560}
|
||
|
||
def qn_sample_to_array(
    input_celltypes,
    input_chroms=None,
    subsampling_ratio=1000,
    data_pfx = '/users/abalsubr/wilds/examples/data/encode_v1.0/'
):
    """
    Compute and write the reference distribution of DNase bigwigs for the
    given cell types, for later quantile normalization.

    For each chromosome, a reproducible random subsample of the genome-wide
    DNase signal (~chromosome_length / subsampling_ratio positions) is drawn
    from each cell type's bigwig and averaged across cell types. The
    concatenated, sorted sample is saved to
    ``data_pfx + 'qn.<celltype1>.<celltype2>....npy'``.

    Args:
        input_celltypes: list of cell type names; each must have a
            ``DNASE.<celltype>.fc.signal.bigwig`` file under ``data_pfx``.
        input_chroms: iterable of chromosome names to sample from
            (default: all hg19 chromosomes in ``chrom_sizes``).
        subsampling_ratio: keep roughly 1/subsampling_ratio of each
            chromosome's positions.
        data_pfx: directory containing the bigwig inputs; the output
            ``.npy`` is written there as well.
    """
    if input_chroms is None:
        input_chroms = chrom_sizes.keys()
    qn_chrom_sizes = { k: chrom_sizes[k] for k in input_chroms }
    # Chromosome-specific seeds make each chromosome's subsample reproducible
    # independently of which chromosomes are requested together.
    chr_to_seed = {the_chr: i for i, the_chr in enumerate(qn_chrom_sizes)}

    # Number of positions sampled from each chromosome, in iteration order.
    sample_len = np.ceil(np.array(list(qn_chrom_sizes.values()))/subsampling_ratio).astype(int)
    sample = np.zeros(sum(sample_len))
    start = 0
    for j, the_chr in enumerate(qn_chrom_sizes):
        np.random.seed(chr_to_seed[the_chr])
        for ct in input_celltypes:
            path = data_pfx + 'DNASE.{}.fc.signal.bigwig'.format(ct)
            bw = pyBigWig.open(path)
            try:
                # NaNs (unmapped regions) are treated as zero signal.
                signal = np.nan_to_num(np.array(bw.values(the_chr, 0, qn_chrom_sizes[the_chr])))
            finally:
                # Fix: the original never closed the bigwig handle, leaking
                # a file descriptor per (chromosome, celltype) pair.
                bw.close()
            index = np.random.randint(0, len(signal), sample_len[j])
            # Average the sampled signal across cell types into this
            # chromosome's slice of the output sample.
            sample[start:(start+sample_len[j])] += (1.0/len(input_celltypes))*signal[index]
            print(the_chr, ct)
        start += sample_len[j]
    sample.sort()
    np.save(data_pfx + "qn.{}.npy".format('.'.join(input_celltypes)), sample)
|
||
|
||
if __name__ == '__main__':
    # Chromosomes designated as training data for the WILDS ENCODE dataset.
    train_chroms = ['chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr10', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr22', 'chrX']
    # All cell types for which DNase bigwigs are available.
    all_celltypes = ['H1-hESC', 'HCT116', 'HeLa-S3', 'K562', 'A549', 'GM12878', 'MCF-7', 'HepG2', 'liver']
    # Build one reference distribution per cell type, restricted to the
    # training chromosomes.
    for celltype in all_celltypes:
        qn_sample_to_array([celltype], input_chroms=train_chroms)
Oops, something went wrong.