In [1]:
import prep_utils, scipy, numpy as np, time
from scipy import sparse

# Human chromosome names: autosomes chr1..chr22 followed by chrX.
chr_IDs = ['chr{}'.format(number) for number in range(1, 23)]
chr_IDs.append('chrX')

## Sequence

In [3]:
# Read the hg19 FASTA; `a` is looked up by chromosome name in the loop below.
a = prep_utils.read_fasta('sequence/hg19.genome.fa')

# Encode every chromosome listed in chr_IDs (alphabet A/C/G/T plus N), keyed by
# chromosome name so the dict can be unpacked straight into the npz archive.
itime = time.time()
kw_dict = {}
for chrom in chr_IDs:
    seqstr = a[chrom]
    kw_dict[chrom] = prep_utils.one_hot_encode(
        seqstr, alphabet=['A', 'C', 'G', 'T', 'N']
    )
    # Elapsed seconds since encoding started.
    print(chrom, time.time() - itime)

# Write one compressed archive; this can take several (>20) minutes.
print("Saving npz archive...")
np.savez_compressed('codalab_archive/sequence', **kw_dict)
print(time.time() - itime)

# Alternative (disabled): one .npy file per chromosome.
# itime = time.time()
# for chrom in kw_dict:
#     np.save('sequence/{}.npy'.format(chrom), kw_dict[chrom])
#     print(chrom, time.time() - itime)

# Reopen the archive just written (np.savez_compressed appends ".npz").
npz_archive = np.load('codalab_archive/sequence.npz')

62743362it [00:54, 1151676.47it/s]


## DNase

In [12]:
import glob
import os

# This import was commented out, but pyBigWig.open() is called below, so the
# cell raised NameError on a fresh kernel.
import pyBigWig

# DNase signal per cell type: dnases[ctype][chrom] -> float16 array.
dnases = {}
celltypes = ['A549', 'GM12878', 'H1-hESC', 'HCT116', 'HeLa-S3', 'HepG2', 'K562']

for ctype in celltypes:#glob.glob('dnase_bigwigs/*'):
    itime = time.time()
    # ctype = pth.split('/')[1].split('.')[1]
    # NOTE(review): of the names in `celltypes`, only 'K562' passes this filter;
    # 'liver' and 'MCF-7' are not in `celltypes` at all. Confirm which cell
    # types are meant to be processed before relying on the outputs.
    if ctype not in ['liver', 'MCF-7', 'K562']:
        continue
    bw = pyBigWig.open("dnase_bigwigs/DNASE.{}.fc.signal.bigwig".format(ctype))
    try:
        chromsizes = bw.chroms()
        print(ctype, time.time() - itime)

        # np.save does not create missing directories, so make sure the
        # per-cell-type output directory exists before the slow extraction.
        os.makedirs('dnase/{}'.format(ctype), exist_ok=True)

        dn_dict = {}
        for chrom in chromsizes: #chr_IDs:
            # Full-length signal for this chromosome; NaNs are replaced by
            # np.nan_to_num before the cast.
            x = bw.values(chrom, 0, chromsizes[chrom], numpy=True)
            dn_dict[chrom] = np.nan_to_num(x).astype(np.float16)   # half-precision makes things significantly smaller (less time to load)
            print(chrom, time.time() - itime)

            np.save('dnase/{}/{}.npy'.format(ctype, chrom), dn_dict[chrom])
            print(chrom, time.time() - itime)
    finally:
        # Release the BigWig file handle even if extraction or saving fails.
        bw.close()
    dnases[ctype] = dn_dict

for ctype in dnases:
    itime = time.time()
    print(ctype)
    dn_dict = dnases[ctype]

    # Save as npz archive
    np.savez_compressed('codalab_archive/{}_dnase'.format(ctype), **dn_dict)
    print(time.time() - itime)

liver 0.006468534469604492
chr1 8.260387659072876
chr1 13.276052474975586
chr10 17.844778299331665
chr10 25.784512758255005
chr11 30.30143165588379
chr11 33.256701707839966
chr12 37.791435956954956
chr12 40.85292291641235
chr13 44.619521141052246
chr13 47.792500495910645
chr14 51.4214243888855
chr14 53.6813702583313
chr15 56.946401834487915
chr15 59.10466551780701
chr16 61.939475774765015
chr16 63.999470472335815
chr17 66.63648653030396
chr17 68.4126443862915
chr18 71.05454993247986
chr18 72.90085673332214
chr19 74.78594756126404
chr19 76.80954170227051
chr2 85.25815343856812
chr2 95.36479425430298
chr20 97.74516272544861
chr20 99.27151441574097
chr21 100.82207584381104
chr21 103.02815318107605
chr22 104.63926863670349
chr22 106.02127361297607
chr3 112.71910071372986
chr3 117.30491018295288
chr4 123.77405095100403
chr4 128.67069339752197
chr5 134.89299392700195
chr5 138.83413815498352
chr6 144.83386087417603
chr6 149.115407705307
chr7 154.4929392337799
chr7 157.8094253540039
chr8 162.8