In [7]:
import torch
import polars as pl
from enformer_pytorch import Enformer, GenomeIntervalDataset

# filter_train = lambda df: df.filter(pl.col('column_4') == 'train')

# Build an interval dataset: every row of the BED file is paired with the
# reference FASTA and returned as a fixed-length window.
# The BED file's columns 0, 1, 2 must be <chromosome>, <start position>, <end position>.
ctcf_bed_path = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/auged_sorted_GM12878_CTCF_intersect.bed'
hg38_fasta_path = '/home/hxcai/ref_genome/hg38.fa'

# No filter_df_fn is passed, so the commented-out `filter_train` lambda above
# is not applied here.
ds = GenomeIntervalDataset(
    bed_file=ctcf_bed_path,
    fasta_file=hg38_fasta_path,
    return_seq_indices=False,   # False -> one-hot encodings rather than nucleotide indices (ACGTN)
    shift_augs=(-2, 2),         # random shift augmentation from -2 to +2 basepairs
    context_length=196_608,
)

# Number of intervals in the dataset.
n_intervals = len(ds)
print(n_intervals)

# Shape of the first item; the recorded run shows torch.Size([196608, 4]).
first_window = ds[0]
print(first_window.shape)

3434122
torch.Size([196608, 4])


In [5]:
import selene_sdk
from selene_sdk import samplers, sequences, utils

# reference_sequence = sequences.Genome(input_path = '/home/hxcai/ref_genome/hg38.fa', blacklist_regions='hg38')
# features = utils.load_features_list('/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/distinct_features.txt')

# Input files for the selene interval sampler.
hg38_fasta = '/home/hxcai/ref_genome/hg38.fa'
features_txt = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/distinct_features.txt'
targets_bed = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/HepG2_ENCFF913MQB.bed'
intervals_txt = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/deepsea_TF_intervals.txt'

# Built in the same order as the original inline arguments:
# the genome first, then the feature list.
hg38_genome = sequences.Genome(input_path=hg38_fasta, blacklist_regions='hg38')
feature_names = utils.load_features_list(features_txt)

# NOTE(review): save_datasets asks for the "test" split while output_dir is
# None. This cell only samples in "train" mode, so nothing is written here;
# confirm selene does not need a directory once test data is generated.
sampler = samplers.IntervalsSampler(
    reference_sequence=hg38_genome,
    features=feature_names,
    target_path=targets_bed,
    intervals_path=intervals_txt,
    sample_negative=True,
    seed=436,
    validation_holdout=['chr6', 'chr7'],   # chromosomes held out for validation
    test_holdout=['chr8', 'chr9'],         # chromosomes held out for testing
    sequence_length=1000,
    center_bin_to_predict=200,
    feature_thresholds=0.5,
    mode="train",
    save_datasets=["test"],
    output_dir=None,
)

# Draw a single example and report the shape of each returned element.
# The recorded run printed (1, 1000, 4) for x[0] and (1, 1) for x[1].
x = sampler.sample(batch_size=1)
seq_batch = x[0]
print(seq_batch.shape)
target_batch = x[1]
print(target_batch.shape)

(1, 1000, 4)
(1, 1)


In [1]:
import sys

# Make the project directory importable so that `datasets` can be resolved.
# NOTE(review): `append` puts the project directory last on sys.path, so an
# installed package that is also called `datasets` would be imported instead
# of the project module -- confirm this resolves to the intended file.
sys.path.append('/home/hxcai/cell_type_specific_CRE')
from datasets import BedDataset

ctcf_bed_path = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/auged_sorted_GM12878_CTCF_intersect.bed'
# NOTE(review): this cell reads hg19, while the GenomeIntervalDataset cell
# pairs the same BED file with hg38.fa -- confirm which assembly the BED
# coordinates were called against.
hg19_fasta_path = '/home/hxcai/ref_genome/hg19.fa'

# NOTE(review): filter_in_list=['chr1'] is passed, yet the recorded length
# (3434122) equals the count the other loaders report for the whole BED file.
# Either the filter is not being applied or the saved output is stale -- verify.
d = BedDataset(
    bed_file=ctcf_bed_path,
    ref_genome_path=hg19_fasta_path,
    context_length=1000,
    filter_in_list=['chr1'],
)

# Dataset size, then the first item. In the recorded run the item printed as
# a pair of tensors: a 4-row float tensor and tensor([0.]).
n_items = len(d)
print(n_items)
first_item = d[0]
print(first_item)

3434122
(tensor([[0.2500, 0.2500, 0.2500,  ..., 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500,  ..., 0.0000, 1.0000, 0.0000],
        [0.2500, 0.2500, 0.2500,  ..., 1.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500,  ..., 0.0000, 0.0000, 1.0000]]), tensor([0.]))


In [12]:
from kipoiseq.dataloaders import SeqIntervalDl
# Load the same BED intervals through kipoiseq's SeqIntervalDl.
ctcf_bed_path = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/auged_sorted_GM12878_CTCF_intersect.bed'
# NOTE(review): hg19 is used here, whereas the GenomeIntervalDataset cell
# pairs this BED file with hg38.fa -- confirm the intended assembly.
hg19_fasta_path = '/home/hxcai/ref_genome/hg19.fa'

# NOTE(review): `d` is also the name bound by the BedDataset cell; running
# both cells leaves whichever ran last in `d`.
d = SeqIntervalDl(
    intervals_file=ctcf_bed_path,
    fasta_file=hg19_fasta_path,
    auto_resize_len=1000,
)

# Dataset size, then the first item. In the recorded run the item printed as
# a dict with 'inputs', 'targets' and 'metadata' keys.
n_items = len(d)
print(n_items)
first_item = d[0]
print(first_item)

3434122
{'inputs': array([[0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       [0.25, 0.25, 0.25, 0.25],
       ...,
       [0.  , 0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 1.  ]]), 'targets': array([0.]), 'metadata': {'ranges': GenomicRanges(chr='chr1', start=9600, end=10600, id='0', strand='*')}}


In [None]:
import selene_sdk
from selene_sdk import samplers, sequences, utils

# Input files for the selene interval sampler.
hg38_fasta = '/home/hxcai/ref_genome/hg38.fa'
features_txt = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/distinct_features.txt'
targets_bed = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/HepG2_ENCFF913MQB.bed'
intervals_txt = '/home/hxcai/cell_type_specific_CRE/data/ATAC_seq/deepsea_TF_intervals.txt'

# Reference genome and feature list are kept as named objects so they can be
# inspected or reused after the sampler has been built.
reference_sequence = sequences.Genome(input_path=hg38_fasta, blacklist_regions='hg38')
features = utils.load_features_list(features_txt)

# NOTE(review): save_datasets asks for the "test" split while output_dir is
# None. Only "train" mode is sampled below, so nothing is written here;
# confirm selene does not need a directory once test data is generated.
sampler = samplers.IntervalsSampler(
    # data sources
    reference_sequence=reference_sequence,
    features=features,
    target_path=targets_bed,
    intervals_path=intervals_txt,
    # chromosome-level holdouts
    validation_holdout=['chr6', 'chr7'],
    test_holdout=['chr8', 'chr9'],
    # window geometry and labelling threshold
    sequence_length=1000,
    center_bin_to_predict=200,
    feature_thresholds=0.5,
    # sampling behaviour
    sample_negative=True,
    seed=436,
    mode="train",
    save_datasets=["test"],
    output_dir=None,
)


# The guard passes when this code runs as the main module, and skips the
# sampling step if the code is imported from elsewhere.
if __name__ == '__main__':
    x = sampler.sample(batch_size=1)
    seq_batch = x[0]
    print(seq_batch.shape)

