In [21]:
import os
import tempfile
from pathlib import Path
import pandas as pd
import numpy as np
import pyBigWig
import importlib
import xarray as xr
import tqdm
import pandas as pd
import crandata
crandata = importlib.reload(crandata)


import crandata.yanndata
import crandata.chrom_io
crandata.yanndata = importlib.reload(crandata.yanndata)
crandata.chrom_io = importlib.reload(crandata.chrom_io)
crandata._anndatamodule = importlib.reload(crandata._anndatamodule)
# crandata._dataloader = importlib.reload(crandata._dataloader)
# crandata._dataset = importlib.reload(crandata._dataset)
crandata._anndatamodule = importlib.reload(crandata._anndatamodule)
from crandata._anndatamodule import MetaAnnDataModule

# Create temporary directories for beds, bigwigs, etc.
temp_dir = tempfile.TemporaryDirectory()
base_dir = Path(temp_dir.name)
beds_dir = base_dir / "beds"
bigwigs_dir = base_dir / "bigwigs"
beds_dir.mkdir(exist_ok=True)
bigwigs_dir.mkdir(exist_ok=True)

# Create a chromsizes file
chromsizes_file = base_dir / "chrom.sizes"
with open(chromsizes_file, "w") as f:
    f.write("chr1\t1000\n")

# Create two BED files (ClassA and ClassB)
bed_data_A = pd.DataFrame({0: ["chr1", "chr1"],
                           1: [100, 300],
                           2: [200, 400]})
bed_data_B = pd.DataFrame({0: ["chr1", "chr1"],
                           1: [150, 350],
                           2: [250, 450]})
bed_file_A = beds_dir / "ClassA.bed"
bed_file_B = beds_dir / "ClassB.bed"
bed_data_A.to_csv(bed_file_A, sep="\t", header=False, index=False)
bed_data_B.to_csv(bed_file_B, sep="\t", header=False, index=False)

# Create a consensus BED file
consensus = pd.DataFrame({0: ["chr1", "chr1", "chr1"],
                          1: [100, 300, 350],
                          2: [200, 400, 450]})
consensus_file = base_dir / "consensus.bed"
consensus.to_csv(consensus_file, sep="\t", header=False, index=False)

# Create a bigWig file with a single chromosome region
bigwig_file = bigwigs_dir / "test.bw"
bw = pyBigWig.open(str(bigwig_file), "w")
bw.addHeader([("chr1", 1000)])
bw.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[5.0])
bw.close()

# Set parameters for extraction
target_region_width = 100
backed_path = os.path.join(base_dir, "chrom_data.h5")

# Create the CrAnData object from the bigWig files and consensus regions
adata = crandata.chrom_io.import_bigwigs(
    bigwigs_folder=bigwigs_dir,
    regions_file=consensus_file,
    backed_path=backed_path,
    target_region_width=target_region_width,
    chromsizes_file=chromsizes_file,
    
)

# Add a random obsm entry
adata.obsm['gex'] = xr.DataArray(np.random.randn(adata.obs.shape[0], 100),
                                 dims=['types', 'genes'])

# Create a synthetic BEDP file for Hi-C contacts and add contacts to adata.varp
synthetic_bedp = pd.DataFrame({
    0: ["chr1", "chr1"],
    1: [100, 300],
    2: [200, 400],
    3: ["chr1", "chr1"],
    4: [150, 350],
    5: [250, 450],
    6: [10, 20]
})
synthetic_bedp_file = base_dir / "synthetic.bedp"
synthetic_bedp.to_csv(synthetic_bedp_file, sep="\t", header=False, index=False)

from crandata.chrom_io import add_contact_strengths_to_varp
contacts = add_contact_strengths_to_varp(adata, [str(synthetic_bedp_file)], key="hic_contacts")

print("Added Hi-C contact data to adata.varp['hic_contacts']:")
print("Shape:", adata.varp["hic_contacts"].shape)
print(adata.varp["hic_contacts"])


# Write to HDF5 and load back.
h5_path = os.path.join(base_dir, "adata.h5")
adata.to_h5(h5_path)
adata_loaded = crandata.yanndata.CrAnData.from_h5(h5_path,backed=['X'])
print("\nDirectory contents:", os.listdir(base_dir))
print("\nLoaded CrAnData from HDF5:")
print(adata_loaded)
print("obs:")
print(adata_loaded.obs)
print("var:")
print(adata_loaded.var)
print("varp keys:", list(adata_loaded.varp.keys()))
if "hic_contacts" in adata_loaded.varp:
    print("Hi-C contact data shape:", adata_loaded.varp["hic_contacts"].shape)
    print(adata_loaded.varp["hic_contacts"])

# ----- Extended test: Create multiple Yanndata-based modules and a MetaAnnDataModule, then test dataloading -----

# Create two copies of the loaded CrAnData (simulate two different datasets/species)
adata1 = adata_loaded.copy()
adata2 = adata_loaded.copy()
# Ensure each has a 'split' column
adata1.var["split"] = "train"
adata2.var["split"] = "train"

# Create a dummy FASTA file for the genome (with a single record for chr1)
fasta_file = base_dir / "chr1.fa"
with open(fasta_file, "w") as f:
    f.write(">chr1\n")
    f.write("A" * 1000 + "\n")

# Instead of passing a string, create a Genome object.
from crandata._genome import Genome
dummy_genome = Genome(str(fasta_file), chrom_sizes=str(chromsizes_file))

# Import MetaAnnDataModule (using the package name so that relative imports resolve)


# Instantiate MetaAnnDataModule with the two datasets and corresponding genomes.
meta_module = MetaAnnDataModule(
    adatas=[adata1, adata2],
    genomes=[dummy_genome, dummy_genome],
    data_sources={'y': 'X','hic':'varp/hic_contacts','gex':'obsm/gex'},
    in_memory=True,
    random_reverse_complement=True,
    max_stochastic_shift=5,
    deterministic_shift=False,
    shuffle_obs=True,
    shuffle=True,
    batch_size=3,    # small batch size for testing
    epoch_size=10    # small epoch size for quick testing
)

# Setup the meta module for the "fit" stage (train/val)
meta_module.setup("fit")

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader

print("\nIterating over a couple of training batches from MetaAnnDataModule:")
for i, batch in enumerate(tqdm.tqdm(meta_train_dl.data)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    # if i == 1:
    #     break

print(os.listdir(base_dir))

temp_dir.cleanup()


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/matthew.schmitz/Matthew/utils/miniforge3/envs/crested/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/scratch/fast/46103/ipykernel_3995050/2205277207.py", line 15, in <module>
    import crandata.yanndata
ModuleNotFoundError: No module named 'crandata.yanndata'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/matthew.schmitz/Matthew/utils/miniforge3/envs/crested/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/matthew.schmitz/Matthew/utils/miniforge3/envs/crested/lib/python3.12/site-packages/IPython/core/ultratb.py", line 1457, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^

In [None]:
import crandata
import importlib
crandata = importlib.reload(crandata)
crandata.chrom_io = importlib.reload(crandata.chrom_io)
crandata.crandata = importlib.reload(crandata.crandata)


In [2]:
import crandata
import os

genome_path = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/Team/Matthew/genome/onehots/mouse'
fasta_file = os.path.join(genome_path,'mouse.fa')
chrom_sizes = os.path.join(genome_path,'mouse.fa.sizes')
annotation_gtf_file = os.path.join(genome_path,'mouse.annotation.gtf')

genome = crandata.Genome(fasta_file, chrom_sizes, annotation_gtf_file)

# Set parameters for binning.
WINDOW_SIZE = 2114
OFFSET = WINDOW_SIZE // 2  # e.g., 50% overlap
N_THRESHOLD = 0.3

# Optionally specify an output path for the BED file.
OUTPUT_BED = os.path.join(genome_path, "binned_genome.bed")

# Generate bins and optionally write to disk.
binned_df = crandata.bin_genome(genome, WINDOW_SIZE, OFFSET, n_threshold=N_THRESHOLD, output_path=OUTPUT_BED)

print("Filtered bins:")
print(binned_df)


In [None]:

bigwigs_dir = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/SpinalCord/manuscript/ATAC/mouse/Group_bigwig/'
n_bins = WINDOW_SIZE//50

adata = crandata.chrom_io.import_bigwigs(
    bigwigs_folder=bigwigs_dir,
    regions_file=OUTPUT_BED,
    backed_path='/home/matthew.schmitz/Matthew/mouse_spc_test.h5',
    target_region_width=WINDOW_SIZE,
    chromsizes_file=chrom_sizes,
    n_bins=n_bins   
)

100%|██████████| 51/51 [00:00<00:00, 716.03it/s]
[32m2025-03-08 23:20:33.435[0m | [1mINFO    [0m | [36mcrandata.chrom_io[0m:[36mimport_bigwigs[0m:[36m417[0m - [1mExtracting values from 49 bigWig files...[0m
32it [31:27, 58.93s/it]

In [20]:
adata

NameError: name 'adata' is not defined

In [None]:
adata.X[0,:,:]

In [None]:
adata.var

In [None]:
adata.obs

In [35]:
adata = crandata.crandata.CrAnData.from_h5('/home/matthew.schmitz/Matthew/mouse_spc_test.h5')

In [None]:
import crandata
import os
import crested
from tqdm import tqdm

In [None]:
genomes = {}
beds = {}
chromsizes_files = {}
bed_files = {}
species = ['mouse','human','macaque']

WINDOW_SIZE = 2114
OFFSET = WINDOW_SIZE // 2  # e.g., 50% overlap
N_THRESHOLD = 0.3
n_bins = WINDOW_SIZE//50

In [None]:
for s in species:
    genome_path = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/Team/Matthew/genome/onehots/'+s
    fasta_file = os.path.join(genome_path,s+'.fa')
    chrom_sizes = os.path.join(genome_path,s+'.fa.sizes')
    annotation_gtf_file = os.path.join(genome_path,s+'.annotation.gtf')
    chromsizes_files[s] = chrom_sizes
    genome = crandata.Genome(fasta_file, chrom_sizes, annotation_gtf_file)
    genomes[s] = genome
    # Set parameters for binning.
    
    # Optionally specify an output path for the BED file.
    OUTPUT_BED = os.path.join(genome_path, "binned_genome.bed")
    bed_files[s] = OUTPUT_BED
    # Generate bins and optionally write to disk.
    # binned_df = crandata.bin_genome(genome, WINDOW_SIZE, OFFSET, n_threshold=N_THRESHOLD, output_path=OUTPUT_BED)
    # print("Filtered bins:")
    # print(binned_df)

In [None]:
adatas = {}

for s in species:
    bigwigs_dir = os.path.join('/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/SpinalCord/manuscript/ATAC',s,'Group_bigwig')
    # adata = crandata.chrom_io.import_bigwigs(
    #     bigwigs_folder=bigwigs_dir,
    #     regions_file=bed_files[s],
    #     backed_path='/home/matthew.schmitz/Matthew/'+s+'_spc_test.h5',
    #     target_region_width=WINDOW_SIZE,
    #     chromsizes_file=chromsizes_files[s],
    #     n_bins=n_bins   
    # )
    # adatas[s] = adata
    adatas[s] = crandata.crandata.CrAnData.from_h5('/home/matthew.schmitz/Matthew/'+s+'_spc_test.h5')
    

In [None]:
import numpy as np
adatas['mouse'].uns['chunk_size'] = 512
adatas['human'].uns['chunk_size'] = 512
adatas['macaque'].uns['chunk_size'] = 512
adatas['mouse'].var["chunk_index"] = np.arange(adatas['mouse'].var.shape[0]) // 512
adatas['human'].var["chunk_index"] = np.arange(adatas['human'].var.shape[0]) // 512
adatas['macaque'].var["chunk_index"] = np.arange(adatas['macaque'].var.shape[0]) // 512


In [None]:
for s in adatas.keys():
    crested.pp.train_val_test_split(
        adatas[s], strategy="region", val_size=0.1, test_size=0.1, random_state=42
    )


In [1]:
import crandata
import importlib
importlib.reload(crandata)
importlib.reload(crandata.crandata)
importlib.reload(crandata._anndatamodule)
importlib.reload(crandata._dataloader)
importlib.reload(crandata._dataset)

NameError: name 'MetaAnnDataModule' is not defined

In [None]:
meta_module = crandata._anndatamodule.MetaAnnDataModule(
    adatas=list(adatas.values()),
    genomes=list(genomes.values()),
    data_sources={'y': 'X'},
    in_memory=False,
    random_reverse_complement=True,
    max_stochastic_shift=10,
    deterministic_shift=False,
    shuffle_obs=False,
    shuffle=True,
    batch_size=32,    # small batch siqrze for testing
    epoch_size=1000000    # small epoch size for quick testing
)

# Setup the meta module for the "fit" stage (train/val)
meta_module.setup("fit")

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader

print("\nIterating over a couple of training batches from MetaAnnDataModule:")
for i, batch in enumerate(tqdm(meta_train_dl.data)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i == 5:
        break


In [None]:
adatas

In [10]:
import time
import crandata

# --- Patch the AnnDataset __getitem__ method ---
# Save the original __getitem__
orig_getitem = crandata._dataset.AnnDataset.__getitem__

def timed_getitem(self, idx):
    start = time.perf_counter()
    result = orig_getitem(self, idx)
    elapsed = time.perf_counter() - start
    print(f"AnnDataset.__getitem__ for index {idx} took {elapsed:.4f} seconds")
    return result

# Monkey-patch the method
crandata._dataset.AnnDataset.__getitem__ = timed_getitem

# --- Patch the collate function in AnnDataLoader ---
# Save the original collate function (which is an instance method)
orig_collate_fn = crandata._dataloader.AnnDataLoader._collate_fn

def timed_collate_fn(self, batch):
    start = time.perf_counter()
    result = orig_collate_fn(self, batch)
    elapsed = time.perf_counter() - start
    print(f"Collate function took {elapsed:.4f} seconds for batch of {len(batch)} samples")
    return result

# Monkey-patch the collate function
crandata._dataloader.AnnDataLoader._collate_fn = timed_collate_fn

# --- Testing loop ---
meta_train_dl = meta_module.train_dataloader

print("\nIterating over training batches with timing instrumentation:")
for i, batch in enumerate(meta_train_dl.data):
    print(f"\nBatch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i >= 2:  # Process a few batches for testing.
        break





Iterating over training batches with timing instrumentation:
AnnDataset.__getitem__ for index 424667 took 0.0355 seconds
AnnDataset.__getitem__ for index 270952 took 0.0246 seconds
AnnDataset.__getitem__ for index 2680552 took 0.0159 seconds
AnnDataset.__getitem__ for index 743535 took 0.0179 seconds
AnnDataset.__getitem__ for index 87648 took 0.0514 seconds
AnnDataset.__getitem__ for index 211748 took 0.0152 seconds
AnnDataset.__getitem__ for index 1684820 took 0.0207 seconds
AnnDataset.__getitem__ for index 885137 took 0.0151 seconds
AnnDataset.__getitem__ for index 1467636 took 0.0189 seconds
AnnDataset.__getitem__ for index 373764 took 0.0160 seconds
AnnDataset.__getitem__ for index 2649283 took 0.0121 seconds
AnnDataset.__getitem__ for index 1231476 took 0.0222 seconds
AnnDataset.__getitem__ for index 715932 took 0.0109 seconds
AnnDataset.__getitem__ for index 783216 took 0.0178 seconds
AnnDataset.__getitem__ for index 2250074 took 0.0124 seconds
AnnDataset.__getitem__ for index 

In [None]:
import time
import crandata

# --- Patch the LazyH5Array __getitem__ method to measure disk I/O ---
orig_lazy_getitem = crandata.crandata.LazyH5Array.__getitem__

def timed_lazy_getitem(self, key):
    start = time.perf_counter()
    result = orig_lazy_getitem(self, key)
    elapsed = time.perf_counter() - start
    print(f"LazyH5Array.__getitem__ for key {key} took {elapsed:.4f} seconds")
    return result

crandata.crandata.LazyH5Array.__getitem__ = timed_lazy_getitem

# --- Then run your testing loop again ---
meta_train_dl = meta_module.train_dataloader

print("\nIterating over training batches with extended timing instrumentation:")
for i, batch in enumerate(meta_train_dl.data):
    print(f"\nBatch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i >= 2:  # Process a few batches for testing.
        break


In [26]:
model_architecture = crested.tl.zoo.chrombpnet(
    seq_len=2114, num_classes=51
)


In [27]:
import keras
# Create your own configuration
# I recommend trying this for peak regression with a weighted cosine mse log loss function
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss = crested.tl.losses.CosineMSELogLoss(max_weight=100, multiplier=1)
metrics = [
    keras.metrics.MeanAbsoluteError(),
    keras.metrics.MeanSquaredError(),
    keras.metrics.CosineSimilarity(axis=1),
    crested.tl.metrics.PearsonCorrelation(),
    crested.tl.metrics.ConcordanceCorrelationCoefficient(),
    crested.tl.metrics.PearsonCorrelationLog(),
    crested.tl.metrics.ZeroPenaltyMetric(),
]

alternative_config = crested.tl.TaskConfig(optimizer, loss, metrics)
print(alternative_config)


NameError: name 'keras' is not defined

In [None]:
trainer = crested.tl.Crested(
    data=datamodule,
    model=model_architecture,
    config=alternative_config,
    project_name="mouse_biccn",  # change to your liking
    run_name="basemodel",  # change to your liking
    logger="wandb",  # or None, 'dvc', 'tensorboard'
    seed=7,  # For reproducibility
)
# train the model
trainer.fit(
    epochs=60,
    learning_rate_reduce_patience=3,
    early_stopping_patience=6,
)
