In [1]:
import os
import tempfile
from pathlib import Path
import pandas as pd
import numpy as np
import pyBigWig
import importlib
import xarray as xr
import tqdm
import pandas as pd
# Development convenience: reload crandata and the submodules used below so that
# edits to the package source are picked up without restarting the kernel.
import crandata
crandata = importlib.reload(crandata)

# Import every submodule explicitly before reloading it, so the attribute lookups
# below do not depend on what crandata's package __init__ happens to import.
import crandata.yanndata
import crandata.chrom_io
import crandata._anndatamodule

crandata.yanndata = importlib.reload(crandata.yanndata)
crandata.chrom_io = importlib.reload(crandata.chrom_io)
# Reloads of _dataloader / _dataset are kept disabled, as in the original cell.
# crandata._dataloader = importlib.reload(crandata._dataloader)
# crandata._dataset = importlib.reload(crandata._dataset)
# _anndatamodule is reloaded exactly once; the original repeated this statement
# with no live reload in between, which had no additional effect.
crandata._anndatamodule = importlib.reload(crandata._anndatamodule)
# Bind the class only after the reload so the name refers to the fresh definition.
from crandata._anndatamodule import MetaAnnDataModule

# Scratch workspace: everything below is written under a TemporaryDirectory,
# with one sub-folder for BED files and one for bigWig files.
temp_dir = tempfile.TemporaryDirectory()
base_dir = Path(temp_dir.name)
beds_dir, bigwigs_dir = base_dir / "beds", base_dir / "bigwigs"
beds_dir.mkdir(exist_ok=True)
bigwigs_dir.mkdir(exist_ok=True)


def _chr1_intervals(starts, ends):
    """Build a 3-column (chrom, start, end) frame of chr1 intervals with integer column labels."""
    return pd.DataFrame({0: ["chr1"] * len(starts), 1: list(starts), 2: list(ends)})


def _write_bed(frame, path):
    """Write ``frame`` to ``path`` as tab-separated text without header or index."""
    frame.to_csv(path, sep="\t", header=False, index=False)


# chrom.sizes: a single 1000 bp chromosome.
chromsizes_file = base_dir / "chrom.sizes"
chromsizes_file.write_text("chr1\t1000\n")

# Two per-class BED files (ClassA and ClassB), two intervals each.
bed_data_A = _chr1_intervals([100, 300], [200, 400])
bed_data_B = _chr1_intervals([150, 350], [250, 450])
bed_file_A = beds_dir / "ClassA.bed"
bed_file_B = beds_dir / "ClassB.bed"
_write_bed(bed_data_A, bed_file_A)
_write_bed(bed_data_B, bed_file_B)

# Consensus BED file with three regions.
consensus = _chr1_intervals([100, 300, 350], [200, 400, 450])
consensus_file = base_dir / "consensus.bed"
_write_bed(consensus, consensus_file)

# Create a bigWig file with a single chromosome region: chr1 (length 1000)
# carrying one interval [0, 1000) with the constant value 5.0.
bigwig_file = bigwigs_dir / "test.bw"
bw = pyBigWig.open(str(bigwig_file), "w")
try:
    bw.addHeader([("chr1", 1000)])
    bw.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[5.0])
finally:
    # Always release the write handle, even if writing the header or entries fails,
    # so a failed run does not leave the file open for the rest of the kernel session.
    bw.close()

# Extraction settings: region width passed to the importer and the path of the
# backing HDF5 file inside the scratch directory.
target_region_width = 100
backed_path = str(base_dir / "chrom_data.h5")

# Build the CrAnData object from the bigWig folder and the consensus regions.
import_kwargs = {
    "bigwigs_folder": bigwigs_dir,
    "regions_file": consensus_file,
    "backed_path": backed_path,
    "target_region_width": target_region_width,
    "chromsizes_file": chromsizes_file,
}
adata = crandata.chrom_io.import_bigwigs(**import_kwargs)

# Add a random obsm entry: one row per obs entry and 100 columns of standard-normal
# values, with dims named 'types' and 'genes'.
# A dedicated, seeded generator keeps the fixture identical across runs
# (the original drew from the unseeded global NumPy state).
gex_rng = np.random.default_rng(0)
adata.obsm['gex'] = xr.DataArray(gex_rng.standard_normal((adata.obs.shape[0], 100)),
                                 dims=['types', 'genes'])

# Synthetic BEDP file for Hi-C contacts: seven headerless, tab-separated columns,
# i.e. two (chrom, start, end) triples followed by a numeric column
# (presumably the contact strength; confirm against add_contact_strengths_to_varp).
bedp_columns = [
    ["chr1", "chr1"],
    [100, 300],
    [200, 400],
    ["chr1", "chr1"],
    [150, 350],
    [250, 450],
    [10, 20],
]
# Column labels are the integer positions 0..6.
synthetic_bedp = pd.DataFrame(dict(enumerate(bedp_columns)))
synthetic_bedp_file = base_dir / "synthetic.bedp"
synthetic_bedp.to_csv(synthetic_bedp_file, sep="\t", header=False, index=False)

# Register the contacts on adata under varp['hic_contacts'].
from crandata.chrom_io import add_contact_strengths_to_varp
contacts = add_contact_strengths_to_varp(adata, [str(synthetic_bedp_file)], key="hic_contacts")

# Report what was stored.
print("Added Hi-C contact data to adata.varp['hic_contacts']:")
hic_varp = adata.varp["hic_contacts"]
print("Shape:", hic_varp.shape)
print(hic_varp)


# Round-trip through HDF5: write adata, then load it back with X kept backed.
h5_path = str(base_dir / "adata.h5")
adata.to_h5(h5_path)
adata_loaded = crandata.yanndata.CrAnData.from_h5(h5_path, backed=['X'])

# Summarise the scratch directory and the reloaded object.
print("\nDirectory contents:", os.listdir(base_dir))
print("\nLoaded CrAnData from HDF5:", adata_loaded, sep="\n")
print("obs:", adata_loaded.obs, sep="\n")
print("var:", adata_loaded.var, sep="\n")
print("varp keys:", list(adata_loaded.varp.keys()))

# Show the contact data only if it is present after the reload.
if "hic_contacts" in adata_loaded.varp:
    loaded_hic = adata_loaded.varp["hic_contacts"]
    print("Hi-C contact data shape:", loaded_hic.shape)
    print(loaded_hic)

# ----- Extended test: build two datasets and a genome for MetaAnnDataModule -----

# Two copies of the reloaded object stand in for two datasets/species.
adata1, adata2 = adata_loaded.copy(), adata_loaded.copy()
# Every region of both copies is assigned to the 'train' split.
for dataset in (adata1, adata2):
    dataset.var["split"] = "train"

# Dummy FASTA genome: a single chr1 record consisting of 1000 'A' bases.
fasta_file = base_dir / "chr1.fa"
fasta_file.write_text(">chr1\n" + "A" * 1000 + "\n")

# Wrap the FASTA and chrom.sizes paths in a Genome object (paths passed as strings).
from crandata._genome import Genome
dummy_genome = Genome(str(fasta_file), chrom_sizes=str(chromsizes_file))

# MetaAnnDataModule is already in scope from the import/reload section at the top.

# Configuration for the meta module: two datasets sharing the same dummy genome.
module_config = dict(
    adatas=[adata1, adata2],
    genomes=[dummy_genome] * 2,
    data_sources={'y': 'X', 'hic': 'varp/hic_contacts', 'gex': 'obsm/gex'},
    in_memory=True,
    random_reverse_complement=True,
    max_stochastic_shift=5,
    deterministic_shift=False,
    shuffle_obs=True,
    shuffle=True,
    batch_size=3,    # small batch size for testing
    epoch_size=10,   # small epoch size for quick testing
)
meta_module = MetaAnnDataModule(**module_config)

# Prepare the module for the "fit" stage (train/val).
meta_module.setup("fit")

# Fetch the training dataloader from the meta module.
meta_train_dl = meta_module.train_dataloader

print("\nIterating over a couple of training batches from MetaAnnDataModule:")
# Every batch is consumed; the early exit after the second batch stays disabled.
batch_stream = tqdm.tqdm(meta_train_dl.data)
for batch_idx, batch in enumerate(batch_stream):
    print(f"Meta Batch {batch_idx}:")
    # One line per entry of the batch, reporting its shape.
    for name, value in batch.items():
        print(f"  {name}: shape {value.shape}")

# Final listing of the scratch directory, then remove it.
print(os.listdir(base_dir))

temp_dir.cleanup()


[32m2025-03-07 20:01:56.898[0m | [1mINFO    [0m | [36mcrandata.chrom_io[0m:[36mimport_bigwigs[0m:[36m347[0m - [1mExtracting values from 1 bigWig files...[0m


Temporary BED lines: ['chr1\t100\t200\tchr1:100-200\t100\t200\n', 'chr1\t300\t400\tchr1:300-400\t300\t400\n', 'chr1\t350\t450\tchr1:350-450\t350\t450\n']
Extracted values shapes: [(100,), (100,), (100,)]
Temporary BED lines: ['chr1\t100\t200\tchr1:100-200\t100\t200\n', 'chr1\t300\t400\tchr1:300-400\t300\t400\n', 'chr1\t350\t450\tchr1:350-450\t350\t450\n']
Extracted values shapes: [(100,), (100,), (100,)]
Wrote row 1/1 from /scratch/fast/45820/tmpbx4dlr6q/bigwigs/test.bw
Added Hi-C contact data to adata.varp['hic_contacts']:
Shape: (3, 3, 1)
<xarray.DataArray (var_0: 3, var_1: 3, hic_file: 1)> Size: 140B
<COO: shape=(3, 3, 1), dtype=float32, nnz=5, fill_value=0.0>
Coordinates:
  * var_0     (var_0) object 24B 'chr1:100-200' 'chr1:300-400' 'chr1:350-450'
  * var_1     (var_1) object 24B 'chr1:100-200' 'chr1:300-400' 'chr1:350-450'
  * hic_file  (hic_file) int64 8B 0

Directory contents: ['beds', 'bigwigs', 'chrom.sizes', 'consensus.bed', 'chrom_data.h5', 'synthetic.bedp', 'adata.h5']

Lo

100%|██████████| 3/3 [00:00<00:00, 20197.29it/s]

2025-03-07T20:02:06.401042-0800 INFO Loading sequences into memory...



100%|██████████| 3/3 [00:00<00:00, 17573.90it/s]

2025-03-07T20:02:06.418742-0800 INFO Loading sequences into memory...



100%|██████████| 3/3 [00:00<00:00, 18669.01it/s]

2025-03-07T20:02:06.432898-0800 INFO Loading sequences into memory...



100%|██████████| 3/3 [00:00<00:00, 17697.49it/s]



Iterating over a couple of training batches from MetaAnnDataModule:


100%|██████████| 4/4 [00:02<00:00,  1.74it/s]

Meta Batch 0:
  sequence: shape torch.Size([100, 3, 4])
  y: shape torch.Size([1, 3, 100])
  hic: shape torch.Size([3, 3, 1])
  gex: shape torch.Size([1, 3, 100])
Meta Batch 1:
  sequence: shape torch.Size([100, 3, 4])
  y: shape torch.Size([1, 3, 100])
  hic: shape torch.Size([3, 3, 1])
  gex: shape torch.Size([1, 3, 100])
Meta Batch 2:
  sequence: shape torch.Size([100, 3, 4])
  y: shape torch.Size([1, 3, 100])
  hic: shape torch.Size([3, 3, 1])
  gex: shape torch.Size([1, 3, 100])
Meta Batch 3:
  sequence: shape torch.Size([100, 1, 4])
  y: shape torch.Size([1, 1, 100])
  hic: shape torch.Size([3, 1, 1])
  gex: shape torch.Size([1, 1, 100])
['beds', 'bigwigs', 'chrom.sizes', 'consensus.bed', 'chrom_data.h5', 'synthetic.bedp', 'adata.h5', 'chr1.fa', 'chr1.fa.fai']





In [2]:
# Display the 'file_path' column of obs on the object returned by import_bigwigs.
adata.obs['file_path']

test    /scratch/fast/45820/tmpbx4dlr6q/bigwigs/test.bw
Name: file_path, dtype: object

In [3]:
# Display the same 'file_path' column on the object reloaded from HDF5,
# for comparison with adata.obs['file_path'].
adata_loaded.obs['file_path']

test    /scratch/fast/45820/tmpbx4dlr6q/bigwigs/test.bw
Name: file_path, dtype: object

In [4]:
# Inspect the type of X on the reloaded object (it was loaded with backed=['X']).
type(adata_loaded.X)

crandata.yanndata._XWrapper

In [2]:
# NOTE(review): the recorded output of this cell is a NameError and its execution
# count drops back to 2, i.e. it was run in a kernel where the setup cell had not
# been executed. Re-run the setup cell before evaluating this expression.
# NOTE(review): adata_loaded was loaded with backed=['X'] from a file under temp_dir,
# and the setup cell ends with temp_dir.cleanup(); presumably X can no longer be read
# from disk after that. Confirm before relying on this cell.
adata_loaded.X

NameError: name 'adata_loaded' is not defined