In [1]:
import os
import tempfile
from pathlib import Path
import pandas as pd
import numpy as np
import pyBigWig
import copy
import xarray as xr
import tqdm

# Import our new module system and utilities.
from crandata import CrAnDataModule, MetaCrAnDataModule, CrAnData
from crandata.chrom_io import import_bigwigs
from crandata.seq_io import add_genome_sequences_to_crandata, DNATransform

# Create temporary directories for synthetic data.
temp_dir = tempfile.TemporaryDirectory()
base_dir = Path(temp_dir.name)
beds_dir = base_dir / "beds"
bigwigs_dir = base_dir / "bigwigs"
beds_dir.mkdir(exist_ok=True)
bigwigs_dir.mkdir(exist_ok=True)

# Create a chromsizes file.
chromsizes_file = base_dir / "chrom.sizes"
with open(chromsizes_file, "w") as f:
    f.write("chr1\t1000\n")

# Create two BED files (simulate two different classes).
bed_data_A = pd.DataFrame({
    0: ["chr1", "chr1"],
    1: [100, 300],
    2: [200, 400]
})
bed_data_B = pd.DataFrame({
    0: ["chr1", "chr1"],
    1: [150, 350],
    2: [250, 450]
})
bed_file_A = beds_dir / "ClassA.bed"
bed_file_B = beds_dir / "ClassB.bed"
bed_data_A.to_csv(bed_file_A, sep="\t", header=False, index=False)
bed_data_B.to_csv(bed_file_B, sep="\t", header=False, index=False)

# Create a consensus BED file.
consensus = pd.DataFrame({
    0: ["chr1", "chr1", "chr1"],
    1: [100, 300, 350],
    2: [200, 400, 450]
})
consensus_file = base_dir / "consensus.bed"
consensus.to_csv(consensus_file, sep="\t", header=False, index=False)

# Create two bigWig files.
bigwig_file1 = bigwigs_dir / "test.bw"
bw1 = pyBigWig.open(str(bigwig_file1), "w")
bw1.addHeader([("chr1", 1000)])
bw1.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[5.0])
bw1.close()

bigwig_file2 = bigwigs_dir / "test2.bw"
bw2 = pyBigWig.open(str(bigwig_file2), "w")
bw2.addHeader([("chr1", 1000)])
bw2.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[4.0])
bw2.close()

# Set extraction parameters.
target_region_width = 100
backed_path = base_dir / "chrom_data.zarr"

# Create the CrAnData object from bigWig files and consensus regions.
adata = import_bigwigs(
    bigwigs_folder=str(bigwigs_dir),
    regions_file=str(consensus_file),
    backed_path=str(backed_path),
    target_region_width=target_region_width,
    chromsizes_file=str(chromsizes_file),
)

# Create a dummy FASTA file for a genome.
fasta_file = base_dir / "chr1.fa"
with open(fasta_file, "w") as f:
    f.write(">chr1\n")
    f.write("A" * 1000 + "\n")

# Create a Genome object.
from crandata._genome import Genome
dummy_genome = Genome(str(fasta_file), chrom_sizes=str(chromsizes_file))

# Add sequences to the CrAnData using the provided seq_io utility.
# Here we use the consensus regions as our ranges.
consensus.columns = ['chrom', 'start', 'end']
adata = add_genome_sequences_to_crandata(adata, consensus, dummy_genome)

# Write the CrAnData object to disk and then reload it to ensure sequences are out-of-memory.
adata.to_zarr(str(backed_path),mode='a')
adata_loaded = CrAnData.open_zarr(str(backed_path))
print("Loaded CrAnData:")
print(adata_loaded)

# Create two copies to simulate two datasets (e.g. two species), and add a "split" column in var metadata.
adata1 = copy.deepcopy(adata_loaded)
adata2 = copy.deepcopy(adata_loaded)
adata1["var-_-split"] = xr.DataArray(np.full(adata1.sizes["var"], "train"), dims=["var"])
adata2["var-_-split"] = xr.DataArray(np.full(adata2.sizes["var"], "train"), dims=["var"])

# Create a DNATransform instance.
transform = DNATransform(out_len=80, random_rc=True, max_shift=5)

# Instantiate the MetaCrAnDataModule with the two datasets.
# Note: The batch_size is now 3, matching the number of consensus regions (var dimension).
meta_module = MetaCrAnDataModule(
    adatas=[adata1, adata2],
    batch_size=3,        # adjust batch size to not exceed var length (3)
    shuffle=True,
    dnatransform=transform,
    epoch_size=10
)

# Setup each underlying module for the "train" stage.
for mod in meta_module.modules:
    mod.setup(state="train")

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader
print("\nIterating over a couple of training batches from MetaCrAnDataModule:")
for i, batch in enumerate(tqdm.tqdm(meta_train_dl)):
    print(batch)
    print(f"\nMeta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i >= 1:
        break

print("\nTemporary directory contents:")
print(os.listdir(base_dir))
temp_dir.cleanup()


  cls = super().__new__(mcls, name, bases, namespace, **kwargs)
100%|██████████| 2/2 [00:00<00:00, 318.26it/s]
[32m2025-03-22 01:07:01.957[0m | [1mINFO    [0m | [36mcrandata.chrom_io[0m:[36mimport_bigwigs[0m:[36m308[0m - [1mExtracting values from 2 bigWig files...[0m
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  ret

Loaded CrAnData:
CrAnData object
Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'var-_-end', 'var-_-chrom', 'obs-_-file_path', 'var-_-start', 'var-_-index', 'sequences']
Coordinates: ['obs', 'var', 'seq_bins']


Iterating over a couple of training batches from MetaCrAnDataModule:


1it [00:00,  1.13it/s]

samples [CrAnData object
Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'var-_-end', 'var-_-chrom', 'obs-_-file_path', 'var-_-start', 'var-_-index', 'sequences', 'var-_-split']
Coordinates: ['obs', 'var', 'seq_bins']
, CrAnData object
Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'var-_-end', 'var-_-chrom', 'obs-_-file_path', 'var-_-start', 'var-_-index', 'sequences', 'var-_-split']
Coordinates: ['obs', 'var', 'seq_bins']
, CrAnData object
Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'var-_-end', 'var-_-chrom', 'obs-_-file_path', 'var-_-start', 'var-_-index', 'sequences', 'var-_-split']
Coordinates: ['obs', 'var', 'seq_bins']
]
CrAnData object
Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'var-_-end', 'var-_-chrom', 'obs-_-file_path', 'var-_-start', 'var-_-index', 'sequences', 'var-_-split']
Coordinates: ['obs', 'var', 'seq_bins']


Meta Batch 0:
  obs-_-index: shape (2,)
  var-_-chunk_index: shape (3,)
  X: shape (2, 3, 100)
  var-_-end: shape (3,)





In [None]:
sdfs

In [None]:
# Should the fill in _extract_values_from_bigwig actually be 0? Can we filter var where all is 0/nan without loading everything into memory?

In [1]:
import crandata
import xarray as xr
import pandas as pd
import numpy as np
import os
import crested
from tqdm import tqdm

  cls = super().__new__(mcls, name, bases, namespace, **kwargs)


In [2]:
genomes = {}
beds = {}
chromsizes_files = {}
bed_files = {}
species = ['mouse','human','macaque']

WINDOW_SIZE = 2114
OFFSET = WINDOW_SIZE // 2  # e.g., 50% overlap
N_THRESHOLD = 0.3
n_bins = WINDOW_SIZE//50

In [4]:
for s in species:
    genome_path = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/Team/Matthew/genome/onehots/'+s
    fasta_file = os.path.join(genome_path,s+'.fa')
    chrom_sizes = os.path.join(genome_path,s+'.fa.sizes')
    annotation_gtf_file = os.path.join(genome_path,s+'.annotation.gtf')
    chromsizes_files[s] = chrom_sizes
    genome = crandata.Genome(fasta_file, chrom_sizes, annotation_gtf_file)
    genomes[s] = genome
    # Set parameters for binning.
    
    # Optionally specify an output path for the BED file.
    OUTPUT_BED = os.path.join(genome_path, "binned_genome.bed")
    bed_files[s] = OUTPUT_BED
    # Generate bins and optionally write to disk.
    binned_df = crandata.bin_genome(genome, WINDOW_SIZE, OFFSET, n_threshold=N_THRESHOLD, output_path=OUTPUT_BED).reset_index(drop=True)
    print("Filtered bins:")
    print(binned_df)

Calculating N content: 100%|██████████| 2583507/2583507 [02:37<00:00, 16430.01it/s]


Filtered bins:
              chrom    start      end    prop_n
0              chr1  2999767  3001881  0.110638
1              chr1  3002938  3005052  0.085579
2              chr1  3003995  3006109  0.000000
3              chr1  3005052  3007166  0.000000
4              chr1  3006109  3008223  0.000000
...             ...      ...      ...       ...
2509462  JH584292.1     8457    10571  0.000000
2509463  JH584292.1     9514    11628  0.000000
2509464  JH584292.1    10571    12685  0.000000
2509465  JH584292.1    11628    13742  0.000000
2509466  JH584292.1    12685    14799  0.000000

[2509467 rows x 4 columns]


Calculating N content: 100%|██████████| 2932321/2932321 [03:03<00:00, 15951.14it/s]


Filtered bins:
              chrom  start    end   prop_n
0              chr1   9514  11628  0.23026
1              chr1  10571  12685  0.00000
2              chr1  11628  13742  0.00000
3              chr1  12685  14799  0.00000
4              chr1  13742  15856  0.00000
...             ...    ...    ...      ...
2786513  KI270518.1      1   2115  0.00000
2786514  KI270530.1      1   2115  0.00000
2786515  KI270304.1      1   2115  0.00000
2786516  KI270418.1      1   2115  0.00000
2786517  KI270424.1      1   2115  0.00000

[2786518 rows x 4 columns]


Calculating N content: 100%|██████████| 2806701/2806701 [02:50<00:00, 16497.35it/s]


Filtered bins:
               chrom  start    end  prop_n
0        NC_041754.1      1   2115     0.0
1        NC_041754.1   1058   3172     0.0
2        NC_041754.1   2115   4229     0.0
3        NC_041754.1   3172   5286     0.0
4        NC_041754.1   4229   6343     0.0
...              ...    ...    ...     ...
2773980  NC_005943.1   9514  11628     0.0
2773981  NC_005943.1  10571  12685     0.0
2773982  NC_005943.1  11628  13742     0.0
2773983  NC_005943.1  12685  14799     0.0
2773984  NC_005943.1  13742  15856     0.0

[2773985 rows x 4 columns]


In [5]:
import importlib
crandata.chrom_io = importlib.reload(crandata.chrom_io)

In [3]:
adatas = {}

for s in species:
    bigwigs_dir = os.path.join('/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/SpinalCord/manuscript/ATAC',s,'Group_bigwig')
    # adata = crandata.chrom_io.import_bigwigs(
    #     bigwigs_folder=bigwigs_dir,
    #     regions_file=bed_files[s],
    #     backed_path='/home/matthew.schmitz/Matthew/'+s+'_spc_test.zarr',
    #     target_region_width=WINDOW_SIZE,
    #     chromsizes_file=chromsizes_files[s],
    #     target = 'raw',
    #     n_bins=n_bins
    # )
    # adatas[s] = adata
    adatas[s] = crandata.crandata.CrAnData.open_zarr('/home/matthew.schmitz/Matthew/'+s+'_spc_test.zarr')
    

  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)


In [4]:
adatas

{'mouse': CrAnData object
 Array names: ['obs-_-index', 'var-_-chunk_index', 'X', 'obs-_-file_path', 'var-_-chrom', 'var-_-end', 'var-_-start', 'var-_-index']
 Coordinates: ['obs', 'var', 'seq_bins'],
 'human': CrAnData object
 Array names: ['obs-_-index', 'obs-_-file_path', 'X', 'var-_-chunk_index', 'var-_-start', 'var-_-end', 'var-_-chrom', 'var-_-index']
 Coordinates: ['obs', 'seq_bins', 'var'],
 'macaque': CrAnData object
 Array names: ['obs-_-index', 'X', 'var-_-start', 'var-_-chunk_index', 'var-_-chrom', 'obs-_-file_path', 'var-_-end', 'var-_-index']
 Coordinates: ['obs', 'seq_bins', 'var']}

In [None]:
# import numpy as np
# adatas['mouse'].uns['chunk_size'] = 512
# adatas['human'].uns['chunk_size'] = 512
# adatas['macaque'].uns['chunk_size'] = 512
# adatas['mouse'].var["chunk_index"] = np.arange(adatas['mouse'].var.shape[0]) // 512
# adatas['human'].var["chunk_index"] = np.arange(adatas['human'].var.shape[0]) // 512
# adatas['macaque'].var["chunk_index"] = np.arange(adatas['macaque'].var.shape[0]) // 512


In [5]:
for s in adatas.keys():
    crested.pp.train_val_test_split(
        adatas[s], strategy="region", val_size=0.1, test_size=0.1, random_state=42
    )


AttributeError: 'CrAnData' object has no attribute 'var_names'

In [11]:
meta_module = crandata.MetaCrAnDataModule(
    adatas=list(adatas.values()),
    # genomes=list(genomes.values()),
    # data_sources={'y': 'X'},
    in_memory=False,
    random_reverse_complement=True,
    max_stochastic_shift=10,
    deterministic_shift=False,
    shuffle_obs=False, obs_alignment = 'intersect',
    shuffle=True,
    batch_size=32,    # small batch size for testing
    epoch_size=1000000    # small epoch size for quick testing
)

# Setup the meta module for the "fit" stage (train/val)
meta_module.setup("fit")

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader

print("\nIterating over a couple of training batches from MetaAnnDataModule:")
for i, batch in enumerate(tqdm(meta_train_dl.data)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i == 5:
        break


TypeError: MetaCrAnDataModule.__init__() got an unexpected keyword argument 'in_memory'

In [12]:
import numpy as np
import xarray as xr
import xbatcher

# Create an xarray Dataset with 5 variables of various shapes and dimensions
ds = xr.Dataset({
    'var1': (('time', 'lat', 'lon'), np.random.rand(20, 10, 15)),      # e.g. climate data
    'var2': (('x', 'y'), np.random.rand(30, 40)),                        # e.g. 2D image-like array
    'var3': (('sample', 'feature'), np.random.rand(50, 5)),              # e.g. tabular features
    'var4': (('channel', 'height', 'width'), np.random.rand(3, 32, 32)),   # e.g. multi-channel image
    'var5': (('time', 'level'), np.random.rand(20, 8))                   # e.g. time-series with levels
})

# Example 1: Use xbatcher on var1 along the 'time' dimension
# Each batch will contain 5 time points. (20/5 = 4 batches expected.)
bgen1 = xbatcher.BatchGenerator(ds=ds[['var1']], input_dims={'time': 5})
print(f'bgen1 has {len(bgen1)} batches')
print("First batch from var1:")
print(bgen1[0])

# Example 2: Use xbatcher on var2 along the 'x' dimension
# Each batch will contain 10 x-points. (30/10 = 3 batches expected.)
bgen2 = xbatcher.BatchGenerator(ds=ds[['var2']], input_dims={'x': 10})
print(f'\nbgen2 has {len(bgen2)} batches')
print("First batch from var2:")
print(bgen2[0])

# Example 3: Use xbatcher on var5 along the 'time' dimension
# Here the chosen window size (7) does not evenly divide the 20 time points,
# so the remainder will be discarded.
bgen3 = xbatcher.BatchGenerator(ds=ds[['var5']], input_dims={'time': 7})
print(f'\nbgen3 has {len(bgen3)} batches')
print("First batch from var5:")
print(bgen3[0])

# Example 4: Batch generator for var1 with overlapping inputs
# Each sample has 5 time points and each batch is 10 time points long,
# with an overlap of 4 time points between consecutive samples.
bgen4 = xbatcher.BatchGenerator(
    ds=ds[['var1']],
    input_dims={'time': 5},
    batch_dims={'time': 10},
    concat_input_dims=True,
    input_overlap={'time': 4}
)
print(f'\nbgen4 (with overlap) has {len(bgen4)} batches')
print("First batch from var1 with overlap:")
print(bgen4[0])

# Example 5: Iterate through batches in bgen1 and print the shape of var1 in each batch
print("\nIterating over batches in bgen1:")
for i, batch in enumerate(bgen1):
    print(f"Batch {i}: var1 shape = {batch.var1.shape}")


bgen1 has 4 batches
First batch from var1:
<xarray.Dataset> Size: 10kB
Dimensions:  (time: 5, sample: 150)
Coordinates:
  * sample   (sample) object 1kB MultiIndex
  * lat      (sample) int64 1kB 0 0 0 0 0 0 0 0 0 0 0 ... 9 9 9 9 9 9 9 9 9 9 9
  * lon      (sample) int64 1kB 0 1 2 3 4 5 6 7 8 9 ... 5 6 7 8 9 10 11 12 13 14
Dimensions without coordinates: time
Data variables:
    var1     (sample, time) float64 6kB 0.2816 0.607 0.9085 ... 0.815 0.3227

bgen2 has 3 batches
First batch from var2:
<xarray.Dataset> Size: 3kB
Dimensions:  (x: 10, y: 40)
Dimensions without coordinates: x, y
Data variables:
    var2     (x, y) float64 3kB 0.4145 0.2682 0.2274 ... 0.9241 0.4045 0.5468

bgen3 has 2 batches
First batch from var5:
<xarray.Dataset> Size: 448B
Dimensions:  (time: 7, level: 8)
Dimensions without coordinates: time, level
Data variables:
    var5     (time, level) float64 448B 0.6049 0.9134 0.6541 ... 0.5835 0.5764

bgen4 (with overlap) has 2 batches
First batch from var1 with overlap:

In [None]:
for i, batch in enumerate(tqdm(meta_train_dl.data)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.dtype}")
    if i == 5:
        break


In [None]:
import cProfile

code = '''
for i, batch in enumerate(meta_train_dl.data):
    # print(f"Meta Batch {i}:")
    # for key, tensor in batch.items():
    #     print(f"  {key}: shape {tensor.shape}")
    if i == 5:
        break
'''

out = cProfile.run(code,sort=True)


In [None]:
model_architecture = crested.tl.zoo.simple_convnet(
    seq_len=2114, num_classes=batch['y'].shape[1]
)


In [None]:
import keras
# Create your own configuration
# I recommend trying this for peak regression with a weighted cosine mse log loss function
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss = crested.tl.losses.CosineMSELogLoss(max_weight=100, multiplier=1)
loss = crested.tl.losses.PoissonLoss()

metrics = [
    keras.metrics.MeanAbsoluteError(),
    # keras.metrics.MeanSquaredError(),
    # keras.metrics.CosineSimilarity(axis=1),
    crested.tl.metrics.PearsonCorrelation(),
    # crested.tl.metrics.ConcordanceCorrelationCoefficient(),
    # crested.tl.metrics.PearsonCorrelationLog(),
    # crested.tl.metrics.ZeroPenaltyMetric(),
]

alternative_config = crested.tl.TaskConfig(optimizer, loss, metrics)
print(alternative_config)


In [None]:
# initialize some lazy model parameters *yawn*
model_architecture(batch)

In [None]:
trainer = crested.tl.Crested(
    data=meta_module,
    model=model_architecture,
    config=alternative_config,
    project_name="mouse_biccn",  # change to your liking
    run_name="basemodel",  # change to your liking
    logger=None,  # or None, 'dvc', 'tensorboard'
    seed=7,  # For reproducibility
)
# train the model
trainer.fit(
    epochs=60,
    learning_rate_reduce_patience=3,
    early_stopping_patience=6,
)
