In [7]:
import numpy as np
import icechunk
import zarr
import psutil
import os
from tqdm import tqdm

# === Parameters for demonstration ===
OBS = 10
VAR = int(2e4)      # 2,000,000 variables per observation
SEQ_LEN = 2114      # Sequence length per variable
CHUNK_SIZE = 512    # Chunk size for the 'var' dimension

# Create local zarr
storage = icechunk.local_filesystem_storage('/home/matthew.schmitz/Matthew/test.zarr')
repo = icechunk.Repository.create(storage)
session = repo.writable_session("main")
store = session.store

group = zarr.group(store=store, overwrite=True)
array = group.create(
    name='X',
    shape=(OBS, VAR, SEQ_LEN),
    chunks=(OBS, CHUNK_SIZE, SEQ_LEN),
    dtype='float32',
    fill_value=np.nan
)
array.attrs["_ARRAY_DIMENSIONS"] = ["obs", "var", "seq_bins"]

# Display the initial memory usage.
process = psutil.Process(os.getpid())
print("Initial memory usage (bytes):", process.memory_info().rss)

# === Loop to write random data into each observation ===
for i in tqdm(range(OBS), desc="Writing observations"):
    # Generate random data for a single observation.
    data = np.random.random((VAR, SEQ_LEN)).astype('float32')
    
    array[i, :, :] = data

    # Report memory usage after this observation.
    current_mem = process.memory_info().rss
    print(f"After writing observation {i+1}, memory usage: {current_mem} bytes")
    
# Commit the changes for this observation.
print(session.commit('dummy commit'))
print("Finished writing all observations.")
icechunk.print_debug_info()

Initial memory usage (bytes): 3963637760


Writing observations:  10%|█         | 1/10 [00:08<01:19,  8.85s/it]

After writing observation 1, memory usage: 4134625280 bytes


Writing observations:  20%|██        | 2/10 [00:30<02:10, 16.26s/it]

After writing observation 2, memory usage: 4152950784 bytes


Writing observations:  30%|███       | 3/10 [01:03<02:46, 23.77s/it]

After writing observation 3, memory usage: 4159426560 bytes


Writing observations:  40%|████      | 4/10 [01:57<03:34, 35.77s/it]

After writing observation 4, memory usage: 4211236864 bytes


Writing observations:  50%|█████     | 5/10 [02:37<03:06, 37.39s/it]

After writing observation 5, memory usage: 4157231104 bytes


Writing observations:  60%|██████    | 6/10 [03:31<02:52, 43.22s/it]

After writing observation 6, memory usage: 4331810816 bytes


Writing observations:  70%|███████   | 7/10 [04:41<02:35, 51.83s/it]

After writing observation 7, memory usage: 4463869952 bytes


Writing observations:  80%|████████  | 8/10 [06:15<02:10, 65.23s/it]

After writing observation 8, memory usage: 4745273344 bytes


Writing observations:  90%|█████████ | 9/10 [07:24<01:06, 66.28s/it]

After writing observation 9, memory usage: 4913094656 bytes


Writing observations: 100%|██████████| 10/10 [09:16<00:00, 55.61s/it]

After writing observation 10, memory usage: 5011529728 bytes
FBC3FSDPD1TV47Z7R8J0
Finished writing all observations.
platform:  Linux-5.14.0-427.42.1.el9_4.x86_64-x86_64-with-glibc2.34
python:  3.12.7
icechunk:  0.2.12
zarr:  3.0.0
numcodecs:  0.15.1





xarray:  2025.1.2


In [13]:
import numpy as np
import icechunk
import zarr
import psutil
import os
from tqdm import tqdm

# === Parameters for demonstration ===
OBS = 10
VAR = int(2e4)      # 20,000 variables per observation
SEQ_LEN = 2114      # Sequence length per variable
CHUNK_SIZE = 512    # Chunk size for the 'var' dimension

# === Create an Icechunk repository on the local filesystem ===
# Note: replace the path with a location you can write to.
storage = icechunk.local_filesystem_storage('/home/matthew.schmitz/Matthew/test.zarr')
repo = icechunk.Repository.create(storage)
session = repo.writable_session("main")
store = session.store

# Create the Zarr group and array on the Icechunk store.
group = zarr.group(store=store, overwrite=True)
array = group.create(
    name='X',
    shape=(OBS, VAR, SEQ_LEN),
    chunks=(OBS, CHUNK_SIZE, SEQ_LEN),
    dtype='float32',
    fill_value=np.nan
)
array.attrs["_ARRAY_DIMENSIONS"] = ["obs", "var", "seq_bins"]

# Display the initial memory usage.
process = psutil.Process(os.getpid())
print("Initial memory usage (bytes):", process.memory_info().rss)

# === Loop to write random data into each observation ===
for i in tqdm(range(OBS), desc="Writing observations"):
    # Generate random data for a single observation.
    data = np.random.random((VAR, SEQ_LEN)).astype('float32')
    
    # Write the random data to the corresponding slice in the Zarr array.
    array[i, :, :] = data
    
    # Commit the changes for this observation.
    commit_id = session.commit(f'dummy commit_{i}')
    print(f"Committed observation {i+1} with id: {commit_id}")
    
    # Re-open a new writable session for further writes.
    session = repo.writable_session("main")
    store = session.store
    # Re-open the updated Zarr array from the new store.
    group = zarr.open_group(store, mode='a')
    array = group['X']
    current_mem = process.memory_info().rss
    print(f"After writing observation {i+1}, memory usage: {current_mem} bytes")

final_commit_id = session.commit('final dummy commit')
print(f"Final commit id: {final_commit_id}")
print("Finished writing all observations.")

icechunk.print_debug_info()


Initial memory usage (bytes): 3762229248


Writing observations:  10%|█         | 1/10 [00:08<01:18,  8.75s/it]

Committed observation 1 with id: V4638RB2KFB5EJYSPQ0G
After writing observation 1, memory usage: 3934769152 bytes


Writing observations:  20%|██        | 2/10 [01:05<04:54, 36.86s/it]

Committed observation 2 with id: NNQ6WTAF3FAE46E1J2R0
After writing observation 2, memory usage: 3412848640 bytes


Writing observations:  30%|███       | 3/10 [01:29<03:37, 31.10s/it]

Committed observation 3 with id: M5ATGZ0G1J1Z9KKENR7G
After writing observation 3, memory usage: 3388350464 bytes


Writing observations:  40%|████      | 4/10 [02:14<03:39, 36.60s/it]

Committed observation 4 with id: N7NTR5FQPGXN7MMKQ1H0
After writing observation 4, memory usage: 3486969856 bytes


Writing observations:  50%|█████     | 5/10 [03:16<03:47, 45.60s/it]

Committed observation 5 with id: DGM4BD24MCPJ8J6JGT50
After writing observation 5, memory usage: 3025842176 bytes


Writing observations:  60%|██████    | 6/10 [04:18<03:25, 51.41s/it]

Committed observation 6 with id: 3DCAPHPQ4P222297PJ4G
After writing observation 6, memory usage: 3045105664 bytes


Writing observations:  70%|███████   | 7/10 [05:40<03:03, 61.30s/it]

Committed observation 7 with id: GKQY747MM33GE1Q61SGG
After writing observation 7, memory usage: 3195998208 bytes


Writing observations:  80%|████████  | 8/10 [06:48<02:06, 63.32s/it]

Committed observation 8 with id: DG9Y13M1GY6S9T4GS7C0
After writing observation 8, memory usage: 3324370944 bytes


Writing observations:  90%|█████████ | 9/10 [07:56<01:04, 64.87s/it]

Committed observation 9 with id: TM6CMCZQ9JYKE61ZCHQG
After writing observation 9, memory usage: 3345354752 bytes


Writing observations: 100%|██████████| 10/10 [09:11<00:00, 55.12s/it]

Committed observation 10 with id: WNZJ0SP4ZFAMDNGWJ4QG
After writing observation 10, memory usage: 3365064704 bytes





IcechunkError:   x session error: cannot commit, no changes made to the session
  | 
  | context:
  |    0: icechunk::session::commit
  |            with final dummy commit
  |              at icechunk/src/session.rs:804
  | 


In [9]:
import numpy as np
import zarr
import psutil
import os
from tqdm import tqdm

# === Parameters for demonstration ===
OBS = 10
VAR = int(2e4)      # 20,000 variables per observation (adjust as needed)
SEQ_LEN = 2114      # Sequence length per variable
CHUNK_SIZE = 512    # Chunk size along the 'var' dimension

# === Create a local Zarr DirectoryStore ===
store_dir = '/home/matthew.schmitz/Matthew/test.zarr'
print("Using Zarr store directory:", store_dir)
store = zarr.storage.LocalStore(store_dir)

# Create a Zarr group and array with the specified dimensions and chunking.
group = zarr.group(store=store, overwrite=True)
array = group.create(
    name='X',
    shape=(OBS, VAR, SEQ_LEN),
    chunks=(OBS, CHUNK_SIZE, SEQ_LEN),
    dtype='float32',
    fill_value=np.nan
)
# (Optional) Set dimension metadata for downstream tools.
array.attrs["_ARRAY_DIMENSIONS"] = ["obs", "var", "seq_bins"]

# Display the initial memory usage.
process = psutil.Process(os.getpid())
print("Initial memory usage (bytes):", process.memory_info().rss)

# === Loop to write random data into each observation ===
for i in tqdm(range(OBS), desc="Writing observations"):
    # Generate random data for the observation.
    data = np.random.random((VAR, SEQ_LEN)).astype('float32')
    
    # Write the random data to the corresponding slice of the Zarr array.
    array[i, :, :] = data

    # Report memory usage after this observation.
    current_mem = process.memory_info().rss
    print(f"After writing observation {i+1}, memory usage: {current_mem} bytes")

print("Finished writing all observations.")


Using Zarr store directory: /home/matthew.schmitz/Matthew/test.zarr
Initial memory usage (bytes): 3242545152


Writing observations:  10%|█         | 1/10 [00:02<00:18,  2.03s/it]

After writing observation 1, memory usage: 3413397504 bytes


Writing observations:  20%|██        | 2/10 [00:05<00:21,  2.70s/it]

After writing observation 2, memory usage: 3420995584 bytes


Writing observations:  30%|███       | 3/10 [00:08<00:21,  3.13s/it]

After writing observation 3, memory usage: 3430469632 bytes


Writing observations:  40%|████      | 4/10 [00:12<00:21,  3.53s/it]

After writing observation 4, memory usage: 3447918592 bytes


Writing observations:  50%|█████     | 5/10 [00:18<00:20,  4.09s/it]

After writing observation 5, memory usage: 3281653760 bytes


Writing observations:  60%|██████    | 6/10 [00:23<00:18,  4.71s/it]

After writing observation 6, memory usage: 3469582336 bytes


Writing observations:  70%|███████   | 7/10 [00:30<00:15,  5.29s/it]

After writing observation 7, memory usage: 3719208960 bytes


Writing observations:  80%|████████  | 8/10 [00:37<00:11,  5.91s/it]

After writing observation 8, memory usage: 3823661056 bytes


Writing observations:  90%|█████████ | 9/10 [00:46<00:06,  6.73s/it]

After writing observation 9, memory usage: 3718164480 bytes


Writing observations: 100%|██████████| 10/10 [00:54<00:00,  5.45s/it]

After writing observation 10, memory usage: 3718926336 bytes
Finished writing all observations.





In [None]:
dfsfjldj

In [1]:
import os
import tempfile
from pathlib import Path
import pandas as pd
import numpy as np
import pyBigWig
import copy
import xarray as xr
import tqdm


# Create temporary directories for synthetic data.
temp_dir = tempfile.TemporaryDirectory()
base_dir = Path(temp_dir.name)
beds_dir = base_dir / "beds"
bigwigs_dir = base_dir / "bigwigs"
beds_dir.mkdir(exist_ok=True)
bigwigs_dir.mkdir(exist_ok=True)

# Create a chromsizes file.
chromsizes_file = base_dir / "chrom.sizes"
with open(chromsizes_file, "w") as f:
    f.write("chr1\t1000\n")

# Create two BED files (simulate two different classes).
bed_data_A = pd.DataFrame({
    0: ["chr1", "chr1"],
    1: [100, 300],
    2: [200, 400]
})
bed_data_B = pd.DataFrame({
    0: ["chr1", "chr1"],
    1: [150, 350],
    2: [250, 450]
})
bed_file_A = beds_dir / "ClassA.bed"
bed_file_B = beds_dir / "ClassB.bed"
bed_data_A.to_csv(bed_file_A, sep="\t", header=False, index=False)
bed_data_B.to_csv(bed_file_B, sep="\t", header=False, index=False)

# Create a consensus BED file.
consensus = pd.DataFrame({
    0: ["chr1", "chr1", "chr1"],
    1: [100, 300, 350],
    2: [200, 400, 450]
})
consensus_file = base_dir / "consensus.bed"
consensus.to_csv(consensus_file, sep="\t", header=False, index=False)

# Create two bigWig files.
bigwig_file1 = bigwigs_dir / "test.bw"
bw1 = pyBigWig.open(str(bigwig_file1), "w")
bw1.addHeader([("chr1", 1000)])
bw1.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[5.0])
bw1.close()

bigwig_file2 = bigwigs_dir / "test2.bw"
bw2 = pyBigWig.open(str(bigwig_file2), "w")
bw2.addHeader([("chr1", 1000)])
bw2.addEntries(chroms=["chr1"], starts=[0], ends=[1000], values=[4.0])
bw2.close()

# Set extraction parameters.
target_region_width = 100
backed_path = base_dir / "chrom_data.zarr"
print(backed_path)
# Create the CrAnData object from bigWig files and consensus regions.
adata = import_bigwigs(
    bigwigs_folder=str(bigwigs_dir),
    regions_file=str(consensus_file),
    backed_path=str(backed_path),
    target_region_width=target_region_width,
    chromsizes_file=str(chromsizes_file),
)

crandata.train_val_test_split(adata,strategy='chr_auto')

# Create a dummy FASTA file for a genome.
fasta_file = base_dir / "chr1.fa"
with open(fasta_file, "w") as f:
    f.write(">chr1\n")
    f.write("A" * 1000 + "\n")

# Create a Genome object.
from crandata._genome import Genome
dummy_genome = Genome(str(fasta_file), chrom_sizes=str(chromsizes_file))

# Add sequences to the CrAnData using the provided seq_io utility.
# Here we use the consensus regions as our ranges.
consensus.columns = ['chrom', 'start', 'end']
adata = add_genome_sequences_to_crandata(adata, consensus, dummy_genome)

# Write the CrAnData object to disk and then reload it to ensure sequences are out-of-memory.
adata.to_icechunk(mode='a')
adata_loaded = CrAnData.open_zarr(str(backed_path))
print("Loaded CrAnData:")
print(adata_loaded)

# Create two copies to simulate two datasets (e.g. two species), and add a "split" column in var metadata.
adata1 = copy.deepcopy(adata_loaded)
adata2 = copy.deepcopy(adata_loaded)
adata1["var-_-split"] = xr.DataArray(np.full(adata1.sizes["var"], "train"), dims=["var"])
adata2["var-_-split"] = xr.DataArray(np.full(adata2.sizes["var"], "train"), dims=["var"])

# Create a DNATransform instance.
transform = DNATransform(out_len=80, random_rc=True, max_shift=5)

# Instantiate the MetaCrAnDataModule with the two datasets.
# Note: The batch_size is now 3, matching the number of consensus regions (var dimension).
meta_module = MetaCrAnDataModule(
    adatas=[adata1, adata2],
    batch_size=[2,2],        # adjust batch size to not exceed var length (3)
    load_keys={'sequences':'sequences','X':'X'},
    shuffle=True,
    dnatransform=transform,
    epoch_size=10
)

meta_module.setup('train')

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader
print("\nIterating over a couple of training batches from MetaCrAnDataModule:")
for i, batch in enumerate(tqdm.tqdm(meta_train_dl)):
    print(batch)
    print(f"\nMeta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i >= 1:
        break

print("\nTemporary directory contents:")
print(os.listdir(base_dir))
temp_dir.cleanup()


  cls = super().__new__(mcls, name, bases, namespace, **kwargs)


/scratch/fast/145989/tmpbnw_6wvo/chrom_data.zarr


100%|██████████| 2/2 [00:00<00:00, 6021.97it/s]
[32m2025-04-08 20:45:55.040[0m | [1mINFO    [0m | [36mcrandata.chrom_io[0m:[36mimport_bigwigs[0m:[36m326[0m - [1mExtracting values from 2 bigWig files...[0m


Initial memory usage (bytes): 617652224


  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)


After 0 files, memory usage (bytes): 618373120


  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
100%|██████████| 2/2 [00:00<00:00, 19.24it/s]
  adata['X'] = adata['X'].chunk({'obs':adata.dims['obs'],'var':chunk_size,'seq_bins':adata.dims['seq_bins']}) #enforce the same as before
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)


After 0 files (post-write), memory usage (bytes): 619814912
After 1 files, memory usage (bytes): 619814912
After 1 files (post-write), memory usage (bytes): 619814912


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
100%|██████████| 3/3 [00:00<00:00, 1800.39it/s]
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)


Loaded CrAnData:
CrAnData object
Array names: ['X', 'obs-_-index', 'obs-_-file_path', 'var-_-end', 'var-_-index', 'var-_-start', 'var-_-split', 'sequences', 'var-_-chrom', 'var-_-chunk_index']
Coordinates: ['obs', 'seq_bins', 'var']

CrAnData object
Array names: ['X', 'obs-_-index', 'obs-_-file_path', 'var-_-end', 'var-_-index', 'var-_-start', 'var-_-split', 'sequences', 'var-_-chrom', 'var-_-chunk_index']
Coordinates: ['obs', 'seq_bins', 'var']
 ['X', '_HANDLED_TYPES', '__abs__', '__abstractmethods__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__class_getitem__', '__contains__', '__copy__', '__dask_graph__', '__dask_keys__', '__dask_layers__', '__dask_optimize__', '__dask_postcompute__', '__dask_postpersist__', '__dask_scheduler__', '__dask_tokenize__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__floordiv__', '__format__', '__ge__', '

1it [00:00, 30.44it/s]

{'sequences': array([[[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        ...,
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]],

       [[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        ...,
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]],

       [[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        ...,
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]],

       [[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        ...,
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0]]], dtype=uint8), 'X': array([[[6.91939507e-310, 6.91939507e-310, 4.65960369e-310,
         4.65960369e-310, 6.91918633e-310, 6.91918542e-310,
         6.91918673e-310, 4.65960369e-310, 6.91918642e-310,
         6.91937747e-310, 6.91918642e-310, 0.00000000e+000,
         6.91918750e-310, 6.91937747e-310, 6.91918750e-310,
         6.91918599e-310, 6.91937747e-310, 6.91918599e-310,
         6.91918599




In [2]:
sdfs

NameError: name 'sdfs' is not defined

In [None]:
# Should the fill in _extract_values_from_bigwig actually be 0? Can we filter var where all is 0/nan without loading everything into memory?

In [3]:
import crandata
import xarray as xr
import pandas as pd
import numpy as np
import os
import crested
from tqdm import tqdm

In [4]:
genomes = {}
beds = {}
chromsizes_files = {}
bed_files = {}
species = ['human','macaque','mouse']
species_codes = {'human':0,'macaque':1,'mouse':2}

MAX_SHIFT = 5
WINDOW_SIZE = 2114
WINDOW_SIZE = WINDOW_SIZE #+ 2*MAX_SHIFT
OFFSET = WINDOW_SIZE // 2  # e.g., 50% overlap
N_THRESHOLD = 0.3
n_bins = WINDOW_SIZE//50


In [5]:
for s in species:
    genome_path = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/Team/Matthew/genome/onehots/'+s
    fasta_file = os.path.join(genome_path,s+'.fa')
    chrom_sizes = os.path.join(genome_path,s+'.fa.sizes')
    annotation_gtf_file = os.path.join(genome_path,s+'.annotation.gtf')
    chromsizes_files[s] = chrom_sizes
    genome = crandata.Genome(fasta_file, chrom_sizes, annotation_gtf_file)
    genome.to_memory()
    genomes[s] = genome
    OUTPUT_BED = os.path.join(genome_path, "binned_genome.bed")
    bed_files[s] = OUTPUT_BED
    # Generate bins and optionally write to disk.
    binned_df = crandata.bin_genome(genome, WINDOW_SIZE, OFFSET, n_threshold=N_THRESHOLD, output_path=OUTPUT_BED).reset_index(drop=True)
    print("Filtered bins:")
    print(binned_df)


2025-04-08T20:46:52.865713-0700 INFO Genome sequences loaded into memory.


Calculating N content: 100%|██████████| 2932321/2932321 [02:20<00:00, 20883.45it/s]


Filtered bins:
              chrom  start    end   prop_n
0              chr1   9514  11628  0.23026
1              chr1  10571  12685  0.00000
2              chr1  11628  13742  0.00000
3              chr1  12685  14799  0.00000
4              chr1  13742  15856  0.00000
...             ...    ...    ...      ...
2786513  KI270518.1      1   2115  0.00000
2786514  KI270530.1      1   2115  0.00000
2786515  KI270304.1      1   2115  0.00000
2786516  KI270418.1      1   2115  0.00000
2786517  KI270424.1      1   2115  0.00000

[2786518 rows x 4 columns]
2025-04-08T20:49:44.314540-0700 INFO Genome sequences loaded into memory.


Calculating N content: 100%|██████████| 2806701/2806701 [02:14<00:00, 20889.47it/s]


Filtered bins:
               chrom  start    end  prop_n
0        NC_041754.1      1   2115     0.0
1        NC_041754.1   1058   3172     0.0
2        NC_041754.1   2115   4229     0.0
3        NC_041754.1   3172   5286     0.0
4        NC_041754.1   4229   6343     0.0
...              ...    ...    ...     ...
2773980  NC_005943.1   9514  11628     0.0
2773981  NC_005943.1  10571  12685     0.0
2773982  NC_005943.1  11628  13742     0.0
2773983  NC_005943.1  12685  14799     0.0
2773984  NC_005943.1  13742  15856     0.0

[2773985 rows x 4 columns]
2025-04-08T20:52:27.456905-0700 INFO Genome sequences loaded into memory.


Calculating N content: 100%|██████████| 2583507/2583507 [02:03<00:00, 20999.46it/s]


Filtered bins:
              chrom    start      end    prop_n
0              chr1  2999767  3001881  0.110638
1              chr1  3002938  3005052  0.085579
2              chr1  3003995  3006109  0.000000
3              chr1  3005052  3007166  0.000000
4              chr1  3006109  3008223  0.000000
...             ...      ...      ...       ...
2509462  JH584292.1     8457    10571  0.000000
2509463  JH584292.1     9514    11628  0.000000
2509464  JH584292.1    10571    12685  0.000000
2509465  JH584292.1    11628    13742  0.000000
2509466  JH584292.1    12685    14799  0.000000

[2509467 rows x 4 columns]


In [None]:
adatas = {}

for s in species:
    print(s)
    bigwigs_dir = os.path.join('/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/SpinalCord/manuscript/ATAC',s,'Group_bigwig')
    adatas[s] = crandata.chrom_io.import_bigwigs(
        bigwigs_folder=bigwigs_dir,
        regions_file=bed_files[s],
        backed_path='/home/matthew.schmitz/Matthew/'+s+'_spc_test.zarr',
        target_region_width=WINDOW_SIZE,
        chromsizes_file=chromsizes_files[s],
        target = 'raw',
        max_stochastic_shift=5,
        chunk_size=512,
        n_bins=n_bins
    )
    bed = adatas[s].get_dataframe('var').loc[:,['chrom','start','end']]
    adatas[s] = crandata.seq_io.add_genome_sequences_to_crandata(adatas[s], bed, genomes[s])
    print(adatas[s]['sequences'])
    adatas[s]['var-_-species'] = xr.DataArray(np.repeat(species_codes[s],adatas[s].sizes['var']),dims='var').chunk({'var':adatas[s].attrs['chunk_size']})
    adatas[s].to_icehunk(mode='a',commit_name='add_genome_seqs')
    adatas[s] = crandata.crandata.CrAnData.open_zarr('/home/matthew.schmitz/Matthew/'+s+'_spc_test.zarr')
    

human


100%|██████████| 49/49 [00:00<00:00, 239.24it/s]


2025-04-08T20:55:06.169372-0700 INFO Extracting values from 49 bigWig files...


  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(


Initial memory usage (bytes): 10448941056


  0%|          | 0/49 [00:00<?, ?it/s]

After 0 files, memory usage (bytes): 58556633088


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  result = await AsyncArray._create_v3(
  return cls(**configuration_parsed)
  2%|▏         | 1/49 [02:39<2:07:25, 159.28s/it]

After 0 files (post-write), memory usage (bytes): 58628796416
After 1 files, memory usage (bytes): 58842828800


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  4%|▍         | 2/49 [05:37<2:13:18, 170.18s/it]

After 1 files (post-write), memory usage (bytes): 58858549248
After 2 files, memory usage (bytes): 58966355968


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  6%|▌         | 3/49 [08:22<2:08:43, 167.90s/it]

After 2 files (post-write), memory usage (bytes): 58974941184
After 3 files, memory usage (bytes): 59028606976


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  8%|▊         | 4/49 [11:05<2:04:31, 166.03s/it]

After 3 files (post-write), memory usage (bytes): 59042476032
After 4 files, memory usage (bytes): 59017474048


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
 10%|█         | 5/49 [19:32<3:31:59, 289.07s/it]

After 4 files (post-write), memory usage (bytes): 59026755584
After 5 files, memory usage (bytes): 59144916992


  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
  return cls(**configuration_parsed)
 12%|█▏        | 6/49 [51:15<10:00:24, 837.77s/it]

After 5 files (post-write), memory usage (bytes): 59147771904


In [None]:
for s in adatas.keys():
    crandata.train_val_test_split(
        adatas[s], strategy="region", val_size=0.1, test_size=0.1, random_state=42
    )
    adatas[s].to_icechunk(mode='a',commit_name='train_val_test_split')
    


In [None]:
adatas[s]['X']

In [None]:
adatas[s]['sequences']

In [None]:
transform = crandata.seq_io.DNATransform(out_len=WINDOW_SIZE, random_rc=True, max_shift=MAX_SHIFT)

meta_module = crandata.MetaCrAnDataModule(
    adatas=list(adatas.values()),
    batch_size=[16,16,16],
    load_keys={'X': 'y','sequences':'sequence','var-_-species':'species'},
    dnatransform=transform,
    join='inner',
    num_workers=0,
    epoch_size=1000000    # small epoch size for quick testing
)

# Setup the meta module for the "fit" stage (train/val)
meta_module.setup("train")

# Retrieve the training dataloader from the meta module and iterate over a couple of batches.
meta_train_dl = meta_module.train_dataloader

print("\nIterating over a couple of training batches from MetaAnnDataModule:")
for i, batch in enumerate(tqdm(meta_train_dl)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i == 5:
        break


In [None]:
import cProfile

code = '''
for i, batch in enumerate(tqdm(meta_train_dl)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i == 5:
        break
'''

out = cProfile.run(code,sort=True)


In [None]:
meta_module.load()
meta_train_dl = meta_module.train_dataloader


In [None]:
code = '''
for i, batch in enumerate(tqdm(meta_train_dl)):
    print(f"Meta Batch {i}:")
    for key, tensor in batch.items():
        print(f"  {key}: shape {tensor.shape}")
    if i == 50:
        break
'''

out = cProfile.run(code,sort=True)


In [None]:
model_architecture = crested.tl.zoo.simple_convnet(
    seq_len=2114, num_classes=batch['y'].shape[1]
)


In [None]:
import keras
# Create your own configuration
# I recommend trying this for peak regression with a weighted cosine mse log loss function
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss = crested.tl.losses.CosineMSELogLoss(max_weight=100, multiplier=1)
loss = crested.tl.losses.PoissonLoss()

metrics = [
    keras.metrics.MeanAbsoluteError(),
    # keras.metrics.MeanSquaredError(),
    # keras.metrics.CosineSimilarity(axis=1),
    crested.tl.metrics.PearsonCorrelation(),
    # crested.tl.metrics.ConcordanceCorrelationCoefficient(),
    # crested.tl.metrics.PearsonCorrelationLog(),
    # crested.tl.metrics.ZeroPenaltyMetric(),
]

alternative_config = crested.tl.TaskConfig(optimizer, loss, metrics)
print(alternative_config)


In [None]:
batch['sequence'].shape

In [None]:
# initialize some lazy model parameters *yawn*
model_architecture(batch['sequence'].float().mean(0).unsqueeze(0))

In [None]:
trainer = crested.tl.Crested(
    data=meta_module,
    model=model_architecture,
    config=alternative_config,
    project_name="mouse_biccn",  # change to your liking
    run_name="basemodel",  # change to your liking
    logger=None,  # or None, 'dvc', 'tensorboard'
    seed=7,  # For reproducibility
)
# train the model
trainer.fit(
    epochs=60,
    learning_rate_reduce_patience=3,
    early_stopping_patience=6,
)
