In [1]:
# Core imports
import os
import numpy as np
import h5py
import pickle
import random

# TensorFlow/Keras imports for model loading
import tensorflow as tf
from keras.models import model_from_json

# SQUID imports for mutagenesis
import squid

2026-01-23 10:47:53.767455: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-23 10:47:53.891948: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-23 10:47:53.895798: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/lo

In [2]:
# Load Dev_20 library
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only run this cell against a pickle produced by a trusted pipeline.
# The path is relative to the notebook's current working directory.
with open('Dev_20_library/Dev_20/dev_20_library.pkl', 'rb') as f:
    libraries = pickle.load(f)
    # Only the 'dev' entry of the pickled mapping is used in this notebook.
    dev_loci = libraries['dev']

# Sanity check: report the row count and the available columns.
# (dev_loci exposes `.columns`, so it is presumably a pandas DataFrame - confirm.)
print(f"Loaded {len(dev_loci)} Dev_20 sequences")
print(f"Columns: {dev_loci.columns.tolist()}")

# Load DeepSTARR model
# Architecture (JSON) and weights (HDF5) are stored as two separate files in model_dir.
model_dir = '../data_and_models/models'

model_json_file = os.path.join(model_dir, 'deepstarr.model.json')
model_weights_file = os.path.join(model_dir, 'deepstarr.model.h5')

with open(model_json_file, 'r') as f:
    model_json = f.read()

# Rebuild the architecture from JSON, then load the trained weights into it.
# 'Functional' is mapped to tf.keras.Model via custom_objects - presumably so the
# class name recorded in the JSON resolves under this Keras version (TODO confirm).
model = model_from_json(model_json, custom_objects={'Functional': tf.keras.Model})
model.load_weights(model_weights_file)

print("Model loaded successfully!")

Loaded 23 Dev_20 sequences
Columns: ['test_idx', 'sequence', 'activity', 'ohe_seq']
Model loaded successfully!


2026-01-23 10:47:57.131845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/local/apps/gcc/9.2.0/lib64:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64:/cm/local/apps/python37/lib
2026-01-23 10:47:57.131957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/local/apps/gcc/9.2.0/lib64:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64:/cm/local/apps/python37/lib
2026-01-23 10:47:57.132207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.s

In [3]:
# Helper function to save library to HDF5
def save_library(filepath, sequences, predictions, original_idx):
    """Save a mutagenesis library to an HDF5 file, atomically.

    The file is first written to ``<filepath>.tmp`` and only renamed onto
    ``filepath`` once every dataset and attribute has been written and the
    file has been closed. Callers that treat "``filepath`` exists" as
    "library is complete" therefore never see a truncated file left behind
    by an interrupted or failed write.

    Args:
        filepath: Destination path of the HDF5 file (str or os.PathLike).
        sequences: Mutant sequences; stored as dataset ``'sequences'``.
            Must support ``len()``, which defines ``n_samples``.
        predictions: Model predictions; stored as dataset ``'predictions'``.
        original_idx: Identifier of the source sequence; stored as the file
            attribute ``'original_idx'``.

    Raises:
        Any exception raised while writing is re-raised after the temporary
        file has been removed; ``filepath`` is left untouched in that case.
    """
    n_samples = len(sequences)
    # Sibling temp file: same directory as the target, so os.replace is a
    # same-filesystem rename rather than a copy.
    tmp_path = f"{filepath}.tmp"
    try:
        with h5py.File(tmp_path, 'w') as f:
            f.create_dataset('sequences', data=sequences, compression='gzip', compression_opts=4)
            f.create_dataset('predictions', data=predictions, compression='gzip', compression_opts=4)
            # Add library_index for consistent subsetting (0 to n_samples-1)
            f.create_dataset('library_index', data=np.arange(n_samples), compression='gzip', compression_opts=4)
            f.attrs['original_idx'] = original_idx
            f.attrs['n_samples'] = n_samples
        # Publish only after the HDF5 file is fully written and closed.
        os.replace(tmp_path, filepath)
    except BaseException:
        # BaseException on purpose: a kernel interrupt (KeyboardInterrupt)
        # must not leave a partial file behind either.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        raise

In [None]:
# Build one 100K-variant in silico mutagenesis library per Dev_20 sequence.
task_index = 0  # 0 for Dev
x_seqs = dev_loci["ohe_seq"]
seq_indices = dev_loci["test_idx"]

n_total = len(x_seqs)
print(f"Processing {n_total} sequences")

for seq_number, (x_seq, idx) in enumerate(zip(x_seqs, seq_indices), start=1):
    output_dir = f'Dev_20_mutagenisis_library/Dev/seq_{idx}'
    output_file = f'{output_dir}/100K.h5'

    # Resume support: a library file already on disk is never regenerated.
    if os.path.exists(output_file):
        print(f"Skipping seq_{idx} - already exists")
        continue

    os.makedirs(output_dir, exist_ok=True)
    x_seq = np.array(x_seq)

    # Predictor, mutagenizer and MAVE are rebuilt for every sequence, in this
    # order, so each library starts from a newly constructed seed=42 mutagenizer.
    pred_generator = squid.predictor.ScalarPredictor(
        pred_fun=model.predict_on_batch, task_idx=task_index, batch_size=512
    )
    mut_generator = squid.mutagenizer.RandomMutagenesis(mut_rate=0.10, seed=42)
    mave = squid.mave.InSilicoMAVE(mut_generator, pred_generator, 249, mut_window=[0, 249])

    # Draw 100K mutants of this sequence together with their model predictions.
    x_mut, y_mut = mave.generate(x_seq, num_sim=100000)

    # Persist the library, tagged with the sequence's test-set index.
    save_library(output_file, x_mut, y_mut, idx)
    print(f"[{seq_number}/{n_total}] Created seq_{idx}/100K.h5")

print("\nDone!")

Processing 23 sequences

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:04<00:00, 24959.74it/s]
Inference: 100%|██████████| 195/195 [00:06<00:00, 30.12it/s]


[1/23] Created seq_22612/100K.h5

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27890.59it/s]
Inference:  47%|████▋     | 91/195 [00:02<00:03, 31.25it/s]