In [1]:
# Core imports
import os
import subprocess
import numpy as np
import pandas as pd
import h5py
import random
from urllib.request import urlretrieve
import matplotlib.pyplot as plt


# TensorFlow/Keras imports for model loading
import tensorflow as tf
from keras.models import model_from_json

# SEAM imports
import seam
from seam import Compiler, Attributer, Clusterer, MetaExplainer, Identifier
from seam.logomaker_batch.batch_logo import BatchLogo


2026-01-21 14:53:37.548032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-21 14:53:37.654802: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-21 14:53:37.658157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/lo

In [2]:
# Imports needed by this cell (grouped together at the top of the cell)
import os
import pickle
from urllib.request import urlretrieve
from keras.models import model_from_json

# --- Locus libraries written by library_selection.ipynb ---------------------
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load pickles that were produced by this project.
with open('../libraries/hyperparam_libraries.pkl', 'rb') as f:
    libraries = pickle.load(f)

# The pickle holds one entry per task: developmental ('dev') and housekeeping ('hk')
dev_loci, hk_loci = libraries['dev'], libraries['hk']

print(len(dev_loci), len(hk_loci))

# --- DeepSTARR model ---------------------------------------------------------
# Architecture (JSON) and weights (HDF5) are both read from the same directory.
model_dir = '../models/deepstarr'
os.makedirs(model_dir, exist_ok=True)

model_json_file, model_weights_file = (
    os.path.join(model_dir, file_name)
    for file_name in ('deepstarr.model.json', 'deepstarr.model.h5')
)

with open(model_json_file, 'r') as f:
    model_json = f.read()

# Rebuild the architecture first, then attach the trained weights to it
model = model_from_json(
    model_json,
    custom_objects={'Functional': tf.keras.Model},
)
model.load_weights(model_weights_file)

print("Model loaded successfully!")

## load deepSHAP
def seam_deepshap(x_mut, task_index, attribution_method='deepshap', gpu=0):
    """Compute attribution maps for a set of sequences with SEAM's ``Attributer``.

    For ``attribution_method='deepshap'`` (the default) TensorFlow eager
    execution is disabled and a fresh copy of the DeepSTARR Keras model is
    loaded from disk before the attributer is built. For any other method the
    notebook-level ``model`` is used as-is.

    Parameters
    ----------
    x_mut
        Sequences to attribute. Passed to ``Attributer.compute`` as ``x`` and
        also as ``x_ref``.
    task_index
        Index of the model output to attribute (0 for Dev, 1 for Hk).
    attribution_method
        Method name forwarded to ``Attributer`` (e.g. 'deepshap',
        'gradientshap', 'integratedgradients').
    gpu
        GPU device number forwarded to ``Attributer.compute``.

    Returns
    -------
    Whatever ``Attributer.compute`` returns for the given sequences.

    Raises
    ------
    ImportError
        If the ``shap`` package is not installed (DeepSHAP only).
    RuntimeError
        If the DeepSTARR model could not be loaded for DeepSHAP.

    Notes
    -----
    * Creates the ``./attributions`` directory; nothing is written into it here.
    * The DeepSHAP path calls ``tf.compat.v1.disable_eager_execution()``.
      NOTE(review): this presumably affects all later TensorFlow use in the
      same kernel -- confirm before mixing with eager-mode code.
    """
    import time
    import tensorflow as tf
    from keras.models import model_from_json
    import numpy as np
    import random

    # The library itself doubles as the reference input for the attributer
    x_ref = x_mut

    save_path = './attributions'  # Where to save results
    os.makedirs(save_path, exist_ok=True)

    if attribution_method == 'deepshap':
        # Model files: keep the original relative location when it exists,
        # otherwise fall back to the directory the notebook loads its model from.
        model_file_dir = next(
            (
                candidate
                for candidate in ('models/deepstarr', '../models/deepstarr')
                if os.path.isfile(os.path.join(candidate, 'deepstarr.model.json'))
            ),
            'models/deepstarr',
        )
        keras_model_json = os.path.join(model_file_dir, 'deepstarr.model.json')
        keras_model_weights = os.path.join(model_file_dir, 'deepstarr.model.h5')

        # Tracks whether a usable model exists, so that a late (non-fatal)
        # setup problem can be told apart from a failure to load the model.
        model_ready = False
        try:
            # Disable eager execution first
            tf.compat.v1.disable_eager_execution()
            tf.compat.v1.disable_v2_behavior()
            print("TensorFlow eager execution disabled for DeepSHAP compatibility")

            # Import SHAP to configure handlers
            try:
                import shap
            except ImportError:
                print("ERROR: SHAP package is not installed.")
                print("To install SHAP for DeepSHAP attribution, run:")
                print("pip install kundajelab-shap==1")
                raise ImportError("SHAP package required for DeepSHAP attribution")

            # Handle AddV2 operation (element-wise addition) as a linear operation
            shap.explainers.deep.deep_tf.op_handlers["AddV2"] = shap.explainers.deep.deep_tf.passthrough

            # Load the model after eager execution is disabled
            with open(keras_model_json) as json_file:
                keras_model = model_from_json(
                    json_file.read(),
                    custom_objects={'Functional': tf.keras.Model},
                )
            np.random.seed(113)
            random.seed(0)
            keras_model.load_weights(keras_model_weights)
            model_ready = True

            # Rebuild model to ensure proper graph construction
            _ = keras_model(tf.keras.Input(shape=keras_model.input_shape[1:]))

        except ImportError:
            raise
        except Exception as e:
            if not model_ready:
                # Without a loaded model there is nothing to attribute; fail
                # here with the real cause instead of a later NameError.
                raise RuntimeError(
                    f"Could not load the DeepSTARR model for DeepSHAP: {e}"
                ) from e
            print(f"Warning: Could not setup TensorFlow for DeepSHAP. Error: {e}")
            print("DeepSHAP may not work properly.")

        def deepstarr_compress(x):
            """DeepSTARR compression function for DeepSHAP."""
            if hasattr(x, 'outputs'):
                return tf.reduce_sum(x.outputs[task_index], axis=-1)
            else:
                return x

        # Create attributer for DeepSHAP around the freshly loaded model
        attributer = Attributer(
            keras_model,
            method=attribution_method,
            task_index=task_index,
            compress_fun=deepstarr_compress
        )
        batch_size = 16
    else:
        # Use unified Attributer for other methods. `model` here is the
        # notebook-level model; it is deliberately not shadowed by a local.
        attributer = Attributer(
            model,
            method=attribution_method,
            task_index=task_index,
            compress_fun=lambda x: x,
            pred_fun=model.predict_on_batch,
        )
        batch_size = 256

    attributer.show_params(attribution_method)

    start_time = time.time()
    attributions = attributer.compute(
        x_ref=x_ref,
        x=x_mut,
        save_window=None,
        batch_size=batch_size,
        gpu=gpu,
    )
    elapsed = time.time() - start_time
    print(f'Attribution time: {elapsed/60:.2f} minutes')

    return attributions

8 8
Model loaded successfully!


2026-01-21 14:53:40.473280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/local/apps/gcc/9.2.0/lib64:/grid/hpc/software/code-server/4.103.2-1/lib:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64:/cm/local/apps/python37/lib
2026-01-21 14:53:40.473401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/local/apps/gcc/9.2.0/lib:/cm/local/apps/gcc/9.2.0/lib64:/grid/hpc/software/code-server/4.103.2-1/lib:/cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64:/cm/local/apps/python37/lib
2026-01-21 14:53:40.473502: W tensorflow/compiler/xla/stream

In [3]:
import h5py
import numpy as np

# Save each sequence's library to a single HDF5 file
def save_library(filepath, sequences, predictions, original_idx):
    """Write one sequence's mutagenesis library to a gzip-compressed HDF5 file.

    Parameters
    ----------
    filepath
        Destination ``.h5`` path. An existing file is overwritten; missing
        parent directories are created.
    sequences
        Data stored in the ``sequences`` dataset.
    predictions
        Data stored in the ``predictions`` dataset.
    original_idx
        Identifier of the source sequence, stored as the ``original_idx``
        file attribute.

    The number of entries (``len(sequences)``) is stored as the ``n_samples``
    file attribute.
    """
    import os  # local import: this cell itself only imports h5py and numpy

    # Creating the file fails if its directory is missing, so make sure it exists.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with h5py.File(filepath, 'w') as f:
        f.create_dataset('sequences', data=sequences, compression='gzip', compression_opts=4)
        f.create_dataset('predictions', data=predictions, compression='gzip', compression_opts=4)
        f.attrs['original_idx'] = original_idx
        f.attrs['n_samples'] = len(sequences)

In [4]:
## Create the 100K in silico mutagenesis library for each sequence: DEV
## (the smaller subsets are derived from these files in a later cell)
import os
import squid

task_index = 0  # 0 for dev, 1 for hk
x_seqs = dev_loci["ohe_seq"]
seq_indices = dev_loci["test_idx"]  # identifiers used in the output file names
print(x_seqs.shape)

# Make sure the output directory exists so this cell also runs on a fresh checkout
os.makedirs('mutagenisis_library', exist_ok=True)

for x_seq, idx in zip(x_seqs, seq_indices):

    x_seq = np.array(x_seq)

    # Scores mutated sequences with the model, reading output `task_index`
    pred_generator = squid.predictor.ScalarPredictor(
            pred_fun=model.predict_on_batch,
            task_idx=task_index,
            batch_size=512
        )

    # Random mutagenesis at a 10% mutation rate; the same fixed seed is used
    # for every sequence
    mut_generator = squid.mutagenizer.RandomMutagenesis(
        mut_rate=0.10,
        seed=42
    )

    # 249 is passed as the sequence length and the mutation window spans
    # positions 0-249 -- NOTE(review): assumes every x_seq has length 249
    mave = squid.mave.InSilicoMAVE(
            mut_generator,
            pred_generator,
            249,
            mut_window=[0, 249]
        )

    x_mut, y_mut = mave.generate(x_seq, num_sim=100000)

    save_library(f'mutagenisis_library/dev_seq_{idx}_100K.h5', x_mut, y_mut, idx)

(8,)

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28908.43it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 34.60it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27571.74it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.52it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27043.06it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.52it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28020.46it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.58it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28209.15it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.07it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28046.35it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.41it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27989.67it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.67it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 25684.39it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.85it/s]


In [5]:
## Create the 100K in silico mutagenesis library for each sequence: Hk
## (the smaller subsets are derived from these files in a later cell)
import os
import squid

task_index = 1  # 0 for dev, 1 for hk
x_seqs = hk_loci["ohe_seq"]
seq_indices = hk_loci["test_idx"]  # identifiers used in the output file names
print(x_seqs.shape)

# Make sure the output directory exists so this cell also runs on a fresh checkout
os.makedirs('mutagenisis_library', exist_ok=True)

for x_seq, idx in zip(x_seqs, seq_indices):

    x_seq = np.array(x_seq)

    # Scores mutated sequences with the model, reading output `task_index`
    pred_generator = squid.predictor.ScalarPredictor(
            pred_fun=model.predict_on_batch,
            task_idx=task_index,
            batch_size=512
        )

    # Random mutagenesis at a 10% mutation rate; the same fixed seed is used
    # for every sequence
    mut_generator = squid.mutagenizer.RandomMutagenesis(
        mut_rate=0.10,
        seed=42
    )

    # 249 is passed as the sequence length and the mutation window spans
    # positions 0-249 -- NOTE(review): assumes every x_seq has length 249
    mave = squid.mave.InSilicoMAVE(
            mut_generator,
            pred_generator,
            249,
            mut_window=[0, 249]
        )

    x_mut, y_mut = mave.generate(x_seq, num_sim=100000)

    save_library(f'mutagenisis_library/hk_seq_{idx}_100K.h5', x_mut, y_mut, idx)

(8,)

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28784.71it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.17it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27126.95it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.21it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28478.17it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.78it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28891.91it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.16it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27550.91it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.29it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:04<00:00, 24758.58it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.22it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28989.52it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 37.26it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:04<00:00, 23459.90it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.14it/s]


In [6]:
import os
import h5py
import numpy as np

# Number of sequences in each subset drawn from a full library, largest first
subset_sizes = [75000, 50000, 25000, 10000, 5000, 1000]
# Matching file-name labels derived from the sizes: '75K', '50K', ..., '1K'
size_labels = [f'{n_seqs // 1000}K' for n_seqs in subset_sizes]

def load_full_library(filepath):
    """Read a saved library from an HDF5 file.

    Returns a tuple ``(sequences, predictions, original_idx)``: the full
    contents of the ``sequences`` and ``predictions`` datasets and the
    ``original_idx`` file attribute.
    """
    with h5py.File(filepath, 'r') as h5_file:
        # `[:]` copies each dataset into memory before the file is closed
        stored_sequences = h5_file['sequences'][:]
        stored_predictions = h5_file['predictions'][:]
        source_idx = h5_file.attrs['original_idx']
    return stored_sequences, stored_predictions, source_idx

def save_subset(filepath, sequences, predictions, original_idx, subset_idx=None):
    """Write a (sub)library to ``filepath`` as gzip-compressed HDF5.

    The file is opened in 'w' mode, so an existing file is overwritten. The
    datasets ``sequences`` and ``predictions`` are always written;
    ``subset_idx`` is written only when it is not None. ``original_idx`` and
    ``n_samples`` (``len(sequences)``) are stored as file attributes.
    """
    gzip_options = {'compression': 'gzip', 'compression_opts': 4}

    # Datasets to write, in order; the index dataset is optional
    datasets = [('sequences', sequences), ('predictions', predictions)]
    if subset_idx is not None:
        datasets.append(('subset_idx', subset_idx))

    with h5py.File(filepath, 'w') as h5_file:
        for dataset_name, values in datasets:
            h5_file.create_dataset(dataset_name, data=values, **gzip_options)
        h5_file.attrs['original_idx'] = original_idx
        h5_file.attrs['n_samples'] = len(sequences)

def _split_library_into_subsets(task_label, file_prefix, idx):
    """Copy one full library into its own directory and write nested random subsets.

    Reads ``mutagenisis_library/{file_prefix}_seq_{idx}_100K.h5`` and writes,
    under ``mutagenisis_library/{task_label}/seq_{idx}/``, a ``100K.h5`` copy
    plus one file per entry of ``subset_sizes`` / ``size_labels``. Every subset
    is a prefix of the same seeded permutation, so smaller subsets are
    contained in larger ones and the row indices are stored as ``subset_idx``.

    Raises ValueError if a requested subset is larger than the library.
    """
    seq_dir = f'mutagenisis_library/{task_label}/seq_{idx}'
    os.makedirs(seq_dir, exist_ok=True)

    src_path = f'mutagenisis_library/{file_prefix}_seq_{idx}_100K.h5'
    seqs, preds, orig_idx = load_full_library(src_path)

    # Size the permutation from the data instead of assuming 100000 rows, and
    # refuse to write a file whose label would overstate its contents.
    n_total = len(seqs)
    oversized = [size for size in subset_sizes if size > n_total]
    if oversized:
        raise ValueError(
            f"{src_path} holds {n_total} sequences; cannot draw subsets of size {oversized}"
        )

    # Save 100K (no subset_idx needed)
    save_subset(f'{seq_dir}/100K.h5', seqs, preds, orig_idx, subset_idx=None)

    # Same seed for every library, so all libraries share the same row selection
    shuffled_rows = np.arange(n_total)
    np.random.seed(42)
    np.random.shuffle(shuffled_rows)

    for size, label in zip(subset_sizes, size_labels):
        subset_idx = shuffled_rows[:size]
        save_subset(f'{seq_dir}/{label}.h5', seqs[subset_idx], preds[subset_idx], orig_idx, subset_idx=subset_idx)

    print(f"{task_label} seq_{idx}: all subsets created with subset_idx")


# Process Dev libraries
dev_indices = dev_loci["test_idx"].tolist()
for idx in dev_indices:
    _split_library_into_subsets('Dev', 'dev', idx)

# Process Hk libraries
hk_indices = hk_loci["test_idx"].tolist()
for idx in hk_indices:
    _split_library_into_subsets('Hk', 'hk', idx)

print("\nDone!")

Dev seq_17977: all subsets created with subset_idx
Dev seq_21916: all subsets created with subset_idx
Dev seq_21289: all subsets created with subset_idx
Dev seq_3881: all subsets created with subset_idx
Dev seq_266: all subsets created with subset_idx
Dev seq_22612: all subsets created with subset_idx
Dev seq_21069: all subsets created with subset_idx
Dev seq_13748: all subsets created with subset_idx
Hk seq_31742: all subsets created with subset_idx
Hk seq_12962: all subsets created with subset_idx
Hk seq_12053: all subsets created with subset_idx
Hk seq_24723: all subsets created with subset_idx
Hk seq_12279: all subsets created with subset_idx
Hk seq_20647: all subsets created with subset_idx
Hk seq_4071: all subsets created with subset_idx
Hk seq_22627: all subsets created with subset_idx

Done!
