In [22]:
# Core imports
import os
import subprocess
import numpy as np
import pandas as pd
import h5py
import random
from urllib.request import urlretrieve
import matplotlib.pyplot as plt


# TensorFlow/Keras imports for model loading
import tensorflow as tf
from keras.models import model_from_json

# SEAM imports
import seam
from seam import Compiler, Attributer, Clusterer, MetaExplainer, Identifier
from seam.logomaker_batch.batch_logo import BatchLogo


In [23]:
# Load the Dev / Hk locus libraries written by library_selection.ipynb
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file should only ever be one produced locally by the selection notebook.
with open('../libraries/hyperparam_libraries.pkl', 'rb') as f:
    libraries = pickle.load(f)

# The pickle is indexed by task: 'dev' and 'hk'.
dev_loci, hk_loci = libraries['dev'], libraries['hk']

print(len(dev_loci), len(hk_loci))

## Load the Keras model (architecture from JSON, weights from HDF5)
import os
from urllib.request import urlretrieve
from keras.models import model_from_json

model_dir = '../models/deepstarr'
# Only guarantees the folder exists; the two model files below must already be in it.
os.makedirs(model_dir, exist_ok=True)

model_json_file, model_weights_file = (
    os.path.join(model_dir, file_name)
    for file_name in ('deepstarr.model.json', 'deepstarr.model.h5')
)

with open(model_json_file, 'r') as f:
    model_json = f.read()

# Build the architecture first, then attach the trained weights.
model = model_from_json(model_json, custom_objects={'Functional': tf.keras.Model})
model.load_weights(model_weights_file)

print("Model loaded successfully!")

## load deepSHAP
def seam_deepshap(x_mut, task_index, attribution_method='deepshap', gpu=0,
                  keras_model_json='models/deepstarr/deepstarr.model.json',
                  keras_model_weights='models/deepstarr/deepstarr.model.h5'):
    """Compute SEAM attribution maps for a set of sequences.

    Parameters
    ----------
    x_mut : array-like
        Sequences to attribute. They are passed to ``Attributer.compute`` as
        both ``x`` and ``x_ref``.
        NOTE(review): the mutated sequences double as their own reference;
        confirm this is intended rather than passing the wild-type sequence.
    task_index : int
        Model output to explain (0 for Dev, 1 for Hk).
    attribution_method : str, optional
        Attribution method name handed to ``Attributer``. ``'deepshap'``
        (the default, and the only value the original code could reach)
        reloads the model with eager execution disabled; any other value
        uses the notebook-level ``model`` as is.
    gpu : int, optional
        GPU device number forwarded to ``Attributer.compute``.
    keras_model_json, keras_model_weights : str, optional
        Architecture / weight files reloaded for DeepSHAP.
        NOTE(review): these defaults are relative to the working directory
        and differ from the ``../models/deepstarr`` folder the notebook loads
        its model from; pass ``model_json_file`` / ``model_weights_file``
        if that is the intended location.

    Returns
    -------
    The value returned by ``Attributer.compute`` for ``x_mut``.

    Raises
    ------
    ImportError
        If DeepSHAP is requested and the ``shap`` package is not installed.
    RuntimeError
        If no model is available: the DeepSHAP reload failed and the notebook
        has no module-level ``model`` to fall back on.
    """
    import time

    x_ref = x_mut

    # The original created this folder on every call; nothing is written to it here.
    save_path = './attributions'
    os.makedirs(save_path, exist_ok=True)

    # Start from the notebook-level model. A distinct local name is used on
    # purpose: assigning to ``model`` inside this function would make it a
    # local and turn every fallback into an UnboundLocalError.
    attr_model = globals().get('model')

    if attribution_method == 'deepshap':
        try:
            # Disable eager execution first
            tf.compat.v1.disable_eager_execution()
            tf.compat.v1.disable_v2_behavior()
            print("TensorFlow eager execution disabled for DeepSHAP compatibility")

            # Import SHAP to configure handlers
            try:
                import shap
            except ImportError:
                print("ERROR: SHAP package is not installed.")
                print("To install SHAP for DeepSHAP attribution, run:")
                print("pip install kundajelab-shap==1")
                raise ImportError("SHAP package required for DeepSHAP attribution")

            # Handle AddV2 operation (element-wise addition) as a linear operation
            shap.explainers.deep.deep_tf.op_handlers["AddV2"] = shap.explainers.deep.deep_tf.passthrough

            # Load the model after eager execution is disabled
            with open(keras_model_json) as json_file:
                keras_model = model_from_json(
                    json_file.read(), custom_objects={'Functional': tf.keras.Model}
                )
            np.random.seed(113)
            random.seed(0)
            keras_model.load_weights(keras_model_weights)
            attr_model = keras_model

            # Rebuild model to ensure proper graph construction
            _ = attr_model(tf.keras.Input(shape=attr_model.input_shape[1:]))

        except ImportError:
            raise
        except Exception as e:
            # Best effort: keep going with whichever model is available.
            print(f"Warning: Could not setup TensorFlow for DeepSHAP. Error: {e}")
            print("DeepSHAP may not work properly.")

    if attr_model is None:
        raise RuntimeError(
            "No model available for attribution: loading "
            f"{keras_model_json!r} failed and no notebook-level `model` is defined."
        )

    if attribution_method == 'deepshap':
        def deepstarr_compress(x):
            """DeepSTARR compression function for DeepSHAP."""
            if hasattr(x, 'outputs'):
                return tf.reduce_sum(x.outputs[task_index], axis=-1)
            else:
                return x

        attributer_kwargs = {'compress_fun': deepstarr_compress}
        batch_size = 16
    else:
        # Use unified Attributer for other methods
        attributer_kwargs = {
            'compress_fun': lambda x: x,
            'pred_fun': attr_model.predict_on_batch,
        }
        batch_size = 256

    attributer = Attributer(
        attr_model,
        method=attribution_method,
        task_index=task_index,
        **attributer_kwargs
    )

    attributer.show_params(attribution_method)

    t1 = time.time()
    attributions = attributer.compute(
        x_ref=x_ref,
        x=x_mut,
        save_window=None,
        batch_size=batch_size,
        gpu=gpu,
    )
    t2 = time.time() - t1
    print(f'Attribution time: {t2/60:.2f} minutes')

    return attributions

6 8
Model loaded successfully!


In [24]:
import h5py
import numpy as np

# Save each sequence's library to a single HDF5 file
def save_library(filepath, sequences, predictions, original_idx):
    """Write one library to ``filepath`` as a gzip-compressed HDF5 file.

    Parameters
    ----------
    filepath : str or path-like
        Destination file; an existing file is overwritten (mode ``'w'``).
        Missing parent directories are created first.
    sequences, predictions : array-like
        Stored as the ``'sequences'`` and ``'predictions'`` datasets.
    original_idx :
        Stored as the ``'original_idx'`` file attribute.

    The number of sequences is stored as the ``'n_samples'`` attribute.
    """
    import os

    # h5py does not create missing folders, so a fresh checkout would fail
    # on the first write without this.
    parent_dir = os.path.dirname(os.fspath(filepath))
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with h5py.File(filepath, 'w') as f:
        f.create_dataset('sequences', data=sequences, compression='gzip', compression_opts=4)
        f.create_dataset('predictions', data=predictions, compression='gzip', compression_opts=4)
        f.attrs['original_idx'] = original_idx
        f.attrs['n_samples'] = len(sequences)

In [27]:
## Create the 100K mutagenesis library for each sequence: DEV
## (the smaller 75K ... 1K subsets are drawn from these files in a later cell)
import squid

task_index = 0  # 0 for dev, 1 for hk
x_seqs = dev_loci["ohe_seq"]
seq_indices = dev_loci["test_idx"]  # identifier used in each output file name
print(x_seqs.shape)

seq_length = 249   # third positional argument to InSilicoMAVE; also the end of mut_window
num_sim = 100000   # library size; matches the "100K" in the output file name

# h5py cannot create missing parent folders, so make sure the output folder exists.
os.makedirs('mutagenisis_library', exist_ok=True)

for x_seq, idx in zip(x_seqs, seq_indices):

    x_seq = np.array(x_seq)

    pred_generator = squid.predictor.ScalarPredictor(
            pred_fun=model.predict_on_batch,
            task_idx=task_index,
            batch_size=512
        )

    # NOTE(review): rebuilt for every sequence with the same seed, as in the
    # original; do not hoist without confirming when squid applies the seed.
    mut_generator = squid.mutagenizer.RandomMutagenesis(
        mut_rate=0.10,
        seed=42
    )

    mave = squid.mave.InSilicoMAVE(
            mut_generator,
            pred_generator,
            seq_length,
            mut_window=[0, seq_length]
        )

    x_mut, y_mut = mave.generate(x_seq, num_sim=num_sim)

    save_library(f'mutagenisis_library/dev_seq_{idx}_100K.h5', x_mut, y_mut, idx)

(6,)

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28713.48it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.67it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27201.95it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 40.01it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27264.86it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.37it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 26750.51it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.99it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27420.52it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.70it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28594.93it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.64it/s]


In [28]:
## Create the 100K mutagenesis library for each sequence: Hk
## (the smaller 75K ... 1K subsets are drawn from these files in a later cell)
import squid

task_index = 1  # 0 for dev, 1 for hk
x_seqs = hk_loci["ohe_seq"]
seq_indices = hk_loci["test_idx"]  # identifier used in each output file name
print(x_seqs.shape)

seq_length = 249   # third positional argument to InSilicoMAVE; also the end of mut_window
num_sim = 100000   # library size; matches the "100K" in the output file name

# h5py cannot create missing parent folders, so make sure the output folder exists.
os.makedirs('mutagenisis_library', exist_ok=True)

for x_seq, idx in zip(x_seqs, seq_indices):

    x_seq = np.array(x_seq)

    pred_generator = squid.predictor.ScalarPredictor(
            pred_fun=model.predict_on_batch,
            task_idx=task_index,
            batch_size=512
        )

    # NOTE(review): rebuilt for every sequence with the same seed, as in the
    # original; do not hoist without confirming when squid applies the seed.
    mut_generator = squid.mutagenizer.RandomMutagenesis(
        mut_rate=0.10,
        seed=42
    )

    mave = squid.mave.InSilicoMAVE(
            mut_generator,
            pred_generator,
            seq_length,
            mut_window=[0, seq_length]
        )

    x_mut, y_mut = mave.generate(x_seq, num_sim=num_sim)

    save_library(f'mutagenisis_library/hk_seq_{idx}_100K.h5', x_mut, y_mut, idx)

(8,)

Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 26964.20it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.55it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 26676.32it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.72it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28842.15it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.10it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27937.60it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.52it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 27783.66it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 39.83it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 28689.71it/s]
Inference: 100%|██████████| 195/195 [00:05<00:00, 38.78it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 26400.77it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 41.07it/s]



Building in silico MAVE...


Mutagenesis: 100%|██████████| 100000/100000 [00:03<00:00, 29060.52it/s]
Inference: 100%|██████████| 195/195 [00:04<00:00, 40.68it/s]


In [29]:
import os
import h5py
import numpy as np

# Subset sizes drawn from each full library, largest first.
subset_sizes = [75000, 50000, 25000, 10000, 5000, 1000]
# File-name labels, one per size, expressed in thousands ('75K', ..., '1K').
size_labels = [f'{n_rows // 1000}K' for n_rows in subset_sizes]

def load_full_library(filepath):
    """Read a saved library back from an HDF5 file.

    Returns a ``(sequences, predictions, original_idx)`` tuple: the
    ``'sequences'`` and ``'predictions'`` datasets read in full, and the
    ``'original_idx'`` file attribute.
    """
    with h5py.File(filepath, 'r') as h5_in:
        # Slice with [:] while the file is still open so the data outlives it.
        sequences = h5_in['sequences'][:]
        predictions = h5_in['predictions'][:]
        original_idx = h5_in.attrs['original_idx']
    return sequences, predictions, original_idx

def save_subset(filepath, sequences, predictions, original_idx):
    """Write a (sub)library to ``filepath`` as a gzip-compressed HDF5 file.

    ``sequences`` and ``predictions`` become datasets of the same name;
    ``original_idx`` and the number of sequences (``'n_samples'``) are stored
    as file attributes. An existing file at ``filepath`` is overwritten.
    """
    gzip_level_4 = {'compression': 'gzip', 'compression_opts': 4}
    datasets = (('sequences', sequences), ('predictions', predictions))

    with h5py.File(filepath, 'w') as h5_out:
        for dataset_name, values in datasets:
            h5_out.create_dataset(dataset_name, data=values, **gzip_level_4)
        h5_out.attrs['original_idx'] = original_idx
        h5_out.attrs['n_samples'] = len(sequences)

def _write_subset_libraries(task_label, file_prefix, seq_ids):
    """Split each sequence's full library into the nested random subsets.

    For every id in ``seq_ids`` this reads
    ``mutagenisis_library/{file_prefix}_seq_{id}_100K.h5``, re-saves it as
    ``mutagenisis_library/{task_label}/seq_{id}/100K.h5`` and writes one file
    per entry of ``subset_sizes`` / ``size_labels`` next to it.
    """
    for idx in seq_ids:
        # Create directory for this sequence
        seq_dir = f'mutagenisis_library/{task_label}/seq_{idx}'
        os.makedirs(seq_dir, exist_ok=True)

        src_path = f'mutagenisis_library/{file_prefix}_seq_{idx}_100K.h5'
        seqs, preds, orig_idx = load_full_library(src_path)

        # Copy the full library directly (no subsetting)
        save_subset(f'{seq_dir}/100K.h5', seqs, preds, orig_idx)

        # One shuffled ordering per sequence, sized from the loaded library
        # rather than a hard-coded 100000 so a smaller file cannot index out
        # of range. Re-seeding each time gives every sequence the same
        # permutation, and each smaller subset is a prefix of the larger ones.
        shuffled_rows = np.arange(len(seqs))
        np.random.seed(42)
        np.random.shuffle(shuffled_rows)

        for size, label in zip(subset_sizes, size_labels):
            # A size larger than the library simply selects every row.
            subset_idx = shuffled_rows[:size]
            save_subset(f'{seq_dir}/{label}.h5', seqs[subset_idx], preds[subset_idx], orig_idx)

        print(f"{task_label} seq_{idx}: all subsets created (100K, {', '.join(size_labels)})")


# Process Dev libraries
dev_indices = dev_loci["test_idx"].tolist()
_write_subset_libraries('Dev', 'dev', dev_indices)

# Process Hk libraries
hk_indices = hk_loci["test_idx"].tolist()
_write_subset_libraries('Hk', 'hk', hk_indices)

print("\nDone!")

Dev seq_17977: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Dev seq_21916: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Dev seq_21289: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Dev seq_3881: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Dev seq_266: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Dev seq_22612: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_31742: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_12962: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_12053: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_24723: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_12279: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_20647: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_4071: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)
Hk seq_22627: all subsets created (100K, 75K, 50K, 25K, 10K, 5K, 1K)

Done!
