In [16]:
# Core imports
import os
import pickle
import random
import time
from typing import Any
from urllib.request import urlretrieve

import h5py
import numpy as np
import pandas as pd

# TensorFlow/Keras imports for model loading
import tensorflow as tf
from keras.models import model_from_json

# SEAM imports
import seam
from seam import Compiler, Attributer, Clusterer, MetaExplainer

# SQUID imports for mutagenesis
import squid

In [17]:
# Load the Dev_20 library, drop the excluded sequences, and save the filtered copy.

dev_path = "/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/library_creation/Dev_20_library/Dev_20"

# Open pickle file.
# NOTE(review): unpickling executes arbitrary code; only load files produced by this project.
path = os.path.join(dev_path, "dev_20_library.pkl")
dev_library_raw = pd.read_pickle(path)["dev"]
print(len(dev_library_raw))

# test_idx values of the sequences that were removed from the library.
REMOVED_TEST_IDX = [21916, 1693, 8389]

# Build the filtered frame under its own name so re-running this cell always
# starts from the raw library instead of a previously filtered `dev_pkl`.
keep_mask = ~dev_library_raw["test_idx"].isin(REMOVED_TEST_IDX)
dev_pkl = dev_library_raw[keep_mask].reset_index(drop=True)
print(len(dev_pkl))

# Save the filtered library in the data-and-models directory, keeping the
# same {'dev': DataFrame} layout as the input pickle.
save_path = "/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/data_and_models/dev_20_library/"

os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, 'dev_20_library.pkl'), 'wb') as f:
    pickle.dump({'dev': dev_pkl}, f)

23
20


In [18]:
## Load the filtered Dev_20 pickle

dev_20_path = "/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/data_and_models/dev_20_library/"

# Read through `dev_20_path` (defined here) rather than a variable left over
# from an earlier cell, so this cell also works on a fresh kernel.
# NOTE(review): unpickling executes arbitrary code; only load files produced by this project.
dev_library = pd.read_pickle(os.path.join(dev_20_path, 'dev_20_library.pkl'))
dev_pkl = dev_library["dev"]
print(len(dev_pkl))

dev_pkl.iloc[0]

20


test_idx                                                22612
sequence    TTTTAATGACTGAAATTAAAACATCATTAAGGCGAATTGGCCACCG...
activity                                             3.265582
ohe_seq     [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [...
Name: 0, dtype: object

In [19]:
# Download and load the DeepSTARR model
model_dir = "/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/data_and_models/models/"
MODEL_DIR = model_dir

# Download model files if not present
model_json_file = os.path.join(model_dir, 'deepstarr.model.json')
model_weights_file = os.path.join(model_dir, 'deepstarr.model.h5')

# (local file, download URL) pairs; `urlretrieve` comes from urllib.request.
model_downloads = [
    (model_json_file, 'https://www.dropbox.com/scl/fi/y1mwsqpv2e514md9t68jz/deepstarr.model.json?rlkey=cdwhstqf96fibshes2aov6t1e&st=9a0c5skz&dl=1'),
    (model_weights_file, 'https://www.dropbox.com/scl/fi/6nl6e2hofyw70lh99h3uk/deepstarr.model.h5?rlkey=hqfnivn199xa54bjh8dn2jpaf&st=l4jig4ky&dl=1'),
]

# The download target directory must exist before urlretrieve writes into it.
os.makedirs(model_dir, exist_ok=True)

for local_file, url in model_downloads:
    if not os.path.exists(local_file):
        print(f"Downloading {os.path.basename(local_file)}...")
        urlretrieve(url, local_file)
    else:
        print(f"Using existing {local_file}")

# Load the model architecture from JSON
with open(model_json_file, 'r') as f:
    model_json = f.read()

model = model_from_json(model_json, custom_objects={'Functional': tf.keras.Model})

# Set random seeds for reproducibility
np.random.seed(113)
random.seed(0)

# Load the model weights
model.load_weights(model_weights_file)
num_tasks = 2  # Dev [0] and Hk [1]

alphabet = ['A','C','G','T']

# Reference input: the first Dev_20 sequence with a leading batch axis added.
x_ref = dev_pkl.iloc[0]["ohe_seq"]
x_ref = np.expand_dims(x_ref, 0)

# Define mutagenesis window for sequence
seq_length = x_ref.shape[1]
mut_window = [0, seq_length]  # [start_position, stop_position]
print("\nModel loaded successfully!")

# Forward pass to get output for the specific head
output = model(x_ref)

# Predict once and unpack both heads (Dev, Hk) instead of running the model twice.
predd, predh = model.predict(x_ref)
print(f"\nWild-type predictions: {predd[0][0], predh[0][0]}")
print(f"Model input shape: {model.input_shape}")
print(f"Model output shape: {model.output_shape}")

Using existing /grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/data_and_models/models/deepstarr.model.json
Using existing /grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/data_and_models/models/deepstarr.model.h5


2026-01-25 21:25:32.498926: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_73_1/beta/Assign' id:1573 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_73_1/beta/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_73_1/beta, batch_normalization_73_1/beta/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Model loaded successfully!


  updates=self.state_updates,
2026-01-25 21:25:32.842703: W tensorflow/c/c_api.cc:291] Operation '{name:'Dense_Dev_1/BiasAdd' id:2155 op device:{requested: '', assigned: ''} def:{{{node Dense_Dev_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](Dense_Dev_1/MatMul, Dense_Dev_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Wild-type predictions: (3.2655823, 0.6504629)
Model input shape: (None, 249, 4)
Model output shape: [(None, 1), (None, 1)]


In [20]:
# Helper function to save library to HDF5
def save_library(filepath, sequences, predictions, original_idx):
    """Write a mutagenesis library to an HDF5 file (overwriting any existing file).

    Datasets written (all gzip-compressed, level 4):
        sequences      -- `sequences` as given
        predictions    -- `predictions` as given
        library_index  -- 0 .. len(sequences)-1, for consistent subsetting

    File attributes written: `original_idx` and `n_samples`.
    """
    count = len(sequences)
    gzip_kwargs = dict(compression='gzip', compression_opts=4)
    datasets = (
        ('sequences', sequences),
        ('predictions', predictions),
        ('library_index', np.arange(count)),
    )
    with h5py.File(filepath, 'w') as h5_out:
        for dataset_name, values in datasets:
            h5_out.create_dataset(dataset_name, data=values, **gzip_kwargs)
        h5_out.attrs['original_idx'] = original_idx
        h5_out.attrs['n_samples'] = count

In [21]:



## DeepSHAP attribution function with checkpointing
def seam_deepshap(x_mut, task_index, checkpoint_path=None, checkpoint_every=5000):
    """Compute DeepSHAP attributions for `x_mut` in chunks, with optional checkpointing.

    A fresh Keras model is rebuilt from the JSON/weights files under MODEL_DIR
    on every call and wrapped in a SEAM `Attributer`.

    Args:
        x_mut: Sliceable collection of input sequences; attributions are
            computed for every entry along the first axis.
        task_index: Index of the model output head to attribute.
        checkpoint_path: Optional HDF5 path. If the file exists, computation
            resumes after its `last_completed_idx`; after every chunk the file
            is rewritten with all attributions so far.
        checkpoint_every: Number of samples per chunk / between checkpoints.

    Returns:
        Attributions for all samples concatenated along axis 0 (rows restored
        from a checkpoint come first).

    Raises:
        ValueError: if `x_mut` is empty.
        ImportError: if the `shap` package is not installed.
        RuntimeError: if the Keras model could not be built.

    Side effects:
        Disables TensorFlow eager execution / v2 behaviour for the process,
        reseeds the global NumPy and `random` generators, and patches the
        SHAP "AddV2" op handler.
    """
    print(f"Computing attributions for task_index: {task_index}")

    n_samples = len(x_mut)
    if n_samples == 0:
        raise ValueError("x_mut is empty; nothing to attribute")

    # Check for existing checkpoint
    start_idx = 0
    attributions_partial = None
    if checkpoint_path and os.path.exists(checkpoint_path):
        with h5py.File(checkpoint_path, 'r') as f:
            start_idx = int(f.attrs['last_completed_idx']) + 1
            attributions_partial = f['attributions'][:start_idx]
        print(f"Resuming from checkpoint at index {start_idx}")

    # If already complete, return
    if start_idx >= n_samples:
        print("Attributions already complete, loading from checkpoint")
        with h5py.File(checkpoint_path, 'r') as f:
            return f['attributions'][:]

    # Configuration
    attribution_method = 'deepshap'
    gpu = 0

    # Model paths
    keras_model_json = os.path.join(MODEL_DIR, 'deepstarr.model.json')
    keras_model_weights = os.path.join(MODEL_DIR, 'deepstarr.model.h5')

    model_local = None
    setup_error = None
    try:
        tf.compat.v1.disable_eager_execution()
        tf.compat.v1.disable_v2_behavior()
        print("TensorFlow eager execution disabled for DeepSHAP compatibility")

        try:
            import shap
        except ImportError as err:
            raise ImportError("SHAP package required for DeepSHAP attribution") from err

        shap.explainers.deep.deep_tf.op_handlers["AddV2"] = shap.explainers.deep.deep_tf.passthrough

        # Context manager so the JSON file handle is closed promptly.
        with open(keras_model_json) as json_file:
            keras_model = model_from_json(json_file.read(), custom_objects={'Functional': tf.keras.Model})
        np.random.seed(113)
        random.seed(0)
        keras_model.load_weights(keras_model_weights)
        model_local = keras_model

        _ = model_local(tf.keras.Input(shape=model_local.input_shape[1:]))

    except ImportError:
        raise
    except Exception as e:
        # Best effort: a failure after the model was built is only a warning.
        setup_error = e
        print(f"Warning: Could not setup TensorFlow for DeepSHAP. Error: {e}")
        print("DeepSHAP may not work properly.")

    if model_local is None:
        # Without a model the Attributer cannot be created; fail with the real
        # cause instead of a NameError further down.
        raise RuntimeError("Could not build the Keras model for DeepSHAP") from setup_error

    def deepstarr_compress(x):
        # Reduce the selected head to one scalar per example when given a
        # model-like object; pass anything else through unchanged.
        if hasattr(x, 'outputs'):
            return tf.reduce_sum(x.outputs[task_index], axis=-1)
        else:
            return x

    attributer = Attributer(
        model_local,
        method=attribution_method,
        task_index=task_index,
        compress_fun=deepstarr_compress
    )

    attributer.show_params(attribution_method)

    t1 = time.time()

    # Process in chunks with checkpointing
    all_attributions = []

    # Add previously computed attributions if resuming
    if attributions_partial is not None:
        all_attributions.append(attributions_partial)

    for chunk_start in range(start_idx, n_samples, checkpoint_every):
        chunk_end = min(chunk_start + checkpoint_every, n_samples)
        print(f"\nProcessing samples {chunk_start} to {chunk_end} of {n_samples}")

        x_chunk = x_mut[chunk_start:chunk_end]

        # The chunk itself is passed as x_ref, as before.
        # NOTE(review): with save_window=None x_ref appears to be unused — confirm.
        chunk_attributions = attributer.compute(
            x_ref=x_chunk,
            x=x_chunk,
            save_window=None,
            batch_size=64,
            gpu=gpu,
        )

        all_attributions.append(chunk_attributions)

        # Save checkpoint
        if checkpoint_path:
            attributions_so_far = np.concatenate(all_attributions, axis=0)
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:  # dirname is '' for a bare filename
                os.makedirs(checkpoint_dir, exist_ok=True)
            # Write to a temporary file and swap it in, so an interruption
            # mid-write cannot corrupt the previous checkpoint.
            tmp_path = checkpoint_path + '.tmp'
            with h5py.File(tmp_path, 'w') as f:
                f.create_dataset('attributions', data=attributions_so_far, compression='gzip', compression_opts=4)
                f.attrs['last_completed_idx'] = chunk_end - 1
                f.attrs['n_samples'] = n_samples
            os.replace(tmp_path, checkpoint_path)
            print(f"Checkpoint saved at index {chunk_end - 1}")

    attributions = np.concatenate(all_attributions, axis=0)

    t2 = time.time() - t1
    print(f'Attribution time: {t2/60:.2f} minutes')

    return attributions


### Helper functions

def load_library_25k(seq_idx, mut_rate=None):
    """Load the 25K mutagenesis library for one Dev_20 sequence and mutation rate.

    Args:
        seq_idx: Sequence identifier used in the `seq_{seq_idx}` directory name.
        mut_rate: Mutation rate as a fraction (e.g. 0.25); it is multiplied by
            100 to form the `<rate>%` directory name. When omitted, the
            module-level variable `mut_rate` is used, which keeps existing
            one-argument calls working.

    Returns:
        Tuple `(sequences, predictions, original_idx, library_index)`.
        `library_index` falls back to `np.arange(len(sequences))` when the
        file has no such dataset.

    Raises:
        NameError: if `mut_rate` is omitted and no module-level `mut_rate` exists.
    """
    if mut_rate is None:
        # Backward compatibility: earlier callers relied on the global loop variable.
        try:
            mut_rate = globals()['mut_rate']
        except KeyError:
            raise NameError("name 'mut_rate' is not defined; pass mut_rate explicitly") from None

    filepath = f'/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/b_mutation_rate_sweep/seq_libraries/mut_sweep/Dev/seq_{seq_idx}/{mut_rate*100}%/25K.h5'
    with h5py.File(filepath, 'r') as f:
        sequences = f['sequences'][:]
        predictions = f['predictions'][:]
        original_idx = f.attrs['original_idx']
        library_index = f['library_index'][:] if 'library_index' in f else np.arange(len(sequences))
    return sequences, predictions, original_idx, library_index


def create_subset_indices(library_index, subset_size, seed=42):
    """Return the first `subset_size` entries of a seeded shuffle of `library_index`.

    The input is copied, so `library_index` itself is never modified. A private
    `RandomState(seed)` is used instead of reseeding the global NumPy generator;
    it produces the same permutation as `np.random.seed(seed)` followed by
    `np.random.shuffle`, but leaves the caller's random state untouched.

    Args:
        library_index: Indices to sample from (anything with `.copy()` that
            `shuffle` accepts, e.g. a 1-D array).
        subset_size: Number of shuffled entries to keep.
        seed: Seed for the shuffle.

    Returns:
        The shuffled copy truncated to `subset_size` entries.
    """
    indices = library_index.copy()
    np.random.RandomState(seed).shuffle(indices)
    return indices[:subset_size]


def save_attributions(filepath, attributions, original_idx, subset_idx=None):
    """Save attributions to an HDF5 file, overwriting any existing file.

    Args:
        filepath: Destination path; its parent directory is created if needed.
        attributions: Array stored as the gzip-compressed `attributions` dataset.
        original_idx: Stored as the `original_idx` file attribute.
        subset_idx: Optional indices stored as the `subset_idx` dataset.

    The file attribute `n_samples` is set to `len(attributions)`.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # dirname is '' for a bare filename, which makedirs rejects
        os.makedirs(parent_dir, exist_ok=True)
    with h5py.File(filepath, 'w') as f:
        f.create_dataset('attributions', data=attributions, compression='gzip', compression_opts=4)
        if subset_idx is not None:
            f.create_dataset('subset_idx', data=subset_idx, compression='gzip', compression_opts=4)
        f.attrs['original_idx'] = original_idx
        f.attrs['n_samples'] = len(attributions)


def load_attributions(filepath):
    """Read and return the full `attributions` dataset from an HDF5 file."""
    with h5py.File(filepath, 'r') as h5_in:
        stored = h5_in['attributions'][:]
    return stored


def attributions_exist(seq_idx):
    """Return True if the `100K.h5` deepSHAP attribution file exists for `seq_idx`.

    The path is built under the module-level `RESULTS_DIR`.
    """
    target = f'{RESULTS_DIR}/attribution_maps/deepSHAP/Dev/seq_{seq_idx}/100K.h5'
    found = os.path.exists(target)
    return found


def all_attributions_exist(seq_idx):
    """Return True only if every per-size attribution file exists for `seq_idx`.

    One `<size_label>.h5` file is expected per key of the module-level
    `subset_sizes` mapping, under `RESULTS_DIR`.
    """
    return all(
        os.path.exists(f'{RESULTS_DIR}/attribution_maps/deepSHAP/Dev/seq_{seq_idx}/{size_label}.h5')
        for size_label in subset_sizes.keys()
    )


In [None]:
# Generate 25K mutagenesis libraries for each Dev_20 sequence
# and sweep through mutation rates

# Mutation rates as fractions. The last two entries are 5% and 1%, matching
# the rates the clustering step expects; `.5` and `.1` would only repeat the
# 50% and 10% runs.
mutation_rates = [.75, .50, .25, .10, .05, .01]
lib_size = 25000
task_index = 0  # 0 for Dev

LIBRARY_ROOT = '/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/b_mutation_rate_sweep/seq_libraries/mut_sweep'

x_seqs = dev_pkl["ohe_seq"]
seq_indices = dev_pkl["test_idx"]


def _is_final_attribution_file(path):
    """Return True if `path` is a finished attribution file.

    Files written by `save_attributions` carry an `original_idx` attribute;
    checkpoint files written by `seam_deepshap` do not.
    """
    if not os.path.exists(path):
        return False
    with h5py.File(path, 'r') as h5_in:
        return 'attributions' in h5_in and 'original_idx' in h5_in.attrs


# NOTE: `mut_rate` must stay the module-level loop variable, because
# `load_library_25k(idx)` reads it when called with a single argument.
for mut_rate in mutation_rates:
    mut_label = f'{mut_rate*100}%'

    # --- 1) Mutagenesis libraries -------------------------------------------
    for i, (x_seq, idx) in enumerate(zip(x_seqs, seq_indices)):
        output_dir = f'{LIBRARY_ROOT}/Dev/seq_{idx}/{mut_label}/'
        output_file = os.path.join(output_dir, '25K.h5')

        # Check if library already exists
        if os.path.exists(output_file):
            print(f"Skipping seq_{idx} - already exists")
            continue

        os.makedirs(output_dir, exist_ok=True)

        x_seq = np.array(x_seq)
        seq_length = x_seq.shape[0]  # derive the length from the data

        # Create predictor
        pred_generator = squid.predictor.ScalarPredictor(
            pred_fun=model.predict_on_batch,
            task_idx=task_index,
            batch_size=512
        )

        # Create mutagenizer (re-created per sequence so each library starts from seed 42)
        mut_generator = squid.mutagenizer.RandomMutagenesis(
            mut_rate=mut_rate,
            seed=42
        )

        # Create MAVE
        mave = squid.mave.InSilicoMAVE(
            mut_generator,
            pred_generator,
            seq_length=seq_length,
            mut_window=[0, seq_length]
        )

        # Generate the mutant sequences and their predictions
        x_mut, y_mut = mave.generate(x_seq, num_sim=lib_size)

        # save_library also writes a `library_index` dataset (0..n-1) for subsetting.
        save_library(output_file, x_mut, y_mut, idx)
        print(f"[{i+1}/{len(x_seqs)}] Created seq_{idx}/25K.h5 with Mutation Rate {mut_rate*100}%")

    # --- 2) DeepSHAP attributions for each library ---------------------------
    for idx in seq_indices:
        output_dir = f'{LIBRARY_ROOT}/deepshap/Dev/seq_{idx}/{mut_label}/'
        output_file = os.path.join(output_dir, '25K.h5')
        # Checkpoints live in their own file so that a partially computed run
        # is never mistaken for a finished result by the skip check below.
        checkpoint_file = os.path.join(output_dir, '25K.checkpoint.h5')

        if _is_final_attribution_file(output_file):
            print(f"Skipping seq_{idx} - already exists")
            continue

        # A file at the output path without `original_idx` is a leftover
        # checkpoint from an interrupted run; move it aside and resume from it.
        if os.path.exists(output_file) and not os.path.exists(checkpoint_file):
            os.replace(output_file, checkpoint_file)

        # load library (uses the module-level `mut_rate` of this iteration)
        x_mut, _y_mut, original_idx, _library_index = load_library_25k(idx)

        # get deepshap attributions
        attributions = seam_deepshap(x_mut, task_index, checkpoint_path=checkpoint_file)

        # save attributions, then drop the checkpoint that is no longer needed
        save_attributions(output_file, attributions, original_idx)
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)

        print(f"Saved attributions for seq_{idx} with Mutation Rate {mut_rate*100}%")

print("\nDone!")

Skipping seq_22612 - already exists
Skipping seq_21069 - already exists
Skipping seq_13748 - already exists
Skipping seq_3881 - already exists
Skipping seq_2974 - already exists
Skipping seq_22386 - already exists
Skipping seq_25078 - already exists
Skipping seq_20726 - already exists
Skipping seq_28234 - already exists
Skipping seq_705 - already exists
Skipping seq_8092 - already exists
Skipping seq_30858 - already exists
Skipping seq_35428 - already exists
Skipping seq_28088 - already exists
Skipping seq_21045 - already exists
Skipping seq_21289 - already exists
Skipping seq_266 - already exists
Skipping seq_562 - already exists
Skipping seq_22498 - already exists
Skipping seq_1006 - already exists
Skipping seq_22612 - already exists
Skipping seq_21069 - already exists
Skipping seq_13748 - already exists
Skipping seq_3881 - already exists
Skipping seq_2974 - already exists
Skipping seq_22386 - already exists
Skipping seq_25078 - already exists
Skipping seq_20726 - already exists
Skip

2026-01-25 21:25:35.022630: W tensorflow/c/c_api.cc:291] Operation '{name:'Dense_2_2/bias/Assign' id:3447 op device:{requested: '', assigned: ''} def:{{{node Dense_2_2/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](Dense_2_2/bias, Dense_2_2/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000

Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 5

2026-01-25 21:32:28.626344: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_76_3/moving_variance/Assign' id:5178 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_76_3/moving_variance/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_76_3/moving_variance, batch_normalization_76_3/moving_variance/Initializer/ones)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 21:37:54.569155: W tensorflow/c/c_api.cc:291] Operation '{name:'Dense_Dev_4/bias/Assign' id:7230 op device:{requested: '', assigned: ''} def:{{{node Dense_Dev_4/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](Dense_Dev_4/bias, Dense_Dev_4/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 21:43:18.177145: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_75_5/gamma/Assign' id:8710 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_75_5/gamma/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_75_5/gamma, batch_normalization_75_5/gamma/Initializer/ones)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 21:48:42.393892: W tensorflow/c/c_api.cc:291] Operation '{name:'Conv1D_4_6/bias/Assign' id:10526 op device:{requested: '', assigned: ''} def:{{{node Conv1D_4_6/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](Conv1D_4_6/bias, Conv1D_4_6/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 21:54:06.127461: W tensorflow/c/c_api.cc:291] Operation '{name:'Dense_Dev_7/kernel/Assign' id:12715 op device:{requested: '', assigned: ''} def:{{{node Dense_Dev_7/kernel/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](Dense_Dev_7/kernel, Dense_Dev_7/kernel/Initializer/random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 21:59:29.357543: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_73_8/gamma/Assign' id:13968 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_73_8/gamma/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_73_8/gamma, batch_normalization_73_8/gamma/Initializer/ones)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 22:04:53.809919: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_77_9/moving_mean/Assign' id:16276 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_77_9/moving_mean/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_77_9/moving_mean, batch_normalization_77_9/moving_mean/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 22:10:16.615641: W tensorflow/c/c_api.cc:291] Operation '{name:'Conv1D_3_10/kernel/Assign' id:17725 op device:{requested: '', assigned: ''} def:{{{node Conv1D_3_10/kernel/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](Conv1D_3_10/kernel, Conv1D_3_10/kernel/Initializer/random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

2026-01-25 22:15:39.526209: W tensorflow/c/c_api.cc:291] Operation '{name:'batch_normalization_75_11/moving_mean/Assign' id:19700 op device:{requested: '', assigned: ''} def:{{{node batch_normalization_75_11/moving_mean/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](batch_normalization_75_11/moving_mean, batch_normalization_75_11/moving_mean/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



Parameters for deepshap:

Required:
x: array, Input sequences to compute attributions for

Optional:
background: array, Background sequences for DeepSHAP (optional). Shape: (N, L, A). If not provided, will generate shuffled backgrounds using num_shuffles.

Common Optional:
x_ref: array, Reference sequence for comparison (optional). Shape: (1, L, A). Used for padding in windowed analysis when save_window is specified. Not used for DeepSHAP background.
save_window: list, Window [start, end] to compute attributions (optional). When provided with x_ref, allows computing attributions for a subset of positions while maintaining full sequence context. Input x should contain only the windowed region with shape (N, end-start, A), and x_ref provides the full-length context with shape (1, L, A). Example: [100, 200] computes attributions for positions 100-200.

Processing samples 0 to 5000 of 25000
Done 0 examples of 5000
Done 100 examples of 5000
Done 200 examples of 5000
Done 300 examples of 50

In [None]:
### Hierarchical Clustering with 30 clusters for all seqs and mutation rates

# Parameters
# Mutation rates as fractions; get_mut_label() turns each into a '<rate*100>%' directory label.
mutation_rates = [.75, .50, .25, .10, .05, .01]
# Number of clusters named in this section's heading.
cluster_number = 30
# Root directory under which cluster metadata is written.
RESULTS_DIR = '/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/b_mutation_rate_sweep/scripts/results'

def get_mut_label(mut_rate):
    """Convert mutation rate to label string (e.g., 0.75 -> '75.0%').

    The label is used as a directory name by the loaders and savers in this
    notebook, so it has to be stable. The product is rounded to strip binary
    floating-point noise: without it 0.07 * 100 formats as
    '7.000000000000001%' and 0.29 * 100 as '28.999999999999996%'. Rates that
    already formatted cleanly (0.75, 0.5, 0.25, 0.1, 0.05, 0.01) give exactly
    the same label as before.
    """
    # 10 decimal places is far below any meaningful rate resolution but well
    # above the ~1e-15 representation error being removed.
    percent = round(mut_rate * 100, 10)
    return f"{percent}%"

def load_library_75k(seq_idx, mut_rate):
    """Read the sequence library stored for one sequence and mutation rate.

    Note: despite the ``_75k`` suffix in the name, the file opened here is
    ``25K.h5``.

    Returns a 4-tuple ``(sequences, predictions, original_idx, library_index)``.
    ``original_idx`` comes from the file attributes; ``library_index`` falls
    back to ``np.arange(len(sequences))`` when the file has no such dataset.
    """
    label = get_mut_label(mut_rate)
    library_file = f'/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/b_mutation_rate_sweep/seq_libraries/mut_sweep/Dev/seq_{seq_idx}/{label}/25K.h5'
    with h5py.File(library_file, 'r') as h5:
        sequences = h5['sequences'][:]
        predictions = h5['predictions'][:]
        original_idx = h5.attrs['original_idx']
        if 'library_index' in h5:
            library_index = h5['library_index'][:]
        else:
            # No stored index: number the rows in file order
            library_index = np.arange(len(sequences))
    return sequences, predictions, original_idx, library_index

def load_attributions_mut(seq_idx, mut_rate):
    """Read the full 'attributions' dataset from the deepshap 25K.h5 file
    for one sequence and mutation rate, and return it as an in-memory array."""
    label = get_mut_label(mut_rate)
    attribution_file = f'/grid/wsbs/home_norepl/pmantill/SEAM_revisions/SEAM_revisions/hyperparameter_selection/b_mutation_rate_sweep/seq_libraries/mut_sweep/deepshap/Dev/seq_{seq_idx}/{label}/25K.h5'
    with h5py.File(attribution_file, 'r') as h5:
        attributions = h5['attributions'][:]
    return attributions

def cluster_and_save(attrs, seq_idx, mut_rate, n_clusters=30):
    """Run ward hierarchical clustering on `attrs` and persist linkage + labels.

    Results are written to
    ``{RESULTS_DIR}/cluster_metadata/Dev/seq_{seq_idx}/{label}/``. When both
    output files are already present they are loaded and returned instead of
    recomputing (the cached labels are returned as-is, whatever `n_clusters`
    they were produced with).

    Returns:
        ``(linkage, labels)``.
    """
    import time

    label = get_mut_label(mut_rate)
    out_dir = f'{RESULTS_DIR}/cluster_metadata/Dev/seq_{seq_idx}/{label}'
    linkage_file = os.path.join(out_dir, 'hierarchical_linkage_ward.npy')
    labels_file = os.path.join(out_dir, 'cluster_labels.npy')

    # Cache hit: both artifacts exist, so reuse them
    already_done = os.path.exists(linkage_file) and os.path.exists(labels_file)
    if already_done:
        print(f"Skipping clustering for {label} - already exists")
        return np.load(linkage_file), np.load(labels_file)

    os.makedirs(out_dir, exist_ok=True)

    print(f"Clustering {len(attrs)} samples...")
    started_at = time.time()

    clusterer = Clusterer(attrs, gpu=True)
    linkage = clusterer.cluster(method='hierarchical', link_method='ward', batch_size=20000)

    elapsed_min = (time.time() - started_at) / 60
    print(f"Clustering completed in {elapsed_min:.1f} min")

    # Second element of the returned pair (the cut level) is not needed here
    labels, _ = clusterer.get_cluster_labels(linkage, criterion='maxclust', n_clusters=n_clusters)

    np.save(linkage_file, linkage)
    np.save(labels_file, labels)
    print(f"Saved clustering for {label}")

    return linkage, labels

# Cluster every (sequence, mutation rate) pair; pairs whose attributions
# cannot be loaded are reported and skipped.
for seq_idx in seq_indices:
    print(f"\n{'='*50}", f"Clustering Dev seq_{seq_idx}", f"{'='*50}", sep="\n")

    for mut_rate in mutation_rates:
        mut_label = get_mut_label(mut_rate)
        print(f"\n--- {mut_label} ---")

        try:
            attrs = load_attributions_mut(seq_idx, mut_rate)
        except Exception as e:
            print(f"Skipping {mut_label} - no attributions found: {e}")
        else:
            # Attributions loaded: cluster them (or reuse cached results)
            linkage, labels = cluster_and_save(attrs, seq_idx, mut_rate, n_clusters=cluster_number)


### MetaExplainer - Generate MSM from cluster metadata

def generate_and_save_msm(seq_idx, mut_rate, n_clusters=30, gpu=True):
    """Load saved cluster labels, generate the MSM, and write it with its companion tables.

    Outputs are written to ``{RESULTS_DIR}/CSM/Dev/seq_{seq_idx}/{label}/``:
    ``msm.csv``, ``cluster_stats.csv``, ``membership_df.csv`` and
    ``wt_cluster_info.csv``.

    Args:
        seq_idx: Sequence identifier used in the ``seq_{seq_idx}`` path component.
        mut_rate: Mutation rate; converted to the directory label with
            get_mut_label and also forwarded to MetaExplainer.
        n_clusters: Not used by this function (the saved cluster labels are
            loaded as-is); kept so existing callers keep working.
        gpu: Forwarded to Clusterer, Compiler and ``generate_msm``.

    Returns:
        The object returned by ``meta.generate_msm`` on success, or None when
        the work is skipped (msm.csv already exists, cluster metadata is
        missing, or the attributions could not be loaded).
    """
    mut_label = get_mut_label(mut_rate)

    # Define paths
    cluster_dir = f'{RESULTS_DIR}/cluster_metadata/Dev/seq_{seq_idx}/{mut_label}'
    linkage_path = os.path.join(cluster_dir, 'hierarchical_linkage_ward.npy')
    labels_path = os.path.join(cluster_dir, 'cluster_labels.npy')

    csm_dir = f'{RESULTS_DIR}/CSM/Dev/seq_{seq_idx}/{mut_label}'
    msm_path = os.path.join(csm_dir, 'msm.csv')

    # Skip if MSM already exists
    if os.path.exists(msm_path):
        print(f"Skipping seq_{seq_idx}/{mut_label} - MSM already exists")
        return None

    # Check if cluster metadata exists
    if not os.path.exists(linkage_path) or not os.path.exists(labels_path):
        print(f"Skipping seq_{seq_idx}/{mut_label} - cluster metadata not found")
        return None

    # Only the labels are needed below; the linkage file is checked for
    # existence above but not read, since nothing in this function uses it.
    labels = np.load(labels_path)

    # Load attributions. Catch Exception (not a bare except) so that
    # KeyboardInterrupt/SystemExit still propagate, and report the cause.
    try:
        attributions = load_attributions_mut(seq_idx, mut_rate)
    except Exception as e:
        print(f"Skipping seq_{seq_idx}/{mut_label} - attributions not found: {e}")
        return None

    # Load sequences and predictions (original_idx / library_index are not needed here)
    seqs, preds, _, _ = load_library_75k(seq_idx, mut_rate)

    # Get reference sequence (first sequence, index 0)
    x_ref = seqs[0:1]

    # Create Clusterer and attach the precomputed labels instead of re-clustering
    clusterer = Clusterer(attributions, gpu=gpu)
    clusterer.cluster_labels = labels

    # Create mave_df using Compiler
    compiler = Compiler(
        x=seqs,
        y=preds,
        x_ref=x_ref,
        alphabet=['A', 'C', 'G', 'T'],
        gpu=gpu
    )
    mave_df = compiler.compile()

    # Initialize MetaExplainer
    meta = MetaExplainer(
        clusterer=clusterer,
        mave_df=mave_df,
        attributions=attributions,
        sort_method='median',
        ref_idx=0,
        mut_rate=mut_rate
    )

    # Generate MSM
    msm = meta.generate_msm(gpu=gpu)

    # Save outputs
    os.makedirs(csm_dir, exist_ok=True)

    # Save MSM
    msm.to_csv(msm_path, index=False)
    print(f"Saved MSM to {msm_path}")

    # Compute and save per-cluster occupancy and DNN score statistics
    cluster_stats = []
    for k in meta.cluster_indices:
        k_mask = meta.mave['Cluster'] == k
        k_scores = meta.mave.loc[k_mask, 'DNN']
        cluster_stats.append({
            'Cluster': k,
            'Occupancy': k_mask.sum(),
            'Median_DNN': k_scores.median(),
            'Mean_DNN': k_scores.mean(),
            'Std_DNN': k_scores.std()
        })
    cluster_stats_df = pd.DataFrame(cluster_stats)
    cluster_stats_df.to_csv(os.path.join(csm_dir, 'cluster_stats.csv'), index=False)
    print(f"Saved cluster stats")

    # Add Cluster_Sorted column to membership_df: position of each original
    # cluster id within meta.cluster_order
    if meta.cluster_order is not None:
        mapping_dict = {old_k: new_k for new_k, old_k in enumerate(meta.cluster_order)}
        meta.membership_df['Cluster_Sorted'] = meta.membership_df['Cluster'].map(mapping_dict)

    # Save membership dataframe
    meta.membership_df.to_csv(os.path.join(csm_dir, 'membership_df.csv'), index=False)
    print(f"Saved membership dataframe")

    # Save WT cluster info (ref_idx=0 is the WT sequence)
    ref_cluster = meta.membership_df.loc[0, 'Cluster']
    # Fall back to the unsorted id when no sorted column was added above
    ref_cluster_sorted = meta.membership_df.loc[0, 'Cluster_Sorted'] if 'Cluster_Sorted' in meta.membership_df.columns else ref_cluster
    wt_info = pd.DataFrame({
        'ref_idx': [0],
        'WT_Cluster': [ref_cluster],
        'WT_Cluster_Sorted': [ref_cluster_sorted]
    })
    wt_info.to_csv(os.path.join(csm_dir, 'wt_cluster_info.csv'), index=False)
    print(f"Saved WT cluster info")

    return msm


# Generate (or skip, if already present) the MSM for every sequence / mutation rate pair
for seq_idx in seq_indices:
    print(f"\n{'='*50}", f"Generating MSM for Dev seq_{seq_idx}", f"{'='*50}", sep="\n")

    for mut_rate in mutation_rates:
        mut_label = get_mut_label(mut_rate)
        print(f"\n--- {mut_label} ---")
        generate_and_save_msm(seq_idx, mut_rate, n_clusters=cluster_number, gpu=True)


### Variance Summary and Correlation Analysis

def compute_and_save_variance_summary(seq_idx, mut_rate):
    """Summarise a saved MSM as the per-position variance of entropy across clusters.

    Reads ``msm.csv`` from ``{RESULTS_DIR}/CSM/Dev/seq_{seq_idx}/{label}/`` and
    writes ``variance_summary.csv`` (columns ``Position``, ``Variance``) next
    to it. The variance uses the pandas default for ``DataFrame.var``.

    Returns:
        The variance DataFrame, or None when the summary already exists or the
        MSM file is missing.
    """
    label = get_mut_label(mut_rate)

    out_dir = f'{RESULTS_DIR}/CSM/Dev/seq_{seq_idx}/{label}'
    msm_file = os.path.join(out_dir, 'msm.csv')
    summary_file = os.path.join(out_dir, 'variance_summary.csv')

    # Guard clauses: nothing to do, or nothing to work from
    if os.path.exists(summary_file):
        print(f"Skipping seq_{seq_idx}/{label} - variance summary already exists")
        return None
    if not os.path.exists(msm_file):
        print(f"Skipping seq_{seq_idx}/{label} - MSM not found")
        return None

    # Reshape to a Cluster x Position table of entropy, then take the
    # variance down each Position column (i.e. across clusters)
    per_position_variance = (
        pd.read_csv(msm_file)
        .pivot(index='Cluster', columns='Position', values='Entropy')
        .var(axis=0)
    )

    variance_df = pd.DataFrame({
        'Position': per_position_variance.index,
        'Variance': per_position_variance.values
    })
    variance_df.to_csv(summary_file, index=False)
    print(f"Saved variance summary to {summary_file}")

    return variance_df


# Compute (or skip, if already present) the variance summary for every sequence / mutation rate pair
for seq_idx in seq_indices:
    print(f"\n{'='*50}", f"Computing variance summaries for Dev seq_{seq_idx}", f"{'='*50}", sep="\n")

    for mut_rate in mutation_rates:
        mut_label = get_mut_label(mut_rate)
        print(f"\n--- {mut_label} ---")
        compute_and_save_variance_summary(seq_idx, mut_rate)


### Compute correlations with 10% reference using variance summaries
from scipy.stats import spearmanr, pearsonr

reference_mut_rate = 0.10  # Compare against 10% mutation rate

def compute_mut_rate_correlations(seq_idx):
    """Correlate each mutation rate's variance summary with the reference rate's.

    For every rate in ``mutation_rates`` that has a ``variance_summary.csv``,
    the Pearson and Spearman coefficients between its ``Variance`` column and
    that of ``reference_mut_rate`` are computed (rows are paired by their
    order in the files). The table is written to
    ``{RESULTS_DIR}/hyperparam_set_correlation/Dev/seq_{seq_idx}/correlation_with_{ref_label}.csv``.

    Returns:
        The correlation DataFrame, or None when the output already exists, the
        reference summary is missing, or fewer than two rates had summaries.
    """
    ref_label = get_mut_label(reference_mut_rate)
    csm_root = f'{RESULTS_DIR}/CSM/Dev/seq_{seq_idx}'
    output_dir = f'{RESULTS_DIR}/hyperparam_set_correlation/Dev/seq_{seq_idx}'
    corr_path = os.path.join(output_dir, f'correlation_with_{ref_label}.csv')

    # Already computed on a previous run
    if os.path.exists(corr_path):
        print(f"Skipping seq_{seq_idx} - correlations already exist")
        return None

    # The reference summary is required; without it there is nothing to compare to
    variance_ref_path = f'{csm_root}/{ref_label}/variance_summary.csv'
    if not os.path.exists(variance_ref_path):
        print(f"Skipping seq_{seq_idx} - {ref_label} variance summary not found")
        return None

    variance_ref = pd.read_csv(variance_ref_path)['Variance'].values

    # One row per mutation rate that has a summary on disk
    rows = []
    for rate in mutation_rates:
        rate_label = get_mut_label(rate)
        summary_file = f'{csm_root}/{rate_label}/variance_summary.csv'
        if os.path.exists(summary_file):
            rate_variance = pd.read_csv(summary_file)['Variance'].values
            r_pearson, _p_pearson = pearsonr(variance_ref, rate_variance)
            r_spearman, _p_spearman = spearmanr(variance_ref, rate_variance)
            rows.append({
                'Mut_Rate': rate_label,
                'Mut_Rate_Numeric': rate,
                'Pearson': r_pearson,
                'Spearman': r_spearman
            })

    if len(rows) < 2:
        print(f"Skipping seq_{seq_idx} - need at least 2 mutation rates")
        return None

    corr_df = pd.DataFrame(rows)
    os.makedirs(output_dir, exist_ok=True)
    corr_df.to_csv(corr_path, index=False)
    print(f"Saved correlations to {corr_path}")
    return corr_df

# Compute (or skip, if already present) the correlation table for every sequence
for seq_idx in seq_indices:
    print(f"\n{'='*50}", f"Computing correlations for Dev seq_{seq_idx}", f"{'='*50}", sep="\n")
    compute_mut_rate_correlations(seq_idx)

# Closing banner for the whole sweep
print("\n" + "="*50, "Mutation Rate sweep complete!", "="*50, sep="\n")