In [28]:
# Core imports
import os
import subprocess
import numpy as np
import pandas as pd
import h5py
import random
from urllib.request import urlretrieve

# TensorFlow/Keras imports for model loading
import tensorflow as tf
from keras.models import model_from_json

# SEAM imports
import seam
from seam import Compiler, Attributer, Clusterer, MetaExplainer, Identifier

In [29]:
import os
import subprocess
import numpy as np
import pandas as pd
import h5py
from urllib.request import urlretrieve

# Create data directory if it doesn't exist
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# Download data if not present
data_file = os.path.join(data_dir, 'deepstarr_data.h5')
if not os.path.exists(data_file):
    print("Downloading deepstarr_data.h5...")
    url = 'https://www.dropbox.com/scl/fi/cya4ntqk2o8yftxql52lu/deepstarr_data.h5?rlkey=5ly363vqjb3vaw2euw2dhsjo3&st=6eod6fg8&dl=1'
    # Download to a temporary name and rename only on success. Writing straight
    # to `data_file` would leave a truncated file behind after a failed or
    # interrupted download, and the existence check above would then accept it
    # on the next run. urlretrieve also avoids depending on an external `wget`.
    partial_file = data_file + '.part'
    try:
        urlretrieve(url, partial_file)
        os.replace(partial_file, data_file)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_file):
            os.remove(partial_file)
else:
    print(f"Using existing {data_file}")

# Load the test sequences into memory as float32; the file is closed afterwards
with h5py.File(data_file, 'r') as dataset:
    X_test = np.array(dataset['x_test']).astype(np.float32)

# Create a one-row summary dataframe describing the loaded array
df = pd.DataFrame({
    'shape': [X_test.shape],
    'dtype': [X_test.dtype],
    'min': [X_test.min()],
    'max': [X_test.max()],
    'num_sequences': [X_test.shape[0]]
})

print("\nData loaded successfully!")
print(f"X_test shape: {X_test.shape}")
print("\nSummary:")
print(df)
df.keys()

Using existing data/deepstarr_data.h5

Data loaded successfully!
X_test shape: (41186, 249, 4)

Summary:
             shape    dtype  min  max  num_sequences
0  (41186, 249, 4)  float32  0.0  1.0          41186


Index(['shape', 'dtype', 'min', 'max', 'num_sequences'], dtype='object')

In [30]:
# Download and load the DeepSTARR model
model_dir = '../data'
# urlretrieve writes straight to the target path and cannot create missing
# parent directories, so make sure the directory exists before downloading.
os.makedirs(model_dir, exist_ok=True)

# Download model files if not present
model_json_file = os.path.join(model_dir, 'deepstarr.model.json')
model_weights_file = os.path.join(model_dir, 'deepstarr.model.h5')

if not os.path.exists(model_json_file):
    print("Downloading deepstarr.model.json...")
    url = 'https://www.dropbox.com/scl/fi/y1mwsqpv2e514md9t68jz/deepstarr.model.json?rlkey=cdwhstqf96fibshes2aov6t1e&st=9a0c5skz&dl=1'
    urlretrieve(url, model_json_file)
else:
    print(f"Using existing {model_json_file}")

if not os.path.exists(model_weights_file):
    print("Downloading deepstarr.model.h5...")
    url = 'https://www.dropbox.com/scl/fi/6nl6e2hofyw70lh99h3uk/deepstarr.model.h5?rlkey=hqfnivn199xa54bjh8dn2jpaf&st=l4jig4ky&dl=1'
    urlretrieve(url, model_weights_file)
else:
    print(f"Using existing {model_weights_file}")

# Load the model architecture from JSON
with open(model_json_file, 'r') as f:
    model_json = f.read()

model = model_from_json(model_json, custom_objects={'Functional': tf.keras.Model})

# Set random seeds for reproducibility (later cells draw from np.random)
np.random.seed(113)
random.seed(0)

# Load the model weights
model.load_weights(model_weights_file)
num_tasks = 2  # Dev [0] and Hk [1]

alphabet = ['A','C','G','T']

# Reference sequence: first test sequence, with a leading batch axis added
x_ref = X_test[0]
x_ref = np.expand_dims(x_ref, 0)

# Define mutagenesis window for sequence
seq_length = x_ref.shape[1]
mut_window = [0, seq_length]  # [start_position, stop_position]
print("\nModel loaded successfully!")

# Direct forward pass on the reference sequence (result is not used further in this cell)
output = model(x_ref)

# Run predict once and read both heads from the same result instead of
# calling predict a second time for the second head.
wt_pred = model.predict(x_ref)
predd, predh = wt_pred[0], wt_pred[1]
print(f"\nWild-type predictions: {predd[0][0], predh[0][0]}")
print(f"Model input shape: {model.input_shape}")
print(f"Model output shape: {model.output_shape}")

Using existing ../data/deepstarr.model.json
Using existing ../data/deepstarr.model.h5

Model loaded successfully!

Wild-type predictions: (2.8394015, 0.8605407)
Model input shape: (None, 249, 4)
Model output shape: [(None, 1), (None, 1)]


In [31]:
import os
from urllib.request import urlretrieve
from keras.models import model_from_json

# Directory that holds the DeepSTARR model files; created on demand
model_dir = 'models/deepstarr'
os.makedirs(model_dir, exist_ok=True)

# Where the architecture (JSON) and the weights (HDF5) are stored
model_json_file = os.path.join(model_dir, 'deepstarr.model.json')
model_weights_file = os.path.join(model_dir, 'deepstarr.model.h5')

# Architecture: reuse the local copy when there is one, otherwise fetch it
if os.path.exists(model_json_file):
    print(f"Using existing model architecture: {model_json_file}")
else:
    print("Downloading deepstarr.model.json...")
    url = 'https://www.dropbox.com/scl/fi/y1mwsqpv2e514md9t68jz/deepstarr.model.json?rlkey=cdwhstqf96fibshes2aov6t1e&st=9a0c5skz&dl=1'
    urlretrieve(url, model_json_file)
    print(f"Saved to {model_json_file}")

# Weights: same reuse-or-fetch logic
if os.path.exists(model_weights_file):
    print(f"Using existing model weights: {model_weights_file}")
else:
    print("Downloading deepstarr.model.h5...")
    url = 'https://www.dropbox.com/scl/fi/6nl6e2hofyw70lh99h3uk/deepstarr.model.h5?rlkey=hqfnivn199xa54bjh8dn2jpaf&st=l4jig4ky&dl=1'
    urlretrieve(url, model_weights_file)
    print(f"Saved to {model_weights_file}")

# Rebuild the network from its JSON description, then attach the weights.
# This rebinds `model` to the copy stored under `model_dir`.
with open(model_json_file, 'r') as json_fh:
    model_json = json_fh.read()

model = model_from_json(
    model_json,
    custom_objects={'Functional': tf.keras.Model},
)
model.load_weights(model_weights_file)

print("✓ Model loaded successfully!")

Using existing model architecture: models/deepstarr/deepstarr.model.json
Using existing model weights: models/deepstarr/deepstarr.model.h5
✓ Model loaded successfully!


In [32]:
## Paper sequences, addressed by their row index in X_test

paper_loci_idx = [20647, 22612, 4071, 22627, 21069, 13748]
# Parallel to paper_loci_idx: truthy entries are later grouped with the Dev
# loci, falsy entries with the Hk loci.
dev_mask = [0, 1, 0, 0, 1, 1]

# Pull each listed sequence out of the test set, keeping the listed order
paper_loci = [X_test[locus_idx] for locus_idx in paper_loci_idx]

paper_loci[0].shape

(249, 4)

In [33]:
## Randomly select 5 sequences with high predicted activity (> threshold) for each head
heads = [0, 1]      # index into the model outputs: 0 = Dev, 1 = Hk
n_per_head = 5      # number of random loci to collect per head
threshold = 2       # a locus qualifies when its predicted activity exceeds this

dev_loci, dev_pred, dev_idx = [], [], []
hk_loci, hk_pred, hk_idx = [], [], []

# Per-head label and result containers, so one loop body serves both heads
collectors = {
    0: ("dev", dev_loci, dev_idx, dev_pred),
    1: ("hk", hk_loci, hk_idx, hk_pred),
}

# Upper bound on random draws per head, so the search terminates even when
# fewer than n_per_head distinct sequences exceed the threshold.
max_draws_per_head = 10 * len(X_test)

for i in heads:
    print("Starting or switching heads")
    label, loci, idxs, preds = collectors[i]
    draws = 0
    while len(loci) < n_per_head:
        if draws >= max_draws_per_head:
            raise RuntimeError(
                f"Only found {len(loci)} of {n_per_head} {label} loci with activity > {threshold} "
                f"after {draws} random draws"
            )
        draws += 1

        # Same draw as before (uniform float floored to an int), so the random
        # stream under a given seed is consumed in the same way.
        idx = int(np.random.uniform(0, len(X_test)))
        if idx in idxs:
            # Draws are with replacement; skip a locus already picked for this head
            continue

        seq = np.expand_dims(X_test[idx], 0)  # add a batch axis for predict
        pred = model.predict(seq, verbose=0)[i][0][0]
        if pred > threshold:
            print(idx)
            print(pred)
            loci.append(seq[0])
            idxs.append(idx)
            preds.append(pred)
    print(f"{n_per_head} {label} found")


print(f"We have isolated {len(dev_loci)} dev loci and {len(hk_loci)} hk loci with activity > {threshold}")
print(hk_pred)


Starting or switching heads
17977
2.9907937
21916
2.2963743
21289
2.7399046
3881
2.1586962
266
3.8290648
5 dev found
Starting or switching heads
31742
3.5934138
12962
2.2718976
12053
5.3012533
24723
4.471285
12279
3.426753
5 hk found
We have isolated 5 dev loci and5 hk loci with activity > 2
[3.5934138, 2.2718976, 5.3012533, 4.471285, 3.426753]


In [34]:
## Add random and paper together for each head
# NOTE: this cell extends dev_*/hk_* in place, so re-running it without first
# re-running the random-selection cell appends the paper loci a second time.

dev_mask_bool = np.array(dev_mask, dtype=bool)
paper_loci_array = np.array(paper_loci)

# Split the paper loci by head, keeping them as ndarrays
dev_paper_array = paper_loci_array[dev_mask_bool]
hk_paper_array = paper_loci_array[~dev_mask_bool]

# Store one ndarray per locus. `.tolist()` would turn every locus into nested
# Python lists, so the combined lists (and the library's one-hot column) would
# mix ndarrays from the random loci with lists from the paper loci.
dev_paper_loci = list(dev_paper_array)
hk_paper_loci = list(hk_paper_array)

# Model output 0 is the Dev head, output 1 is the Hk head.
# flatten() already yields a 1-D array; an extra squeeze would collapse a
# single prediction to a 0-d array, which cannot be passed to list.extend.
dev_paper_pred = model.predict(dev_paper_array)[0].flatten()
hk_paper_pred = model.predict(hk_paper_array)[1].flatten()

dev_loci.extend(dev_paper_loci)
hk_loci.extend(hk_paper_loci)

print(hk_paper_pred.shape)
dev_pred.extend(dev_paper_pred)
hk_pred.extend(hk_paper_pred)

# Test-set indices of the paper loci, split with the same mask
dev_paper_idx = [idx for idx, is_dev in zip(paper_loci_idx, dev_mask) if is_dev]
hk_paper_idx = [idx for idx, is_dev in zip(paper_loci_idx, dev_mask) if not is_dev]

dev_idx.extend(dev_paper_idx)
hk_idx.extend(hk_paper_idx)

print(f'Paper loci has added {len(dev_paper_loci)} dev seqs and {len(hk_paper_loci)} hk seqs')
print(f'Total is now Dev: {len(dev_loci)} and Hk: {len(hk_loci)}')





(3,)
Paper loci has added 3 dev seqs and 3 hk seqs
Total is now Dev: 8 and Hk: 8


In [35]:
## Save library
from seam.utils import oh2seq

# Convert every one-hot locus to its sequence string with oh2seq.
# The result is not bound to the name `str`: doing so would shadow the builtin
# and break any later str(...) call in this kernel session.
string_seqs_dev = [oh2seq(np.squeeze(locus)) for locus in dev_loci]
string_seqs_hk = [oh2seq(np.squeeze(locus)) for locus in hk_loci]

# One row per locus: test-set index, sequence string, predicted activity for
# this head, and the one-hot encoding itself.
dev_hyperparam_library = pd.DataFrame({
    "test_idx": dev_idx,
    "sequence": string_seqs_dev,
    "activity": dev_pred,
    "ohe_seq": dev_loci
})
print(len(hk_pred))

hk_hyperparam_library = pd.DataFrame({
    "test_idx": hk_idx,
    "sequence": string_seqs_hk,
    "activity": hk_pred,
    "ohe_seq": hk_loci
})


hk_hyperparam_library.head(10)





8


Unnamed: 0,test_idx,sequence,activity,ohe_seq
0,31742,ACCATCGGGTAGTGCCGCTGATTGCAGCACAGCTGATCACGTTGCC...,3.593414,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
1,12962,ACGCAAAGGTATAATTAGATACAATGAAAATAAGTTTCTTGCATGC...,2.271898,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
2,12053,TATCCAGTCGGTGACCTGGTCGGGCGTCCACTCGGCGATGTTGATG...,5.301253,"[[0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0], [..."
3,24723,CCTCACCGCCTAAAACAACAAGCGCATATGTTTGGCTTATCGATAG...,4.471285,"[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
4,12279,TCCGTTTTCTAGCCGTTTAATAGCTAGAGCTCCATCACTGTCGGCG...,3.426753,"[[0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [..."
5,20647,ATAACTTTAATAGCAAGCGAGTCTCTTTATTATCAAATCGCTTAAT...,5.298523,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
6,4071,AATAATCAAGGCGCGGCTGGCATCATCTTCCTCACTCCGATCCGCG...,-0.491585,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
7,22627,GTTAACTAGCTATGCGAGTACAACTTGTAAATAGAACATTCAAATT...,3.446424,"[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."


In [36]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing.
library_path = 'libraries/hyperparam_libraries.pkl'
os.makedirs(os.path.dirname(library_path), exist_ok=True)

# Save both libraries together in one pickle, keyed by head
with open(library_path, 'wb') as f:
    pickle.dump({
        'dev': dev_hyperparam_library,
        'hk': hk_hyperparam_library
    }, f)

# Load later (only unpickle files you trust: pickle.load can execute arbitrary code)
# with open('libraries/hyperparam_libraries.pkl', 'rb') as f:
#     libraries = pickle.load(f)
#     dev_lib = libraries['dev']
#     hk_lib = libraries['hk']