# Libraries, paths, functions, and constants

In [1]:
!nvidia-smi

Fri Apr  4 12:22:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.77                 Driver Version: 565.77         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40                     Off |   00000000:28:00.0 Off |                    0 |
| N/A   45C    P0             89W /  300W |       1MiB /  46068MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

* libraries

In [2]:
import os

import sys
sys.path.append(f"../../3_train_and_test_models")

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import warnings

from tqdm import tqdm
from torch.utils.data import DataLoader
from torch import Tensor
from einops import rearrange
from typing import Callable, List, Optional, Union
from params import Params, ROOT
from generators import TrainGenerator_MultiSpecies, ValGenerator_MultiSpecies, TestGenerator_SingleSpecies
from sklearn.metrics import average_precision_score, roc_auc_score, confusion_matrix, log_loss

2025-04-04 12:22:49.947720: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-04 12:22:49.959871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743783769.972774  350708 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743783769.976469  350708 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-04 12:22:49.991763: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [4]:
# gReLU "Genomics" Zoo!
import grelu.resources

from grelu.model.blocks import ChannelTransformBlock, LinearBlock
from grelu.model.layers import AdaptivePool
from grelu.model.layers import (
    Activation,
    Attention,
    ChannelTransform,
    Crop,
    Dropout,
    Norm,
    Pool,
)

  from .autonotebook import tqdm as notebook_tqdm


## Define helper functions

In [None]:
def get_model_files(tf, test_species):
    # This function returns the filepath where the model for a given
    # TF, training species, and run is saved.
    # By default, the file for the best model across all training epochs
    # is returned and we grab the latest trained model.

    model_path = ROOT + "/".join(["/models", tf, test_species + "_tested", "MORALE/"])
    feature_extractor_suffix = ".feature_extractor.pt"

    # get all files that match the prefix and suffix
    feature_extractor_files = [f for f in os.listdir(model_path) if f.endswith(feature_extractor_suffix)]
    
    # sort files and return the one that is most recent
    latest_feature_extractor_file   = max([model_path + f for f in feature_extractor_files], key=os.path.getctime)

    return latest_feature_extractor_file

def get_embedding_file(tf, test_species):
    preds_root = ROOT + "/embeddings"
    os.makedirs(preds_root, exist_ok=True)
    return f"{preds_root}/MORALE_{tf}_{test_species}-tested.embedding"

def get_label_file(tf, test_species):
    preds_root = ROOT + "/embeddings"
    os.makedirs(preds_root, exist_ok=True)
    return f"{preds_root}/MORALE_{tf}_{test_species}-tested.labels"

## Load in model

In [6]:
'''
Our baseline model built off of code from the gReLU model zoo. Anecdotally,
I find this inital convolutional block we have been using performs quite well,
so we port it over and add the bidirectional GRU here, instead of the LSTM
we use for the two-species model.
'''

class ConvHead(nn.Module):
    """
    A 1x1 Conv layer that transforms the the number of channels in the input and then
    optionally pools along the length axis.

    Args:
        n_tasks: Number of tasks (output channels)
        in_channels: Number of channels in the input
        norm: If True, batch normalization will be included.
        act_func: Activation function for the convolutional layer
        pool_func: Pooling function.
        dtype: Data type for the layers.
        device: Device for the layers.
    """

    def __init__(
        self,
        n_tasks: int,
        in_channels: int,
        act_func: Optional[str] = None,
        pool_func: Optional[str] = None,
        norm: bool = False,
        dtype=None,
        device=None,
    ) -> None:
        super().__init__()
        # Save all params
        self.n_tasks = n_tasks
        self.in_channels = in_channels
        self.act_func = act_func
        self.pool_func = pool_func
        self.norm = norm

        # Create layers
        self.channel_transform = ChannelTransformBlock(
            self.in_channels,
            self.n_tasks,
            act_func=self.act_func,
            norm=self.norm#,
            # dtype=dtype,
            # device=device
        )
        self.pool = AdaptivePool(self.pool_func)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x : Input data.
        """
        x = self.channel_transform(x)
        x = self.pool(x)
        return x

class LinearBlock(nn.Module):
    """
    Linear layer followed by optional normalization,
    activation and dropout.

    Args:
        in_len: Length of input
        out_len: Length of output
        act_func: Name of activation function
        dropout: Dropout probability
        norm: If True, apply layer normalization
        bias: If True, include bias term.
        dtype: Data type of the weights
        device: Device on which to store the weights
    """

    def __init__(
        self,
        in_len: int,
        out_len: int,
        act_func: str = "relu",
        dropout: float = 0.0,
        norm: bool = False,
        bias: bool = True,
        dtype=None,
        device=None,
    ) -> None:
        super().__init__()

        self.norm = Norm(
            func="layer" if norm else None, in_dim=in_len, dtype=dtype, device=device
        )
        self.linear = nn.Linear(in_len, out_len, bias=bias, dtype=dtype, device=device)
        self.dropout = Dropout(dropout)
        self.act = Activation(act_func)

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass

        Args:
            x : Input tensor of shape (N, C, L)

        Returns:
            Output tensor
        """
        x = self.norm(x)
        x = self.linear(x)
        x = self.dropout(x)
        x = self.act(x)
        return x
    
class FeedForwardBlock(nn.Module):
    """
    2-layer feed-forward network. Can be used to follow layers such as GRU and attention.

    Args:
        in_len: Length of the input tensor
        dropout: Dropout probability
        act_func: Name of the activation function
        kwargs: Additional arguments to be passed to the linear layers
    """

    def __init__(
        self,
        in_len: int,
        dropout: float = 0.0,
        act_func: str = "relu",
        **kwargs,
    ) -> None:
        super().__init__()
        self.dense1 = LinearBlock(
            in_len,
            in_len * 2,
            norm=True,
            dropout=dropout,
            act_func=act_func,
            bias=True,
            **kwargs,
        )
        self.dense2 = LinearBlock(
            in_len * 2,
            in_len,
            norm=False,
            dropout=dropout,
            act_func=None,
            bias=True,
            **kwargs,
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass

        Args:
            x : Input tensor of shape (N, C, L)

        Returns:
            Output tensor
        """
        x = self.dense1(x)
        x = self.dense2(x)
        return x
    
class GRUBlock(nn.Module):
    """
    Stacked bidirectional GRU layers followed by a feed-forward network.

    Args:
        in_channels: The number of channels in the input
        n_layers: The number of GRU layers
        gru_hidden_size: Number of hidden elements in GRU layers
        dropout: Dropout probability
        act_func: Name of the activation function for feed-forward network
        norm: If True, include layer normalization in feed-forward network.
        dtype: Data type of the weights
        device: Device on which to store the weights
    """

    def __init__(
        self,
        in_channels: int,
        n_layers: int = 1,
        dropout: float = 0.0,
        act_func: str = "relu",
        norm: bool = False,
        dtype=None,
        device=None,
    ) -> None:
        super().__init__()

        self.gru = nn.GRU(
            input_size=in_channels,
            hidden_size=in_channels,
            dropout=dropout,
            bidirectional=True,
            batch_first=True,
            num_layers=n_layers,
            dtype=dtype,
            device=device,
        )
        self.ffn = FeedForwardBlock(
            in_len=in_channels,
            dropout=dropout,
            act_func=act_func,
            dtype=dtype,
            device=device,
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass

        Args:
            x : Input tensor of shape (N, C, L)

        Returns:
            Output tensor
        """
        x = rearrange(x, "b t l -> b l t")
        x = self.gru(x)[0]
        # Combine output of forward and reverse GRU
        x = x[:, :, : self.gru.hidden_size] + x[:, :, self.gru.hidden_size :]
        x = self.ffn(x)
        x = rearrange(x, "b l t -> b t l")
        return x

In [7]:
class FeatureExtractor(nn.Module):
    def __init__(self, params):
        super(FeatureExtractor, self).__init__()
        self.conv1          = nn.Conv1d(in_channels=4, out_channels=params.convfilters, kernel_size=params.filtersize, padding="same")
        self.pool           = nn.MaxPool1d(kernel_size=params.pool_size+1, stride=params.strides+1, padding=params.pool_size // 2)
        self.gru_tower      = GRUBlock(
            in_channels=params.convfilters,
            n_layers=1,
            dropout= 0.0,
            act_func="relu",
            norm=False,
            device=None,
            dtype=None
        )
        self.pooled_embedding=ConvHead(
            n_tasks=(params.lstmnodes*2)-1,
            in_channels=params.convfilters,
            pool_func="avg",
            act_func=None,
            norm=False,
            dtype=None,
            device=None
        )
    
    def forward(self, x):
        x = x.transpose(1, 2)                   # -> [B, C, S]
        x = self.conv1(x)
        x = torch.nn.functional.relu(x)
        x = self.pool(x)
        embedding = self.gru_tower(x)
        pooled_embedding = self.pooled_embedding(embedding)
        return pooled_embedding

# `Main`

In [8]:
SAVE = False

seed = 1182024

torch.manual_seed(seed)
 
np.random.seed(seed)

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"\nUsing device: {device}")


Using device: cuda


In [10]:
args        = ["HNF6", "hg38"]
tf, target  = args
params      = Params(args = ["Testing", tf, target], verbose=True)

{'bindingtrainnegfiles': {'canFam6': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/canFam6/HNF6/train_neg_shuf.bed',
                          'hg38': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/hg38/HNF6/train_neg_shuf.bed',
                          'mm10': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/mm10/HNF6/train_neg_shuf.bed',
                          'rheMac10': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/rheMac10/HNF6/train_neg_shuf.bed',
                          'rn7': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/rn7/HNF6/train_neg_shuf.bed'},
 'bindingtrainposfiles': {'canFam6': '/net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/data/canFam6

In [11]:
feature_extractor = FeatureExtractor(params)
feature_extractor = feature_extractor.to(device)
feature_params = sum(p.numel() for p in feature_extractor.parameters())
print(f"Feature Extractor Architecture:\n{feature_extractor}\n")

Feature Extractor Architecture:
FeatureExtractor(
  (conv1): Conv1d(4, 240, kernel_size=(20,), stride=(1,), padding=same)
  (pool): MaxPool1d(kernel_size=16, stride=16, padding=7, dilation=1, ceil_mode=False)
  (gru_tower): GRUBlock(
    (gru): GRU(240, 240, batch_first=True, bidirectional=True)
    (ffn): FeedForwardBlock(
      (dense1): LinearBlock(
        (norm): Norm(
          (layer): LayerNorm((240,), eps=1e-05, elementwise_affine=True)
        )
        (linear): Linear(in_features=240, out_features=480, bias=True)
        (dropout): Dropout(
          (layer): Identity()
        )
        (act): Activation(
          (layer): ReLU()
        )
      )
      (dense2): LinearBlock(
        (norm): Norm(
          (layer): Identity()
        )
        (linear): Linear(in_features=480, out_features=240, bias=True)
        (dropout): Dropout(
          (layer): Identity()
        )
        (act): Activation(
          (layer): Identity()
        )
      )
    )
  )
  (pooled_embed

In [12]:
total_params = feature_params
print(f"Total number of parameters: {total_params}")

Total number of parameters: 960303


In [None]:
feature_extractor_file  = get_model_files(tf, target)
embedding_file          = get_embedding_file(tf, target)
label_file              = get_label_file(tf, target)

In [None]:
print(f"Loading feature extractor from {feature_extractor_file}\n")
feature_extractor.load_state_dict(torch.load(feature_extractor_file))

Loading feature extractor from /net/talisker/home/benos/mae117/Documents/research/dennis/domain_adaptation/MORALE/multi-species/models/HNF6/hg38_tested/MORALE/2025-03-06_15-35-48.feature_extractor.pt



  feature_extractor.load_state_dict(torch.load(feature_extractor_file))


<All keys matched successfully>

* Create data generator (test, on the single, target species)

In [15]:
def test_collate(batch):
    data    = torch.cat([dict_item['sequence'] for dict_item in batch]).float()
    label   = torch.stack([dict_item['label'] for dict_item in batch]).int()
    
    return {
        "sequence": data,
        "label": label
    }

batch_size  = 10000
teg_ss      = TestGenerator_SingleSpecies(params=params, percent_to_batch=1.0)

test_loader = DataLoader(
    dataset=teg_ss,
	batch_size=batch_size,
    pin_memory=True,
    collate_fn=test_collate
)

* loop over test data and save embeddings

In [None]:
seed        = 1182024

torch.manual_seed(seed)

np.random.seed(seed)

  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
test_bar    = tqdm(enumerate(test_loader), total=len(test_loader))

with torch.no_grad():

    print(f"Generating predictions for {target}-on-{target}.\n")

    # Turn on evaluation mode
    feature_extractor.eval()

    # A list to store the embeddings
    batch_embeddings_list       = []
    batch_labels_list           = []
    for batch_idx, data in test_bar:
        curr_data = torch.tensor(data['sequence'], dtype=torch.float32).to(device)
        batch_embeddings_list.append(feature_extractor(curr_data).squeeze().detach().cpu())
        batch_labels_list.extend(data['label'].detach().cpu())

    if SAVE:
        # Combine all embeddings and save
        all_embedding = torch.cat(batch_embeddings_list, dim=0).detach().cpu().numpy()
        np.save(embedding_file, all_embedding)

        # Combine all labels and save
        all_labels = np.array(batch_labels_list, dtype=np.float32).T
        np.save(label_file, all_labels)

Generating predictions for hg38-on-hg38.



  curr_data = torch.tensor(data['sequence'], dtype=torch.float32).to(device)
  return F.conv1d(
100%|██████████| 200/200 [14:10<00:00,  4.25s/it]  


----