### Question 2.1

#### Dataset

In [1]:
from utils import load_rnacompete_data

In [2]:

ds_train = load_rnacompete_data("RBFOX1", split="train")
ds_val = load_rnacompete_data("RBFOX1", split="val")
ds_test = load_rnacompete_data("RBFOX1", split="test")

Loading Metadata from metadata.xlsx...
  > Metadata loaded in 0.10 seconds.
Loading Data from norm_data.txt...


  warn(msg)


  > Data Matrix loaded in 36.73 seconds.
Saving processed data to data/RBFOX1_train_data.pt...
Loading Metadata from metadata.xlsx...
  > Metadata loaded in 0.03 seconds.
Loading Data from norm_data.txt...


  warn(msg)


  > Data Matrix loaded in 35.63 seconds.
Saving processed data to data/RBFOX1_val_data.pt...
Loading Metadata from metadata.xlsx...
  > Metadata loaded in 0.02 seconds.
Loading Data from norm_data.txt...


  warn(msg)


  > Data Matrix loaded in 34.81 seconds.
Saving processed data to data/RBFOX1_test_data.pt...


##### CNN Implementation

Model definition

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Simple CNN model inspired by DeepBind (Alipanahi et al, 2015)
class CNN(nn.Module):
    
    def __init__(self, motif_len=24, num_filters=64, hidden=32, dropout=0.5):
        super().__init__()
        self.conv = nn.Conv2d(1, num_filters, kernel_size=(motif_len, 4))
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(num_filters, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, x):
        x = F.relu(self.conv(x).squeeze(-1))   # (B, C, L')
        x = torch.max(x, dim=2).values         # global max pool -> (B, C)
        x = self.drop(x)
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)                        # linear output
        return x

        

In [4]:
x = torch.rand(1,1,41,4)
model = CNN().to("cpu")
out = model(x)
print(out.shape)

torch.Size([1, 1])


Training loop

In [5]:

import pickle
from torch.utils.data import DataLoader
from utils import masked_mse_loss, masked_spearman_correlation, configure_seed


def train(model, epochs, batch_size, lr, ds_train, ds_val, checkpoint_path, verbose=True, seed=None):
    """
    Trains model with specified hyperparams on given train and validation datasets.
    Saves model checkpoint to specified path. 
    """
    
    if seed is not None:
        configure_seed(seed)
        
    # Data loaders
    train_loader = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False)

    # Optimizer
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # Save best state
    best_state_dict = {}
    best_val_corr = 0.0
    best_epoch = 0

    # Training loop 
    for epoch in range(epochs):
        
        train_loss = 0.0
        train_corr = 0.0
        
        # Set model to training mode
        model.train()
            
        # Loop over batches
        for x, y, mask in train_loader:
            
            # Reset optimizer
            optim.zero_grad()
            
            # Add channel dimension to x
            x = x.unsqueeze(1)
            
            # Forward pass
            y_hat = model(x)
            
            # Loss
            loss = masked_mse_loss(y_hat, y, mask)
            train_loss += loss.item()
            train_corr += masked_spearman_correlation(y_hat, y, mask)
            
            # Backward pass
            loss.backward()
            
            # Optimizer step
            optim.step()
            
        
        # Average  
        train_loss /= len(train_loader)
        train_corr /= len(train_loader)
        
        # Set model to evalution mode
        model.eval()
        
        # Validation
        val_corr = 0.0
        for x, y, mask in val_loader:
            
            # Add channel dimension to x 
            x = x.unsqueeze(1)
            
            # Compute spearman correlation
            val_corr += masked_spearman_correlation( model(x), y, mask)
            
        # Average
        val_corr /= len(val_loader)
        
        # Update best state
        if val_corr > best_val_corr:
            best_val_corr = val_corr
            best_state_dict = model.state_dict()
            best_epoch = epoch
            
            # Save it to file
            with open(checkpoint_path,"wb") as file:
                pickle.dump(best_state_dict, file)
            
        if verbose:
            print(f"Epoch: {epoch}, Train loss: {train_loss}, Val corr: {val_corr }")
      
    if verbose:  
        print(f"Best model at epoch {best_epoch} with validation spearman correlation {best_val_corr}")
    
    return best_val_corr


In [6]:

# Device
device = "cpu"

# Model 
model = CNN().to(device)

# Train
train(model, 
      epochs=30, 
      batch_size=256, 
      lr=1e-3,
      ds_train=ds_train,
      ds_val=ds_val,
      checkpoint_path="q2_cnn_best.model")

Epoch: 0, Train loss: 0.9461544831172225, Val corr: 0.4329873025417328
Epoch: 1, Train loss: 0.7004308587518232, Val corr: 0.4810325801372528
Epoch: 2, Train loss: 0.6235661557245634, Val corr: 0.4916272759437561
Epoch: 3, Train loss: 0.6062124314611723, Val corr: 0.49794822931289673
Epoch: 4, Train loss: 0.5868577840947978, Val corr: 0.5055224299430847
Epoch: 5, Train loss: 0.5856072598332119, Val corr: 0.5067200660705566
Epoch: 6, Train loss: 0.5817493397139744, Val corr: 0.5134938359260559
Epoch: 7, Train loss: 0.5664813454690283, Val corr: 0.5118592381477356
Epoch: 8, Train loss: 0.5593826341534167, Val corr: 0.5156391263008118
Epoch: 9, Train loss: 0.5562774644922515, Val corr: 0.513916552066803
Epoch: 10, Train loss: 0.5538510314191368, Val corr: 0.5178666114807129
Epoch: 11, Train loss: 0.5540623706357864, Val corr: 0.5205375552177429
Epoch: 12, Train loss: 0.5545746827947682, Val corr: 0.5219163298606873
Epoch: 13, Train loss: 0.5487678908700968, Val corr: 0.5255662202835083
Ep

tensor(0.5372)

Evaluate on test dataset

In [7]:

# Evaluate on test dataset
model = CNN().to(device)

# Data loader for test dataset
test_loader = DataLoader(ds_test, len(ds_test), shuffle=False)

with open("q2_cnn_best.model","rb") as file:
    model.load_state_dict(pickle.load(file))
    
test_corr = 0.0
for x, y, mask in test_loader:
        
    # Add channel dimension to x 
    x = x.unsqueeze(1)
    
    # Compute spearman correlation
    test_corr = masked_spearman_correlation( model(x), y, mask)
    
    # Break (single batch)
    break

print(f"Spearman correlation on test set is {test_corr}")
    


Spearman correlation on test set is 0.45878487825393677


Hyperparameters tuning

##### RNN Implementation

Model definition

In [8]:

import torch
import torch.nn as nn 
import torch.nn.functional as F 

# Simple RNN inspired by DeeperBind ( ..., 2016)
class RNN(nn.Module):
    
    def __init__(
        self,
        seq_length: int = 41,          # kept for API compatibility; not strictly needed
        motif_len: int = 11,           # paper commonly uses motif length ~11
        conv_out_ch: int = 32,         # number of motif detectors / filters
        lstm_hidden_size: int = 128,
        lstm_layers: int = 2,
        dropout: float = 0.5
    ):
        super().__init__()

        # Single convolutional motif detector bank (spans full alphabet width=4)
        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=conv_out_ch,
            kernel_size=(motif_len, 4)
        )

        # Stacked LSTM over the sequence of conv features
        # batch_first=True so input/output is (B, T, C)
        self.lstm = nn.LSTM(
            input_size=conv_out_ch,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0.0
        )

        # Small MLP head (at most one hidden layer, as described in the paper)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(lstm_hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Expected x shape: (B, 1, L, 4)  (one-hot sequence with channel dim)
        Returns: (B, 1)
        """

        # Conv + ReLU -> (B, C, T, 1) then squeeze -> (B, C, T)
        z = F.relu(self.conv(x).squeeze(-1))

        # Prepare for LSTM: (B, T, C)
        v = z.transpose(1, 2).contiguous()

        # LSTM over positions
        out, _ = self.lstm(v)          # out: (B, T, H)

        # "Last unrolled LSTM block makes the final decision"
        h = out[:, -1, :]              # (B, H)

        # MLP head (linear output for regression)
        h = self.dropout(h)
        h = F.relu(self.fc1(h))
        h = self.dropout(h)
        y = self.fc2(h)                # (B, 1)

        return y



In [None]:
x = torch.rand(1,1,41,4)
model = RNN().to("cpu")

out = model(x)

Training loop

In [11]:
# Device 
device = "cpu"

# Model
model = RNN().to(device)

train(model, 20, 512, 1e-3, ds_train, ds_val, "q2_rnn_best.model")

Epoch: 0, Train loss: 1.003463015669868, Val corr: -0.2452460527420044
Epoch: 1, Train loss: 0.9938512233199266, Val corr: 0.20198918879032135
Epoch: 2, Train loss: 0.9459036129807669, Val corr: 0.27277666330337524
Epoch: 3, Train loss: 0.8337126538236305, Val corr: 0.4138477146625519
Epoch: 4, Train loss: 0.6838430880554138, Val corr: 0.4574824571609497
Epoch: 5, Train loss: 0.6046485933204176, Val corr: 0.4754132926464081
Epoch: 6, Train loss: 0.5500769388108027, Val corr: 0.4974198043346405
Epoch: 7, Train loss: 0.5298565075826392, Val corr: 0.5067995190620422
Epoch: 8, Train loss: 0.5172585751959886, Val corr: 0.5115245580673218
Epoch: 9, Train loss: 0.5166842547989396, Val corr: 0.5152186155319214
Epoch: 10, Train loss: 0.4981591330949592, Val corr: 0.5267212986946106
Epoch: 11, Train loss: 0.49201246804345855, Val corr: 0.5272833108901978
Epoch: 12, Train loss: 0.4811716394291984, Val corr: 0.5358620882034302
Epoch: 13, Train loss: 0.4801191819644479, Val corr: 0.5370001792907715

tensor(0.5584)