In [8]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
#import pyBigWig
import glob
from tqdm import tqdm
from datetime import datetime
import random
import os
from IPython.display import clear_output


## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [9]:
SIGNALS_CNN = ['DNase', 'H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K36me3']   # List of signal tracks to be used
N_SIGNALS_CNN = len(SIGNALS_CNN)
SIGNAL_CNN_WINDOW = 1e4 # Window size in base pairs
N_BINS = 100 # Number of base pairs per bin
PREPROCESSED_BASE_PATH = "Data/preprocessed"

In [10]:
from scipy import signal


half_window = int(SIGNAL_CNN_WINDOW // 2)
chromstrs = []
ranges = []


def get_signals_bins(df: pd.DataFrame, cell_type: str, n_bins: int=N_BINS) -> np.ndarray:
    df["neg_strand"] = mask = df['strand'] == '-'
    df.loc[mask, ['TSS_start', 'TSS_end']] = df.loc[mask, ['TSS_end', 'TSS_start']].values
    df["center"] = ((df["TSS_start"] + df["TSS_end"]) // 2).astype(int)
    df["window_start"] = df["center"] - half_window
    df["window_end"] = df["center"] + half_window

    bins_signal_gene = np.zeros((len(df), len(SIGNALS_CNN), N_BINS))
    for i, signal in enumerate(SIGNALS_CNN):
        print("Processing signal:", signal, f"{i+1}/{len(SIGNALS_CNN)}")
        fname = glob.glob(f"Data/bigwig/{signal}-bigwig/{cell_type}*")[0]
        bw = pyBigWig.open(fname)
        for j, (chromstr, window_start, window_end, neg_strand) in enumerate(tqdm(df[["chr", "window_start", "window_end", "neg_strand"]].itertuples(index=False), total=len(df))):
            bins = bw.stats(chromstr, window_start, window_end, type="mean", nBins=n_bins)
            if neg_strand:
                bins = bins[::-1]
            bins_signal_gene[j, i] = bins

        bw.close()
            
    return bins_signal_gene

In [12]:
cell_types_per_set = {
    "train": ["X1", "X2"],
    "val": ["X1", "X2"],
    "test": ["X3"]
}

for train_test_val, cell_types in cell_types_per_set.items():
    print(f"Processing {train_test_val} set.")
    for cell_type in cell_types:
        print(f"Processing cell type: {cell_type}")
        df = pd.read_csv(f'./Data/CAGE-train/{cell_type}_{train_test_val}_info.tsv', sep='\t', usecols=[0,1,4,5,6])
        signal_bins = get_signals_bins(df, cell_type, n_bins=N_BINS)
        np.save(f'{PREPROCESSED_BASE_PATH}/cnn_input_{cell_type}_{train_test_val}_{N_BINS}.npy', signal_bins)

Processing train set.
Processing cell type: X1
Processing signal: DNase 1/5


IndexError: list index out of range

Group the data for training:

In [13]:
for train_test_val, cell_types in cell_types_per_set.items():
    all_data = []
    for cell_type in cell_types:
        data = np.load(f'{PREPROCESSED_BASE_PATH}/cnn_input_{cell_type}_{train_test_val}_{N_BINS}.npy')
        all_data.append(data)
    grouped = np.concatenate(all_data, axis=0)
    np.save(f"{PREPROCESSED_BASE_PATH}/cnn_input_{train_test_val}.npy", grouped)    

FileNotFoundError: [Errno 2] No such file or directory: 'Data/preprocessed/cnn_input_X1_train_100.npy'

Do the same with the outputs:

In [14]:
for train_test_val, cell_types in cell_types_per_set.items():
    if train_test_val == "test":
        continue
    all_y = []
    for cell_type in cell_types:
        y_df = pd.read_csv(f"Data/CAGE-train/{cell_type}_{train_test_val}_y.tsv", sep="\t")
        all_y.append(y_df["gex"].to_numpy())
    np.save(f"{PREPROCESSED_BASE_PATH}/cnn_y_{train_test_val}.npy", np.concatenate(all_y, axis=0))       

FileNotFoundError: [Errno 2] No such file or directory: 'Data/preprocessed/cnn_y_train.npy'

## Work Package 1.2 - Model Building

### Define the model

The model used to estimate the gene expression from binned signal is the following:

In [15]:
class GeneCNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(N_SIGNALS_CNN, 32, kernel_size=3, padding="same"),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(32, 16, kernel_size=3, padding="same"),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Conv1d(16, 1, kernel_size=3, padding="same"),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.LazyLinear(1),
        )

    def forward(self, x):
        # x shape: (batch, features=2, genes)
        out = self.net(x)
        return out.squeeze(1)  # shape: (batch, genes)

### Train the model
Since we are scoring with Spearman correlation, by which only the rank of the predicted elements is considered, we train our model to predict the $\log(y + 1)$. This doesn't affect the score, since $\log$ is monotonically increasing, and will help the training, given that some very high values in $y$ are difficult to model.

In [16]:
class BigWigDataset(Dataset):
    def __init__(self, X, y):
        self.missing_val = np.any(np.isnan(X) | np.isinf(X), axis=(1,2))
        X = X[~self.missing_val, :, :]
        y = y[~self.missing_val]
        self.X = torch.from_numpy(X).float()
        self.y_orig = torch.from_numpy(y).float()
        self.y = torch.log1p(self.y_orig)  # log(y + 1) transformation
    
    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [17]:
BATCH_SIZE = 2056
EPOCHS = 200
LEARNING_RATE = 0.001

SAVE_BASED_ON_SPEARMAN = True

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x26b24d3eb50>

In [18]:
x_train = np.load(f"{PREPROCESSED_BASE_PATH}/cnn_input_train.npy")
y_train = np.load(f"{PREPROCESSED_BASE_PATH}/cnn_y_train.npy")

x_val = np.load(f"{PREPROCESSED_BASE_PATH}/cnn_input_val.npy")
y_val = np.load(f"{PREPROCESSED_BASE_PATH}/cnn_y_val.npy")

training_loader = DataLoader(
    BigWigDataset(x_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True
)

x_val_torch, y_val_torch = BigWigDataset(x_val, y_val)[:]

FileNotFoundError: [Errno 2] No such file or directory: 'Data/preprocessed/cnn_input_train.npy'

In [19]:
net = GeneCNNModel()
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
best_val_loss = float('inf')
best_spearman_corr = float('-inf')
spearman_model_path = None
val_model_path = None

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
for epoch in range(EPOCHS):
    best_loss = float("inf")
    last_loss = 0

    net.train()

    running_loss = .0
    print(f"Epoch: {epoch + 1}/{EPOCHS}")
    
    for i, (X, y) in tqdm(enumerate(training_loader), total=len(training_loader)):            
        optimizer.zero_grad()
        pred = net(X)
        loss = loss_fn(pred.flatten(), y)
        
        loss.backward()

        optimizer.step()
        batch_loss = loss.item()
        running_loss += batch_loss
        best_loss = min(batch_loss, best_loss)
    
    

    avg_loss = running_loss / (i + 1)

    print(f"Training loss: best {best_loss} and avg {avg_loss}")

    net.eval()

    running_vloss = .0

    pred_val = net(x_val_torch)
    loss = loss_fn(pred_val.flatten(), y_val_torch)
    val_loss = loss.item()
    
    print('Validation loss: {}'.format(val_loss))

    spearman_corr = spearmanr(pred_val.detach().numpy(), y_val_torch.detach().numpy()).statistic
    print("Spearman's correlation:", spearman_corr)
    
    def save_model(criterion: str, model_path: str | None) -> str:
        if model_path is not None and os.path.exists(model_path):
            os.remove(model_path)

        model_path = "models/cnn_{}_{}_BS{}_LR{}_BIN{}_W{}_EPOCH{}".format(timestamp, criterion, BATCH_SIZE, LEARNING_RATE, int(N_BINS), int(SIGNAL_CNN_WINDOW), epoch + 1)

        print(f"Saving as best model in {model_path}")
        torch.save(net, model_path)

        return model_path
    
    
    if spearman_corr > best_spearman_corr:
        best_spearman_corr = spearman_corr
        if SAVE_BASED_ON_SPEARMAN:
            spearman_model_path = save_model("SPEARMAN{:.4f}".format(best_spearman_corr * 100), spearman_model_path)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        val_model_path = save_model("VALLOSS{:.4f}".format(best_val_loss), val_model_path)

    if (epoch + 1) % 10 == 0:
        clear_output(wait=True)

print("Training complete. Best avg validation loss:", best_val_loss)

Epoch: 1/200


NameError: name 'training_loader' is not defined

## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:

x_test = np.load(f"{PREPROCESSED_BASE_PATH}/cnn_input_test.npy")

if SAVE_BASED_ON_SPEARMAN:
    cnn_net = torch.load(spearman_model_path, weights_only=False) # Load the best model based on Spearman correlation
else:
    cnn_net = torch.load(val_model_path, weights_only=False) # Load the best model based on validation loss

# Or manually specify the model path:
# cnn_net = torch.load("models/cnn_your_model_path_here", weights_only=False)

cnn_net.eval()
y_test_pred = cnn_net(torch.tensor(x_test, dtype=torch.float32)).detach().numpy()

test_genes = pd.read_csv('./Data/CAGE-train/X3_test_info.tsv', sep='\t', usecols=[0,1])
test_genes['gex_predicted'] = np.exp(y_test_pred.flatten()) - 1

#### Store Predictions in the Required Format

In [None]:
# Store predictions in a ZIP. 
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = 'Data/submission'
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "Laffranchi_Paolo_Project1.zip"
save_path = f'{save_dir}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)