During the creation of the multi-class CNN model, generative AI was used for debugging and improving the code.

In [None]:
!pip install d2l==1.0.3
! pip install biopython
!pip install wandb -qU
!pip install focal_loss_torch

Collecting focal_loss_torch
  Using cached focal_loss_torch-0.1.2-py3-none-any.whl.metadata (2.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->focal_loss_torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->focal_loss_torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->focal_loss_torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->focal_loss_torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->focal_loss_torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (fro

In [None]:
import os
! git clone https://git.wur.nl/bioinformatics/grs34806-deep-learning-project-data.git
os.chdir("grs34806-deep-learning-project-data")

Cloning into 'grs34806-deep-learning-project-data'...
remote: Enumerating objects: 21, done.[K
remote: Total 21 (delta 0), reused 0 (delta 0), pack-reused 21 (from 1)[K
Receiving objects: 100% (21/21), 8.74 MiB | 5.33 MiB/s, done.


In [None]:
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

import wandb

In [None]:
# Preferred compute device: the CUDA GPU when one is available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Seed the Python, NumPy and PyTorch random number generators so that a
# fresh "Restart & Run All" starts from the same random state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# The trailing ';' keeps the returned torch Generator repr out of the cell output.
torch.manual_seed(SEED);


<torch._C.Generator at 0x788690294ab0>

In [None]:

def read_labels(seqfile, posfiles):
    """
    Load protein sequences and derive one integer class label per protein.

    seqfile: path to a text file with one whitespace-separated
             "protein_id sequence" pair per line; blank lines are skipped.
    posfiles: list of GO_*.annotprot paths, each holding one protein_id per
              line (blank lines are skipped). A file's base name without its
              extension is used as the GO term.
    Returns: (seqs, labels, K) with seqs a list of str, labels a list of int
             in 0..K-1 and K = len(posfiles) + 1. Label 0 means the protein
             is not listed in any posfile; otherwise the label is the 1-based
             position of its GO term in posfiles.
    Raises: ValueError if a non-blank seqfile line does not contain exactly
            two fields.

    A protein listed under several different GO terms keeps the term of the
    first posfile that mentions it; the number of such proteins is printed.
    """
    prot_ids, seqs = [], []
    with open(seqfile) as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank / whitespace-only lines (e.g. a trailing newline).
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{seqfile}:{lineno}: expected 'protein_id sequence', "
                    f"got {len(fields)} fields"
                )
            pid, seq = fields
            prot_ids.append(pid)
            seqs.append(seq)

    go_terms = [os.path.splitext(os.path.basename(pf))[0] for pf in posfiles]

    # Term -> 1-based label. setdefault keeps the first position if two
    # posfiles share a base name, matching list.index semantics.
    term2label = {}
    for position, term in enumerate(go_terms, start=1):
        term2label.setdefault(term, position)

    pid2go = {}
    multi_pids = set()
    for pf, term in zip(posfiles, go_terms):
        with open(pf) as g:
            for line in g:
                pid = line.strip()
                if not pid:
                    continue
                first_term = pid2go.setdefault(pid, term)
                # Only a *different* term is a second annotation; a protein
                # repeated inside one file is not counted.
                if first_term != term:
                    multi_pids.add(pid)
    print(f"{len(multi_pids)} proteins had multiple annotations; only first used")

    labels = [term2label[pid2go[pid]] if pid in pid2go else 0 for pid in prot_ids]
    return seqs, labels, len(go_terms) + 1


In [None]:
# Vocabulary: the 20 amino-acid letters map to indices 0..19.
AA2IDX = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))
# Index 20 is used both for padding and for any character outside the vocabulary.
PAD_IDX = len(AA2IDX)  # 20

def encode(sequences, maxlen):
    """
    Turn a list of sequence strings into an int64 tensor of shape
    (len(sequences), maxlen).

    Each sequence is truncated to maxlen characters; every character is
    replaced by its AA2IDX index (PAD_IDX when it is not in the vocabulary)
    and the remainder of the row is left filled with PAD_IDX.
    """
    encoded = np.full((len(sequences), maxlen), PAD_IDX, dtype=np.int64)
    for row, sequence in zip(encoded, sequences):
        clipped = sequence[:maxlen]
        # `row` is a view into `encoded`, so this writes into the matrix.
        row[:len(clipped)] = [AA2IDX.get(residue, PAD_IDX) for residue in clipped]
    return torch.from_numpy(encoded)

In [None]:
class MultiClassCnn(nn.Module):
    """
    1-D convolutional classifier over integer-encoded sequences.

    Tokens are embedded, passed through three Conv1d -> BatchNorm1d -> ReLU
    stages (the first two followed by max-pooling and dropout), max-pooled
    over the whole sequence axis and mapped to class logits by a linear layer.
    """

    def __init__(self, num_aa, emb_dim, num_classes, dropout):
        """
        num_aa: number of real tokens; the embedding table gets num_aa + 1
                rows and index num_aa is the padding token.
        emb_dim: size of each token embedding.
        num_classes: number of output logits.
        dropout: dropout probability applied after the first two conv stages.
        """
        super().__init__()
        # The padding index is derived from num_aa (the last row of the table)
        # instead of a module-level constant, so the table size and the padding
        # row cannot drift apart and the class has no hidden global dependency.
        self.embed = nn.Embedding(num_aa + 1, emb_dim, padding_idx=num_aa)
        self.cnn = nn.Sequential(
            nn.Conv1d(emb_dim, 128, kernel_size=7, padding=3),
            nn.BatchNorm1d(128), nn.ReLU(),
            nn.MaxPool1d(3, stride=2, padding=1), nn.Dropout(dropout),

            nn.Conv1d(128, 256, kernel_size=5, padding=2),
            nn.BatchNorm1d(256), nn.ReLU(),
            nn.MaxPool1d(3, stride=2, padding=1), nn.Dropout(dropout),

            nn.Conv1d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128), nn.ReLU(),
            # Collapse the sequence axis to length 1 regardless of input length.
            nn.AdaptiveMaxPool1d(1)
        )
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        """
        x: integer tensor of token indices, shape (batch, seq_len).
        Returns: logits of shape (batch, num_classes).
        """
        x = self.embed(x)          # (batch, seq_len, emb_dim)
        x = x.permute(0, 2, 1)     # Conv1d expects (batch, channels, seq_len)
        x = self.cnn(x)            # (batch, 128, 1)
        x = x.squeeze(-1)          # (batch, 128)
        return self.classifier(x)



In [None]:
def _balanced_class_weights(targets, num_classes):
    """Return inverse-frequency weights total / (num_classes * count_c) as a float32 tensor.

    targets: 1-D tensor of integer class labels.
    A class with no examples in `targets` gets weight 0.0 instead of raising
    ZeroDivisionError.
    """
    counts = Counter(targets.tolist())
    total = sum(counts.values())
    return torch.tensor(
        [total / (num_classes * counts[c]) if counts[c] else 0.0
         for c in range(num_classes)],
        dtype=torch.float32,
    )


def _run_epoch(model, loader, loss_fn, device, optimizer=None):
    """Run one pass over `loader` and return (mean_loss, accuracy, macro_f1).

    With an optimizer the model is put in train mode and updated after every
    batch; without one it is put in eval mode and gradients are disabled.
    The mean loss is the batch-size-weighted average of the per-batch losses.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum, correct, seen = 0.0, 0, 0
    batch_preds, batch_labels = [], []

    with torch.set_grad_enabled(training):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            if training:
                optimizer.zero_grad()
            logits = model(xb)
            loss = loss_fn(logits, yb)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            seen += xb.size(0)

            batch_preds.append(preds.cpu())
            batch_labels.append(yb.cpu())

    macro_f1 = f1_score(torch.cat(batch_labels).numpy(),
                        torch.cat(batch_preds).numpy(),
                        average="macro")
    return loss_sum / seen, correct / seen, macro_f1


if __name__ == "__main__":
    # hyperparameters
    run_config = {
        "seq_file":              "expr5Tseq_filtGO_100-1000.lis",
        "pos_files":            [
                                    "GO_3A0005576.annotprot",
                                    "GO_3A0005739.annotprot",
                                    "GO_3A0007165.annotprot",
                                    "GO_3A0043066.annotprot",
                                    "GO_3A0055085.annotprot"
                                ],
        "max_length":            1000,
        "batch_size":            64,
        "lr":                    1e-3,
        "epochs":               20,
        "dropout":              0.3,
        "emb_dim":              128,
    }

    # Using the run as a context manager finishes it even when training
    # raises, so a failed cell does not leave a dangling W&B run behind.
    with wandb.init(project="deep-learning-MBF4", config=run_config) as run:
        config = run.config

        # Read & split data (stratified 70/30 so every class keeps its share)
        seqs, labels, num_classes = read_labels(config.seq_file, config.pos_files)
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=SEED)
        train_idx, test_idx = next(sss.split(seqs, labels))

        X_train = encode([seqs[i] for i in train_idx], config.max_length)
        y_train = torch.tensor([labels[i] for i in train_idx], dtype=torch.long)
        X_test  = encode([seqs[i] for i in test_idx],  config.max_length)
        y_test  = torch.tensor([labels[i] for i in test_idx], dtype=torch.long)

        train_loader = DataLoader(TensorDataset(X_train, y_train),
                                  batch_size=config.batch_size,
                                  shuffle=True)
        test_loader  = DataLoader(TensorDataset(X_test, y_test),
                                  batch_size=config.batch_size)

        # Model, loss, optimizer (reuses the notebook-wide DEVICE)
        model = MultiClassCnn(
            num_aa=len(AA2IDX),
            emb_dim=config.emb_dim,
            num_classes=num_classes,
            dropout=config.dropout
        ).to(DEVICE)

        # Weight the loss by inverse class frequency of the training split.
        class_weights = _balanced_class_weights(y_train, num_classes).to(DEVICE)
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

        # Training loop
        # NOTE(review): the hold-out split is evaluated after every epoch and
        # there is no separate validation split, so these test metrics should
        # not be used to pick an epoch or hyperparameters.
        for epoch in range(1, config.epochs + 1):
            train_loss, train_acc, train_f1 = _run_epoch(
                model, train_loader, loss_fn, DEVICE, optimizer=optimizer)
            test_loss, test_acc, test_f1 = _run_epoch(
                model, test_loader, loss_fn, DEVICE)

            run.log({
                "epoch":            epoch,
                "train_loss":       train_loss,
                "train_accuracy":   train_acc,
                "train_macro_f1":   train_f1,
                "test_loss":        test_loss,
                "test_accuracy":    test_acc,
                "test_macro_f1":    test_f1,
            })

            print(f"Epoch {epoch:2d} | "
                  f"Train loss {train_loss:.4f}, acc {train_acc:.4f}, F1 {train_f1:.4f} | "
                  f"Test  loss {test_loss:.4f}, acc {test_acc:.4f}, F1 {test_f1:.4f}")


175 proteins had multiple annotations; only first used
Epoch  1 | Train loss 1.8645, acc 0.1152, F1 0.0895 | Test  loss 1.7430, acc 0.3541, F1 0.1525
Epoch  2 | Train loss 1.7116, acc 0.1757, F1 0.1285 | Test  loss 1.6454, acc 0.3045, F1 0.1837
Epoch  3 | Train loss 1.6177, acc 0.2176, F1 0.1684 | Test  loss 1.6033, acc 0.1655, F1 0.1629
Epoch  4 | Train loss 1.5527, acc 0.1866, F1 0.1742 | Test  loss 1.5187, acc 0.4804, F1 0.2591
Epoch  5 | Train loss 1.4319, acc 0.2325, F1 0.2120 | Test  loss 1.4773, acc 0.1955, F1 0.1945
Epoch  6 | Train loss 1.3540, acc 0.2546, F1 0.2308 | Test  loss 1.4789, acc 0.3188, F1 0.2398
Epoch  7 | Train loss 1.2785, acc 0.2536, F1 0.2442 | Test  loss 1.4279, acc 0.2544, F1 0.2445
Epoch  8 | Train loss 1.1681, acc 0.2953, F1 0.2703 | Test  loss 1.6214, acc 0.3089, F1 0.2208
Epoch  9 | Train loss 1.0863, acc 0.3324, F1 0.2999 | Test  loss 1.5240, acc 0.1071, F1 0.1873
Epoch 10 | Train loss 1.0013, acc 0.3521, F1 0.3196 | Test  loss 1.7162, acc 0.1675, F1 0.

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_accuracy,▄▄▂▆▂▄▃▄▁▂▂▃▁▃▂▄█▆▅▇
test_loss,▃▂▂▂▁▁▁▂▂▃▃▂▅▃▃▄▄▅▄█
test_macro_f1,▁▂▁▅▃▄▅▄▂▂▃▄▃▃▃▅█▇▆▇
train_accuracy,▁▂▂▂▃▃▃▃▄▄▅▅▅▆▆▇▇███
train_loss,█▇▇▇▆▆▅▅▄▄▄▃▃▂▂▂▂▁▁▁
train_macro_f1,▁▂▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇███

0,1
epoch,20.0
test_accuracy,0.54568
test_loss,2.52767
test_macro_f1,0.31433
train_accuracy,0.63227
train_loss,0.39665
train_macro_f1,0.56081
