In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from src.baselines import CNNModel, CARPModel, MSModel, LinearModel
from src.torch_helpers import NamedTensorDataset
from src.datamodule import PeptideDataModule
from pytorch_lightning import Trainer
from src.torch_helpers import NoValProgressBar
from src.constants import MSConstants
C = MSConstants()

torch.manual_seed(0);

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# from src.torch_helpers import start_tensorboard
# start_tensorboard(login_node='login-2')

In [5]:
# generate negatives by producing tons of shuffled sequences
# then cluster, and only take clusters containing sufficiently many positives

# from src.cdhit import CDHIT

# def generate_negatives(sequences, num_shuffles=10, min_frac=0.1, random_state=0):
#     pos_seqs = list(sequences)
#     neg_seqs = []
#     rng = npr.RandomState(random_state)
#     for n in range(num_shuffles):
#         neg_seqs += [''.join(rng.permutation(list(s))) for s in pos_seqs]

#     seqs = np.array(pos_seqs + neg_seqs)
#     ids = np.array([1]*len(pos_seqs) + [0]*len(neg_seqs))
#     clusters = np.array(CDHIT(threshold=0.5,word_length=3).fit_predict(seqs))
#     pos_frac = pd.DataFrame([clusters,ids],index=['clusters','ids']).T.groupby('clusters').mean()['ids']
#     pos_clusters = set(pos_frac[pos_frac>min_frac].index)
    
#     pos_seqs = set(pos_seqs)
#     negatives = [s for i,s,c in zip(ids,seqs,clusters) if i==0 and c in pos_clusters and s not in pos_seqs]
    
#     rng.shuffle(negatives)
#     negatives = negatives[:len(sequences)]
    
#     assert len(negatives) == len(sequences)
    
#     return negatives

# Datasets

In [6]:
datasets = {}

### Mitochondrial targeting

In [7]:
df = pd.read_csv('./data/mitochondria_targeting.csv')
df = df[['Sequence','Mitochondrial Targeting Signal']].drop_duplicates(keep='first')
df = df.loc[df['Sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    sequence=df['Sequence'],
    x=df['Sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['Sequence'].map(lambda s: [1]*len(s)),
    y=df[['Mitochondrial Targeting Signal']].astype(np.int32).values
)

datasets['mito'] = dataset

len(dataset)

5349

### Cdc28 binding

In [8]:
df = pd.read_csv('./data/cdc28_binding.csv')
df = df[['Sequence','Cdc28 Binding']].drop_duplicates(keep='first')
df = df.loc[df['Sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    sequence=df['Sequence'],
    x=df['Sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['Sequence'].map(lambda s: [1]*len(s)),
    y=df[['Cdc28 Binding']].astype(np.int32).values
)

datasets['cdc28'] = dataset

len(dataset)

5348

### Signal peptide

In [9]:
with open('./data/train_set.fasta','r') as f:
    fasta = [l.strip() for l in f]
    df = pd.Series(fasta[::3]).str.extract(
        '>(?P<uniprot>[^\|]+)\|(?P<kingdom>[^|]+)\|(?P<type>[^|]+)\|(?P<partition>[^|]+)'
    )
    df['sequence'] = fasta[1::3]
    df['annotation'] = fasta[2::3]
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=(df['type']!='NO_SP').values[:,None].astype(int)
)

datasets['signalp'] = dataset

len(dataset)

20290

### HLA binding

In [10]:
pos_seqs = !cat ./data/mlci2012/binding_HLA-A0201.txt
neg_seqs = !cat ./data/mlci2012/nonbinding_HLA-A0201.txt
df = pd.DataFrame({
    'sequence': pos_seqs + neg_seqs,
    'hla_binding': [1]*len(pos_seqs) + [0]*len(neg_seqs)
})
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=(df['hla_binding']).values[:,None].astype(int)
)

datasets['hla_a0201'] = dataset

len(dataset)

11747

In [11]:
pos_seqs = !cat ./data/mlci2012/binding_HLA-B0702.txt
neg_seqs = !cat ./data/mlci2012/nonbinding_HLA-B0702.txt
df = pd.DataFrame({
    'sequence': pos_seqs + neg_seqs,
    'hla_binding': [1]*len(pos_seqs) + [0]*len(neg_seqs)
})
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=(df['hla_binding']).values[:,None].astype(int)
)

datasets['hla_b0702'] = dataset

len(dataset)

3627

In [12]:
pos_seqs = !cat ./data/mlci2012/binding_H2-Kb.txt
neg_seqs = !cat ./data/mlci2012/nonbinding_H2-Kb.txt
df = pd.DataFrame({
    'sequence': pos_seqs + neg_seqs,
    'hla_binding': [1]*len(pos_seqs) + [0]*len(neg_seqs)
})
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)
df = df.loc[df['sequence'].map(len)<=500]

dataset = NamedTensorDataset(
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=(df['hla_binding']).values[:,None].astype(int)
)

datasets['h2_kb'] = dataset

len(dataset)

2598

### SATPDB

In [13]:
# fns = !ls ./data/satpdb/*.fasta
# df = {}
# for fn in fns:
#     name = fn.split('/')[-1].split('.')[0]
#     with open(fn,'r') as f:
#         fasta = [l.strip() for l in f]
#     df[name] = pd.Series([1]*len(fasta[::2]),index=fasta[1::2],name=name)
#     df[name] = df[name].reset_index().drop_duplicates(subset='index',keep='first')
#     df[name] = df[name].set_index('index')[name]
# df = pd.DataFrame(df).fillna(0)
# df.index.name = 'sequence'
# df = df.reset_index()
# df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
# df = df.loc[df['sequence'].map(len)>=5]
# df = df.loc[df['sequence'].map(len)<=100]

# for name in df.columns[1:]:
#     dataset = NamedTensorDataset(
#         sequence=df['sequence'],
#         x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
#         x_mask=df['sequence'].map(lambda s: [1]*len(s)),
#         y=df[name].values[:,None].astype(int)
#     )
#     datasets[name] = dataset
#     print(name,len(dataset))

In [14]:
from torch.utils.data import Subset

npr.seed(0)

MIN_LENGTH = 5
MAX_LENGTH = 100

for d in datasets:
    idxs = [
        i 
        for i,item in enumerate(datasets[d]) 
        if (len(item['sequence'])>=MIN_LENGTH) and (len(item['sequence'])<=MAX_LENGTH)
    ]
    npr.shuffle(idxs)
    datasets[d] = Subset(datasets[d], idxs)
    print(d, len(datasets[d]))

mito 4121
cdc28 4120
signalp 20290
hla_a0201 11747
hla_b0702 3627
h2_kb 2598


# Models

In [15]:
models = {}

### Linear baseline

In [16]:
model = lambda : LinearModel(
    output_dim = 1,
    model_dim = 128,
    num_residues = len(C.alphabet),
    lr = 5e-4,
    output_weights = [(None,1),]
)
models['linear'] = model

### CNN baseline

In [17]:
model = lambda : CNNModel(
    output_dim = 1,
    model_dim = 128,
    model_depth = 3,
    kernel_size = 3,
    num_residues = len(C.alphabet),
    dropout = 0.,
    lr = 1e-4,
    output_weights = [(None,1),]
)
models['cnn'] = model

### MS pretraining

In [26]:
[last_ckpt] = !ls -t1 ./lightning_logs/ms_regularized/checkpoints/*.ckpt | head -n1
last_ckpt

'./lightning_logs/ms_regularized/checkpoints/epoch=0-step=4940.ckpt'

In [27]:
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = True,
    naive = False,
    lr = 1e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['ms_pretrained_frozen'] = model

In [28]:
# model = lambda : MSModel(
#     checkpoint = last_ckpt,
#     model_dim = 128,
#     output_dim = 1,
#     fixed_weights = False,
#     naive = False,
#     lr = 1e-4,
#     output_weights = [(None,1),],
#     max_length=100
# )
# models['ms_pretrained_finetune'] = model

### Random

In [29]:
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = False,
    naive = True,
    lr = 1e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['random_frozen'] = model

In [30]:
# model = lambda : MSModel(
#     checkpoint = last_ckpt,
#     model_dim = 128,
#     output_dim = 1,
#     fixed_weights = False,
#     naive = True,
#     lr = 1e-4,
#     output_weights = [(None,1),],
#     max_length=100
# )
# models['random_finetune'] = model

### Large language model

In [31]:
model = lambda : CARPModel(
    output_dim = 1,
    fixed_weights = True,
    max_length = 100,
    lr = 5e-4,
    output_weights = [(None,1),]
)
models['carp_pretrained_frozen'] = model

In [32]:
# model = lambda : CARPModel(
#     output_dim = 1,
#     fixed_weights = False,
#     max_length = 100,
#     lr = 1e-4,
#     output_weights = [(None,1),]
# )
# models['carp_pretrained_finetune'] = model

# need to do a proper hparam search for carp

In [None]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import DataLoader
from src.torch_helpers import zero_padding_collate
from src.cdhit import cdhit_split

seed_everything(0, workers=True)

aucs = {}

for MODEL in models.keys():
    for DATASET in datasets.keys():
        name = MODEL+'_'+DATASET
        !rm -rf ./lightning_logs/$name
# !rm -rf ./lightning_logs/version_$SLURM_JOBID

for DATASET in datasets.keys():
    dataset = datasets[DATASET]
    
    sequences = [item['sequence'] for item in dataset]
    train_val_seqs, test_seqs, train_val_dataset, test_dataset = cdhit_split(
        sequences,
        dataset,
        split=2./3,
        threshold=0.5,
        word_length=3
    )
    
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=len(test_dataset),
        collate_fn=zero_padding_collate,
        num_workers=1,
        shuffle=False,
        drop_last=False
    )
    
    dm = PeptideDataModule(
        train_val_dataset,
        batch_size=256,
        val_batch_size=-1,
        train_val_split=0.5,
        cdhit_threshold=0.5,
        cdhit_word_length=3,
        num_workers=4
    )
    dm.setup()

    for MODEL in models.keys():
        name = MODEL+'_'+DATASET
        
        torch.manual_seed(0)
        
        model = models[MODEL]()
        
        model.output_weights = [(DATASET,1)]
        
        trainer = Trainer(
            gpus=1,
            precision=32,
            callbacks=[
                NoValProgressBar(),
                EarlyStopping(
                    monitor=f'val_auc_{DATASET}',
                    mode='max',
                    patience=10
                ),
                ModelCheckpoint(
                    monitor=f'val_auc_{DATASET}', 
                    save_top_k=1
                )
            ]
        )

        trainer.fit(model, dm)
        
        metrics = trainer.test(model, test_dataloader)
        
        aucs[(DATASET,MODEL)] = metrics[0][f'test_auc_{DATASET}']
        
        print(DATASET, MODEL)

        !mv ./lightning_logs/version_$SLURM_JOBID ./lightning_logs/latest
        !mv ./lightning_logs/latest ./lightning_logs/$name

Global seed set to 0
Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 199: 100%|██████████| 6/6 [02:29<00:00, 24.94s/it, loss=0.132, v_num=1.79e+7] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.21it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8321413397789001
        test_loss           0.1232985258102417
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito linear


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 31.0 K
2 | classifier | Linear     | 33    
------------------------------------------
34.1 K    Trainable params
0         Non-trainable params
34.1 K    Total params
0.136     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 145: 100%|██████████| 6/6 [01:49<00:00, 18.29s/it, loss=0.309, v_num=1.79e+7] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8106695413589478
        test_loss           0.31384706497192383
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito cnn


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
16.8 K    Trainable params
658 K     Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 314: 100%|██████████| 6/6 [04:41<00:00, 46.89s/it, loss=0.103, v_num=1.79e+7]  


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8616129159927368
        test_loss           0.10735632479190826
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito ms_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
675 K     Trainable params
0         Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 26: 100%|██████████| 6/6 [00:35<00:00,  5.90s/it, loss=0.00673, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.64it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8039490580558777
        test_loss           0.20173271000385284
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito random_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 43: 100%|██████████| 6/6 [00:45<00:00,  7.55s/it, loss=0.0845, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8573124408721924
        test_loss           0.1063058078289032
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito carp_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 10: 100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=0.411, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_auc_cdc28         0.6054238677024841
        test_loss           0.36368176341056824
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 linear


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 31.0 K
2 | classifier | Linear     | 33    
------------------------------------------
34.1 K    Trainable params
0         Non-trainable params
34.1 K    Total params
0.136     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 19:  83%|████████▎ | 5/6 [00:15<00:03,  3.03s/it, loss=0.529, v_num=1.79e+7]



Epoch 56: 100%|██████████| 6/6 [00:44<00:00,  7.38s/it, loss=0.428, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_auc_cdc28         0.7766574025154114
        test_loss           0.42319998145103455
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 cnn


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
16.8 K    Trainable params
658 K     Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 156: 100%|██████████| 6/6 [02:20<00:00, 23.46s/it, loss=0.087, v_num=1.79e+7]  


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_auc_cdc28         0.7541563510894775
        test_loss           0.09480345994234085
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 ms_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
675 K     Trainable params
0         Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 32: 100%|██████████| 6/6 [00:39<00:00,  6.50s/it, loss=0.0137, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_auc_cdc28         0.6976600885391235
        test_loss           0.12664860486984253
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 random_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 44: 100%|██████████| 6/6 [00:46<00:00,  7.77s/it, loss=0.0652, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_auc_cdc28         0.8005952835083008
        test_loss           0.08877734839916229
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 carp_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                           

  rank_zero_warn(


Epoch 136: 100%|██████████| 27/27 [04:06<00:00,  9.14s/it, loss=0.408, v_num=1.79e+7]  


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_auc_signalp        0.8394442796707153
        test_loss           0.4050133228302002
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
signalp linear


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 31.0 K
2 | classifier | Linear     | 33    
------------------------------------------
34.1 K    Trainable params
0         Non-trainable params
34.1 K    Total params
0.136     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 100: 100%|██████████| 27/27 [03:20<00:00,  7.41s/it, loss=0.277, v_num=1.79e+7]  


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_auc_signalp        0.9029958248138428
        test_loss           0.3806380033493042
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
signalp cnn


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
16.8 K    Trainable params
658 K     Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 433: 100%|██████████| 27/27 [16:18<00:00, 36.26s/it, loss=0.152, v_num=1.79e+7]  


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_auc_signalp        0.9627299904823303
        test_loss           0.20588509738445282
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
signalp ms_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 658 K 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
675 K     Trainable params
0         Non-trainable params
675 K     Total params
2.702     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 16: 100%|██████████| 27/27 [00:56<00:00,  2.08s/it, loss=0.000688, v_num=1.79e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]


Testing DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    test_auc_signalp        0.9736552238464355
        test_loss           0.3075225055217743
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
signalp random_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-a15c92af-b942-6540-1a2c-4509456ac8f1]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 19: 100%|██████████| 27/27 [00:49<00:00,  1.82s/it, loss=0.0612, v_num=1.79e+7]

In [None]:
df = pd.DataFrame(aucs,index=[0])
df.columns.names = ['dataset','model']
df = df.T
df = df.pivot_table(index='dataset',columns='model')[0]
# df['ms_naive'] = [aucs[('cdc28','ms_naive')],aucs[('mito','ms_naive')],aucs[('signalp','ms_naive')]]
df.round(4).T

In [None]:
# from sklearn.metrics import confusion_matrix

# model = model.cpu()
# model.eval()

# ys = []
# y_preds = []

# for batch in dm.val_dataloader():
#     y_pred = model.predict_step(batch, 0).detach().cpu().numpy()
#     y = batch['y'].cpu().numpy()
#     ys.append(y)
#     y_preds.append(y_pred)
# y = np.concatenate(ys)
# y_pred = np.concatenate(y_preds)

# for k in range(y.shape[1]):
#     plt.figure(figsize=(4,4))
#     sns.heatmap(
#         confusion_matrix(y[:,k], y_pred[:,k]>0.5),
#         annot=True, fmt='d', cmap='Blues'
#     )