In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from src.baselines import CNNModel, CARPModel, MSModel, LinearModel
from src.torch_helpers import NamedTensorDataset
from src.datamodule import PeptideDataModule
from pytorch_lightning import Trainer
from src.torch_helpers import NoValProgressBar
from src.constants import MSConstants
# Shared project constants; the dataset and model cells below read `C.alphabet`
# to encode residues and to size the embedding tables.
C = MSConstants()

# Seed torch's global RNG once at notebook start (the trailing `;` suppresses
# the cell's echo of the returned generator object).
torch.manual_seed(0);

# NOTE(review): presumably launches a TensorBoard server reachable through the
# cluster login node 'login-2' — confirm against src.torch_helpers.
from src.torch_helpers import start_tensorboard
start_tensorboard(login_node='login-2')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# generate negatives by producing tons of shuffled sequences
# then cluster, and only take clusters containing sufficiently many positives

# from src.cdhit import CDHIT

# def generate_negatives(sequences, num_shuffles=10, min_frac=0.1, random_state=0):
#     pos_seqs = list(sequences)
#     neg_seqs = []
#     rng = npr.RandomState(random_state)
#     for n in range(num_shuffles):
#         neg_seqs += [''.join(rng.permutation(list(s))) for s in pos_seqs]

#     seqs = np.array(pos_seqs + neg_seqs)
#     ids = np.array([1]*len(pos_seqs) + [0]*len(neg_seqs))
#     clusters = np.array(CDHIT(threshold=0.5,word_length=3).fit_predict(seqs))
#     pos_frac = pd.DataFrame([clusters,ids],index=['clusters','ids']).T.groupby('clusters').mean()['ids']
#     pos_clusters = set(pos_frac[pos_frac>min_frac].index)
    
#     pos_seqs = set(pos_seqs)
#     negatives = [s for i,s,c in zip(ids,seqs,clusters) if i==0 and c in pos_clusters and s not in pos_seqs]
    
#     rng.shuffle(negatives)
#     negatives = negatives[:len(sequences)]
    
#     assert len(negatives) == len(sequences)
    
#     return negatives

# Datasets

In [5]:
# Registry of benchmark datasets, keyed by a short task name; the dataset
# cells below each add one entry and the benchmark loop iterates over it.
datasets = {}

### Mitochondrial targeting

In [6]:
# Mitochondrial-targeting task: keep only the sequence and label columns,
# drop exact duplicate rows, shuffle reproducibly, and discard sequences
# longer than 100 residues.
df = (
    pd.read_csv('./data/mitochondria_targeting.csv')
    [['Sequence','Mitochondrial Targeting Signal']]
    .drop_duplicates(keep='first')
    .sample(frac=1.,random_state=0)
    .loc[lambda frame: frame['Sequence'].map(len)<=100]
)

dataset = NamedTensorDataset(
    # raw strings, kept so the benchmark loop can split by sequence
    sequence=df['Sequence'],
    # residues encoded as their positions in C.alphabet
    x=df['Sequence'].map(lambda seq: list(map(C.alphabet.index, seq))),
    # all-ones mask with one entry per residue
    x_mask=df['Sequence'].map(lambda seq: [1 for _ in seq]),
    # single label column as an (n, 1) int32 array
    y=df[['Mitochondrial Targeting Signal']].astype(np.int32).to_numpy()
)

datasets['mito'] = dataset

len(dataset)

4121

### Cdc28 binding

In [7]:
# Cdc28-binding task: keep only the sequence and label columns, drop exact
# duplicate rows, shuffle reproducibly, and discard sequences longer than
# 100 residues.
df = (
    pd.read_csv('./data/cdc28_binding.csv')
    [['Sequence','Cdc28 Binding']]
    .drop_duplicates(keep='first')
    .sample(frac=1.,random_state=0)
    .loc[lambda frame: frame['Sequence'].map(len)<=100]
)

dataset = NamedTensorDataset(
    # raw strings, kept so the benchmark loop can split by sequence
    sequence=df['Sequence'],
    # residues encoded as their positions in C.alphabet
    x=df['Sequence'].map(lambda seq: list(map(C.alphabet.index, seq))),
    # all-ones mask with one entry per residue
    x_mask=df['Sequence'].map(lambda seq: [1 for _ in seq]),
    # single label column as an (n, 1) int32 array
    y=df[['Cdc28 Binding']].astype(np.int32).to_numpy()
)

datasets['cdc28'] = dataset

len(dataset)

4120

### Signal peptide

In [8]:
# Signal-peptide task. The FASTA file is read with a stride of three lines per
# record: header, sequence, annotation.
with open('./data/train_set.fasta','r') as f:
    fasta = [line.strip() for line in f]

# Header fields are pipe-separated: >uniprot|kingdom|type|partition.
# The pattern is a raw string so that `\|` reaches the regex engine as written
# instead of being an invalid string escape (a SyntaxWarning on Python 3.12+).
df = pd.Series(fasta[::3]).str.extract(
    r'>(?P<uniprot>[^|]+)\|(?P<kingdom>[^|]+)\|(?P<type>[^|]+)\|(?P<partition>[^|]+)'
)
df['sequence'] = fasta[1::3]
df['annotation'] = fasta[2::3]

# Shuffle reproducibly, then discard sequences longer than 100 residues.
df = df.sample(frac=1.,random_state=0)
df = df.loc[df['sequence'].map(len)<=100]

dataset = NamedTensorDataset(
    sequence=df['sequence'],
    # residues encoded as their positions in C.alphabet
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    # all-ones mask with one entry per residue
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    # binary label as an (n, 1) int array: 1 for every type other than 'NO_SP'
    y=(df['type']!='NO_SP').values[:,None].astype(int)
)

datasets['signalp'] = dataset

len(dataset)

20290

### SATPDB

In [9]:
# fns = !ls ./data/satpdb/*.fasta
# df = {}
# for fn in fns:
#     name = fn.split('/')[-1].split('.')[0]
#     with open(fn,'r') as f:
#         fasta = [l.strip() for l in f]
#     df[name] = pd.Series([1]*len(fasta[::2]),index=fasta[1::2],name=name)
#     df[name] = df[name].reset_index().drop_duplicates(subset='index',keep='first')
#     df[name] = df[name].set_index('index')[name]
# df = pd.DataFrame(df).fillna(0)
# df.index.name = 'sequence'
# df = df.reset_index()
# df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
# df = df.loc[df['sequence'].map(len)>=5]
# df = df.loc[df['sequence'].map(len)<=100]

# for name in df.columns[1:]:
#     dataset = NamedTensorDataset(
#         sequence=df['sequence'],
#         x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
#         x_mask=df['sequence'].map(lambda s: [1]*len(s)),
#         y=df[name].values[:,None].astype(int)
#     )
#     datasets[name] = dataset
#     print(name,len(dataset))

# Models

In [10]:
# Registry of zero-argument model factories keyed by model name; the benchmark
# loop calls each factory to get a freshly constructed model per run.
models = {}

### Linear baseline

In [11]:
def model():
    """Return a newly constructed linear baseline with a single output."""
    return LinearModel(
        output_dim = 1,
        model_dim = 128,
        num_residues = len(C.alphabet),
        lr = 5e-4,
        output_weights = [(None,1),]
    )

models['linear'] = model

### CNN baseline

In [12]:
def model():
    """Return a newly constructed CNN baseline (depth 3, kernel size 3, one output)."""
    return CNNModel(
        output_dim = 1,
        model_dim = 128,
        model_depth = 3,
        kernel_size = 3,
        num_residues = len(C.alphabet),
        dropout = 0.1,
        lr = 5e-4,
        output_weights = [(None,1),]
    )

models['cnn'] = model

### MS transformer

In [13]:
[last_ckpt] = !ls -t1 ./ms_transformer/checkpoints/*.ckpt | head -n1
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = True,
    naive = False,
    lr = 5e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['ms_pretrained_frozen'] = model

In [14]:
[last_ckpt] = !ls -t1 ./ms_transformer/checkpoints/*.ckpt | head -n1
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = False,
    naive = False,
    lr = 5e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['ms_pretrained_finetune'] = model

In [15]:
[last_ckpt] = !ls -t1 ./ms_transformer/checkpoints/*.ckpt | head -n1
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = False,
    naive = True,
    lr = 5e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['ms_naive_frozen'] = model

In [None]:
[last_ckpt] = !ls -t1 ./ms_transformer/checkpoints/*.ckpt | head -n1
model = lambda : MSModel(
    checkpoint = last_ckpt,
    model_dim = 128,
    output_dim = 1,
    fixed_weights = False,
    naive = True,
    lr = 5e-4,
    output_weights = [(None,1),],
    max_length=100
)
models['ms_naive_finetune'] = model

### Large language model

In [16]:
def model():
    """Return a new CARPModel with fixed_weights=True and a single output."""
    return CARPModel(
        output_dim = 1,
        fixed_weights = True,
        max_length = 100,
        lr = 5e-4,
        output_weights = [(None,1),]
    )

models['carp_pretrained_frozen'] = model

In [17]:
def model():
    """Return a new CARPModel with fixed_weights=False and a single output."""
    return CARPModel(
        output_dim = 1,
        fixed_weights = False,
        max_length = 100,
        lr = 5e-4,
        output_weights = [(None,1),]
    )

models['carp_pretrained_finetune'] = model

In [20]:
# WARNING: destructive — deletes everything under ./lightning_logs so the
# benchmark loop can rename its run directories without name collisions.
# Run this before the benchmark cell, not after it.
!rm -rf ./lightning_logs/*

In [19]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import DataLoader
from src.torch_helpers import zero_padding_collate
from src.cdhit import cdhit_split

# Test AUC for every run, keyed by (dataset name, model name).
aucs = {}

# Outer loop: one held-out test split and one data module per dataset.
for DATASET in datasets.keys():
    dataset = datasets[DATASET]
    
    # NOTE(review): cdhit_split presumably clusters the sequences with CD-HIT
    # (threshold 0.5, word length 3) and assigns whole clusters to either side,
    # with split=0.67 being the train+val fraction — confirm against src.cdhit.
    sequences = [item['sequence'] for item in dataset]
    train_val_seqs, test_seqs, train_val_dataset, test_dataset = cdhit_split(
        sequences,
        dataset,
        split=0.67,
        threshold=0.5,
        word_length=3
    )
    
    # The whole test set is evaluated as a single, unshuffled batch.
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=len(test_dataset),
        collate_fn=zero_padding_collate,
        num_workers=1,
        shuffle=False,
        drop_last=False
    )
    
    # NOTE(review): the data module presumably performs a second CD-HIT-based
    # 50/50 train/validation split, and val_batch_size=-1 presumably means
    # "full validation set in one batch" — confirm against PeptideDataModule.
    dm = PeptideDataModule(
        train_val_dataset,
        batch_size=256,
        val_batch_size=-1,
        train_val_split=0.5,
        cdhit_threshold=0.5,
        cdhit_word_length=3,
        num_workers=4
    )
    dm.setup()

    # Inner loop: train and test every registered model on this dataset.
    for MODEL in models.keys():
        # Re-seed before construction so every model starts from the same RNG state.
        torch.manual_seed(0)
        
        # Call the factory to get a fresh, untrained instance.
        model = models[MODEL]()
        
        # Replace the factory's [(None,1)] with the dataset name.
        # NOTE(review): presumably this is what names the logged metrics
        # (val_auc_<DATASET>, test_auc_<DATASET>) read back below — confirm.
        model.output_weights = [(DATASET,1)]
        
        # Stop once validation AUC has not improved for 20 checks.
        trainer = Trainer(
            gpus=1,
            precision=32,
            callbacks=[
                NoValProgressBar(),
                EarlyStopping(
                    monitor=f'val_auc_{DATASET}',
                    mode='max',
                    patience=20
                )
            ]
        )

        trainer.fit(model, dm)
        
        # NOTE(review): no ckpt_path is passed, so this presumably evaluates the
        # weights from the last epoch (after the patience window) rather than the
        # best-validation checkpoint — confirm this is intended.
        metrics = trainer.test(model, test_dataloader)
        
        aucs[(DATASET,MODEL)] = metrics[0][f'test_auc_{DATASET}']
        
        print(DATASET, MODEL)

        # Rename this run's log directory to <MODEL>_<DATASET>, going through a
        # temporary `latest` directory.
        # NOTE(review): assumes the Lightning logger version equals $SLURM_JOBID;
        # outside a SLURM job the variable is unset and the first mv will fail.
        !mv ./lightning_logs/version_$SLURM_JOBID ./lightning_logs/latest
        name = MODEL+'_'+DATASET
        !mv ./lightning_logs/latest ./lightning_logs/$name

Multiprocessing is handled by SLURM.
  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                           

  rank_zero_warn(


Epoch 217: 100%|██████████| 6/6 [02:44<00:00, 27.35s/it, loss=0.127, v_num=1.78e+7] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito                 0.0
      test_auc_mito         0.8312863707542419
        test_loss           0.12216643989086151
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito linear


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 31.0 K
2 | classifier | Linear     | 33    
------------------------------------------
34.1 K    Trainable params
0         Non-trainable params
34.1 K    Total params
0.136     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 70: 100%|██████████| 6/6 [00:52<00:00,  8.74s/it, loss=0.165, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito                 0.0
      test_auc_mito         0.8200159668922424
        test_loss           0.22129809856414795
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito cnn


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
16.8 K    Trainable params
2.1 M     Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 42: 100%|██████████| 6/6 [00:41<00:00,  6.93s/it, loss=0.156, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito                 0.0
      test_auc_mito         0.7749186754226685
        test_loss           0.1338619887828827
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito ms_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 60: 100%|██████████| 6/6 [01:14<00:00, 12.39s/it, loss=0.0109, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.65it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito         0.36734694242477417
      test_auc_mito         0.7436915040016174
        test_loss           0.24234524369239807
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito ms_finetune


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 72: 100%|██████████| 6/6 [01:32<00:00, 15.41s/it, loss=0.0816, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito         0.44897958636283875
      test_auc_mito          0.853142261505127
        test_loss           0.12625084817409515
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito ms_naive


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 69: 100%|██████████| 6/6 [01:15<00:00, 12.51s/it, loss=0.0795, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito         0.2448979616165161
      test_auc_mito         0.8362365961074829
        test_loss           0.11306300759315491
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito carp_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
620 K     Trainable params
0         Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 25: 100%|██████████| 6/6 [00:38<00:00,  6.43s/it, loss=0.00143, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_mito         0.4285714626312256
      test_auc_mito         0.8363299369812012
        test_loss           0.23317386209964752
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito carp_finetune


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 20: 100%|██████████| 6/6 [00:16<00:00,  2.67s/it, loss=0.242, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28          0.569185733795166
        test_loss           0.2238730490207672
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 linear


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 31.0 K
2 | classifier | Linear     | 33    
------------------------------------------
34.1 K    Trainable params
0         Non-trainable params
34.1 K    Total params
0.136     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 7:  83%|████████▎ | 5/6 [00:05<00:01,  1.19s/it, loss=0.501, v_num=1.78e+7]



Epoch 106: 100%|██████████| 6/6 [01:23<00:00, 13.96s/it, loss=0.0803, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28         0.7668333053588867
        test_loss           0.16059112548828125
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 cnn


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
16.8 K    Trainable params
2.1 M     Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 112: 100%|██████████| 6/6 [01:50<00:00, 18.42s/it, loss=0.0727, v_num=1.78e+7] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28         0.8034403920173645
        test_loss           0.08994068205356598
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 ms_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 83: 100%|██████████| 6/6 [01:43<00:00, 17.19s/it, loss=0.00806, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28          0.103448286652565
     test_auc_cdc28         0.6686960458755493
        test_loss           0.17411746084690094
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 ms_finetune


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name        | Type           | Params
-----------------------------------------------
0 | transformer | MSTransformer  | 2.1 M 
1 | classifier  | ESMAttention1d | 16.8 K
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.590     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 22: 100%|██████████| 6/6 [00:28<00:00,  4.67s/it, loss=0.094, v_num=1.78e+7] 


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28         0.7337237000465393
        test_loss           0.10438255220651627
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 ms_naive


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 63: 100%|██████████| 6/6 [01:08<00:00, 11.39s/it, loss=0.0573, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28         0.7723516225814819
        test_loss           0.09800759702920914
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 carp_frozen


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
620 K     Trainable params
0         Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 23: 100%|██████████| 6/6 [00:32<00:00,  5.43s/it, loss=0.00654, v_num=1.78e+7]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     test_acc_cdc28                 0.0
     test_auc_cdc28         0.6349387168884277
        test_loss           0.18584546446800232
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cdc28 carp_finetune


Multiprocessing is handled by SLURM.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-745be0d7-20cb-0c84-dda6-d958e5cf3fc0]

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                           

  rank_zero_warn(


Epoch 20:  67%|██████▋   | 18/27 [00:36<00:18,  2.05s/it, loss=0.435, v_num=1.78e+7]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 60:  89%|████████▉ | 24/27 [02:26<00:18,  6.09s/it, loss=0.158, v_num=1.78e+7]  

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Reshape the {(dataset, model): auc} mapping into a dataset x model table.
df = (
    pd.DataFrame(aucs,index=[0])
    .rename_axis(columns=['dataset','model'])
    .T
    .pivot_table(index='dataset',columns='model')[0]
)
# df['ms_naive'] = [aucs[('cdc28','ms_naive')],aucs[('mito','ms_naive')],aucs[('signalp','ms_naive')]]

# Display with models as rows and datasets as columns, rounded to 4 decimals.
df.round(4).T

# Open question: should a naive variant be trained for all models?

In [None]:
# from sklearn.metrics import confusion_matrix

# model = model.cpu()
# model.eval()

# ys = []
# y_preds = []

# for batch in dm.val_dataloader():
#     y_pred = model.predict_step(batch, 0).detach().cpu().numpy()
#     y = batch['y'].cpu().numpy()
#     ys.append(y)
#     y_preds.append(y_pred)
# y = np.concatenate(ys)
# y_pred = np.concatenate(y_preds)

# for k in range(y.shape[1]):
#     plt.figure(figsize=(4,4))
#     sns.heatmap(
#         confusion_matrix(y[:,k], y_pred[:,k]>0.5),
#         annot=True, fmt='d', cmap='Blues'
#     )