In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from src.baselines import CNNModel, CARPModel, MSModel, LinearModel
from src.torch_helpers import NamedTensorDataset
from src.datamodule import PeptideDataModule
from pytorch_lightning import Trainer
from src.torch_helpers import NoValProgressBar
from src.constants import MSConstants
C = MSConstants()

torch.manual_seed(0);

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# from src.torch_helpers import start_tensorboard
# start_tensorboard(login_node='login-2')

In [5]:
# generate negatives by producing tons of shuffled sequences
# then cluster, and only take clusters containing sufficiently many positives

# from src.cdhit import CDHIT

# def generate_negatives(sequences, num_shuffles=10, min_frac=0.1, random_state=0):
#     pos_seqs = list(sequences)
#     neg_seqs = []
#     rng = npr.RandomState(random_state)
#     for n in range(num_shuffles):
#         neg_seqs += [''.join(rng.permutation(list(s))) for s in pos_seqs]

#     seqs = np.array(pos_seqs + neg_seqs)
#     ids = np.array([1]*len(pos_seqs) + [0]*len(neg_seqs))
#     clusters = np.array(CDHIT(threshold=0.5,word_length=3).fit_predict(seqs))
#     pos_frac = pd.DataFrame([clusters,ids],index=['clusters','ids']).T.groupby('clusters').mean()['ids']
#     pos_clusters = set(pos_frac[pos_frac>min_frac].index)
    
#     pos_seqs = set(pos_seqs)
#     negatives = [s for i,s,c in zip(ids,seqs,clusters) if i==0 and c in pos_clusters and s not in pos_seqs]
    
#     rng.shuffle(negatives)
#     negatives = negatives[:len(sequences)]
    
#     assert len(negatives) == len(sequences)
    
#     return negatives

# Datasets

In [6]:
datasets = {}

### Mitochondrial targeting

In [7]:
df = pd.read_csv('./data/mitochondria_targeting.csv')
df = df[['Sequence','Mitochondrial Targeting Signal']].drop_duplicates(keep='first')
df = df.loc[df['Sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    'mito',
    sequence=df['Sequence'],
    x=df['Sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['Sequence'].map(lambda s: [1]*len(s)),
    y=df['Mitochondrial Targeting Signal'].astype(int)
)

datasets[dataset.name] = dataset

len(dataset)

5349

### Cdc28 binding

In [8]:
df = pd.read_csv('./data/cdc28_binding.csv')
df = df[['Sequence','Cdc28 Binding']].drop_duplicates(keep='first')
df = df.loc[df['Sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    'cdc28',
    sequence=df['Sequence'],
    x=df['Sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['Sequence'].map(lambda s: [1]*len(s)),
    y=df['Cdc28 Binding'].astype(int)
)

datasets[dataset.name] = dataset

len(dataset)

5348

### Signal peptide - MUST MULTITASK

In [9]:
with open('./data/train_set.fasta','r') as f:
    fasta = [l.strip() for l in f]
    df = pd.Series(fasta[::3]).str.extract(
        '>(?P<uniprot>[^\|]+)\|(?P<kingdom>[^|]+)\|(?P<type>[^|]+)\|(?P<partition>[^|]+)'
    )
    df['sequence'] = fasta[1::3]
    df['annotation'] = fasta[2::3]
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    'signalp',
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=df['type'].astype('category').cat.codes
)

datasets[dataset.name] = dataset

len(dataset)

20290

### HLA binding

In [10]:
pos_seqs = !cat ./data/mlci2012/binding_HLA-A0201.txt
neg_seqs = !cat ./data/mlci2012/nonbinding_HLA-A0201.txt
df = pd.DataFrame({
    'sequence': pos_seqs + neg_seqs,
    'hla_binding': [1]*len(pos_seqs) + [0]*len(neg_seqs)
})
df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
df = df.sample(frac=1.,random_state=0)

dataset = NamedTensorDataset(
    'hla',
    sequence=df['sequence'],
    x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
    x_mask=df['sequence'].map(lambda s: [1]*len(s)),
    y=df['hla_binding'].astype(int)
)

datasets[dataset.name] = dataset

len(dataset)

11747

### SATPDB

In [11]:
# fns = !ls ./data/satpdb/*.fasta
# df = {}
# for fn in fns:
#     name = fn.split('/')[-1].split('.')[0]
#     with open(fn,'r') as f:
#         fasta = [l.strip() for l in f]
#     df[name] = pd.Series([1]*len(fasta[::2]),index=fasta[1::2],name=name)
#     df[name] = df[name].reset_index().drop_duplicates(subset='index',keep='first')
#     df[name] = df[name].set_index('index')[name]
# df = pd.DataFrame(df).fillna(0)
# df.index.name = 'sequence'
# df = df.reset_index()
# df = df.loc[df['sequence'].map(lambda s: all([c==c.upper() and c in C.alphabet for c in s]))]
# df = df.loc[df['sequence'].map(len)>=5]
# df = df.loc[df['sequence'].map(len)<=100]

# for name in df.columns[1:]:
#     dataset = NamedTensorDataset(
#         sequence=df['sequence'],
#         x=df['sequence'].map(lambda s: [C.alphabet.index(c) for c in s]),
#         x_mask=df['sequence'].map(lambda s: [1]*len(s)),
#         y=df[name].values[:,None].astype(int)
#     )
#     datasets[name] = dataset
#     print(name,len(dataset))

In [12]:
from torch.utils.data import Subset

npr.seed(0)

MIN_LENGTH = 5
MAX_LENGTH = 100

for d in datasets:
    idxs = [
        i 
        for i,item in enumerate(datasets[d]) 
        if (len(item['sequence'])>=MIN_LENGTH) and (len(item['sequence'])<=MAX_LENGTH)
    ]
    npr.shuffle(idxs)
    datasets[d] = Subset(datasets[d], idxs)
    print(d, len(datasets[d]))

mito 4121
cdc28 4120
signalp 20290
hla 11747


# Models

In [13]:
models = {}

In [14]:
MODEL_DIM = 128
LR = 5e-4

### MS pretraining

In [15]:
[last_ckpt] = !ls -t1 ./version_17869980/checkpoints/*.ckpt | head -n1
last_ckpt

'./version_17869980/checkpoints/epoch=92-step=114948-best.ckpt'

In [16]:
model = lambda output_dim: MSModel(
    checkpoint = last_ckpt,
    model_dim = MODEL_DIM,
    output_dim = output_dim,
    fixed_weights = True,
    naive = False,
    lr = LR
)
models['ms_pretrained_frozen'] = model

In [17]:
# model = lambda output_dim: MSModel(
#     checkpoint = last_ckpt,
#     model_dim = MODEL_DIM,
#     output_dim = output_dim,
#     fixed_weights = False,
#     naive = False,
#     lr = LR
# )
# models['ms_pretrained_finetune'] = model

### Linear baseline

In [18]:
model = lambda output_dim: LinearModel(
    output_dim = output_dim,
    model_dim = MODEL_DIM,
    num_residues = len(C.alphabet),
    lr = LR,
)
models['linear'] = model

### CNN baseline

In [19]:
# https://arxiv.org/pdf/2011.03443.pdf
# differences: 128 vs 1024, attention vs max pool
model = lambda output_dim : CNNModel(
    output_dim = output_dim,
    model_dim = MODEL_DIM,
    model_depth = 3,
    kernel_size = 5,
    num_residues = len(C.alphabet),
    dropout = 0.,
    lr = LR,
)
models['cnn'] = model

### Random

In [20]:
model = lambda output_dim : MSModel(
    checkpoint = last_ckpt,
    model_dim = MODEL_DIM,
    output_dim = output_dim,
    fixed_weights = False,
    naive = True,
    lr = LR,
)
models['random_frozen'] = model

In [21]:
# model = lambda output_dim : MSModel(
#     checkpoint = last_ckpt,
#     model_dim = MODEL_DIM,
#     output_dim = output_dim,
#     fixed_weights = True,
#     naive = True,
#     lr = LR
# )
# models['random_finetune'] = model

### Large language model

In [22]:
model = lambda output_dim : CARPModel(
    output_dim = output_dim,
    fixed_weights = True,
    lr = LR,
)
models['carp_pretrained_frozen'] = model

In [23]:
# model = lambda output_dim : CARPModel(
#     output_dim = output_dim,
#     fixed_weights = False,
#     lr = LR,
# )
# models['carp_pretrained_finetune'] = model

# you cannot do this on CPU! it gives NAN

In [26]:
for MODEL in models.keys():
    for DATASET in datasets.keys():
        name = MODEL+'_'+DATASET
        !rm -rf ./lightning_logs/$name
!rm -rf ./lightning_logs/version_$SLURM_JOBID

In [25]:
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import DataLoader
from src.torch_helpers import zero_padding_collate
from src.cdhit import cdhit_split

seed_everything(0, workers=True)

aucs = {}

for MODEL in models.keys():
    for DATASET in datasets.keys():
        name = MODEL+'_'+DATASET
        !rm -rf ./lightning_logs/$name
!rm -rf ./lightning_logs/version_$SLURM_JOBID

for DATASET in datasets.keys():
    dataset = datasets[DATASET]
    
    sequences = [item['sequence'] for item in dataset]
    train_val_seqs, test_seqs, train_val_dataset, test_dataset = cdhit_split(
        sequences,
        dataset,
        split=2./3,
        threshold=0.5,
        word_length=3
    )
    
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=len(test_dataset),
        collate_fn=zero_padding_collate,
        num_workers=1,
        shuffle=False,
        drop_last=False
    )
    
    dm = PeptideDataModule(
        train_val_dataset,
        batch_size=256,
        val_batch_size=-1,
        train_val_split=0.5,
        cdhit_threshold=0.5,
        cdhit_word_length=3,
        num_workers=4
    )
    dm.setup()
    
    OUTPUT_DIM = 6 if DATASET == 'signalp' else 1

    for MODEL in models.keys():
        name = MODEL+'_'+DATASET
        
        torch.manual_seed(0)
        
        model = models[MODEL](OUTPUT_DIM)
        
        model.name = DATASET
        
        trainer = Trainer(
            gpus=0,
            precision=32,
            max_epochs=1000,
            min_epochs=30,
            callbacks=[
                NoValProgressBar(),
                EarlyStopping(
                    monitor=f'val_auc_{DATASET}',
                    mode='max',
                    patience=10
                ),
                ModelCheckpoint(
                    monitor=f'val_auc_{DATASET}', 
                    mode='max',
                    save_top_k=1,
                    filename='{epoch}-{step}-best'
                )
            ]
        )

        trainer.fit(model, dm)
        
        [best_ckpt] = !ls -t1 ./lightning_logs/version_$SLURM_JOBID/checkpoints/*.ckpt | head -n1
        checkpoint = torch.load(best_ckpt)
        model.load_state_dict(checkpoint['state_dict'])
        
        metrics, = trainer.test(model, test_dataloader)
        
        aucs[(DATASET,MODEL)] = metrics[f'test_auc_{DATASET}']
        
        print(DATASET, MODEL)

        !mv ./lightning_logs/version_$SLURM_JOBID ./lightning_logs/latest
        !mv ./lightning_logs/latest ./lightning_logs/$name

Global seed set to 0
Multiprocessing is handled by SLURM.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                           

  rank_zero_warn(


Epoch 55: 100%|██████████| 6/6 [02:01<00:00, 20.27s/it, loss=0.0515, v_num=1.79e+7] 


  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8738208413124084
        test_loss           0.11530359089374542
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito ms_pretrained_frozen


Multiprocessing is handled by SLURM.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type      | Params
-----------------------------------------
0 | embedding  | Embedding | 3.1 K 
1 | classifier | Linear    | 129   
-----------------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 199: 100%|██████████| 6/6 [01:57<00:00, 19.65s/it, loss=0.132, v_num=1.79e+7] 
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  4.33it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.8317251205444336
        test_loss           0.12419627606868744
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito linear


Multiprocessing is handled by SLURM.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 3.1 K 
1 | encoder    | Sequential | 51.5 K
2 | classifier | Linear     | 33    
------------------------------------------
54.6 K    Trainable params
0         Non-trainable params
54.6 K    Total params
0.218     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 33: 100%|██████████| 6/6 [00:27<00:00,  4.59s/it, loss=0.411, v_num=1.79e+7]
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  3.06it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.7852209806442261
        test_loss           0.4843822121620178
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito cnn


Multiprocessing is handled by SLURM.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
620 K     Trainable params
0         Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 10: 100%|██████████| 6/6 [00:47<00:00,  7.95s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 11: 100%|██████████| 6/6 [00:52<00:00,  8.71s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 12: 100%|██████████| 6/6 [00:57<00:00,  9.51s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 13: 100%|██████████| 6/6 [01:01<00:00, 10.25s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 14: 100%|██████████| 6/6 [01:05<00:00, 10.99s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 15: 100%|██████████| 6/6 [01:10<00:00, 11.73s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 16: 100%|██████████| 6/6 [01:15<00:00, 12.53s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 17: 100%|██████████| 6/6 [01:19<00:00, 13.22s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 18: 100%|██████████| 6/6 [01:23<00:00, 13.99s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 19: 100%|██████████| 6/6 [01:28<00:00, 14.72s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 20: 100%|██████████| 6/6 [01:33<00:00, 15.50s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 21: 100%|██████████| 6/6 [01:37<00:00, 16.24s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 22: 100%|██████████| 6/6 [01:41<00:00, 16.92s/it, loss=nan, v_num=1.79e+7]

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 23: 100%|██████████| 6/6 [01:45<00:00, 17.66s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 24: 100%|██████████| 6/6 [01:50<00:00, 18.44s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 25: 100%|██████████| 6/6 [01:54<00:00, 19.14s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 26: 100%|██████████| 6/6 [01:59<00:00, 19.92s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 27: 100%|██████████| 6/6 [02:04<00:00, 20.67s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 28: 100%|██████████| 6/6 [02:08<00:00, 21.41s/it, loss=nan, v_num=1.79e+7] 

Trainer was signaled to stop but required minimum epochs (30) or minimum steps (None) has not been met. Training will continue...


Epoch 29: 100%|██████████| 6/6 [02:12<00:00, 22.16s/it, loss=nan, v_num=1.79e+7] 
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_auc_mito         0.5863801836967468
        test_loss                   nan
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
mito random_frozen


Multiprocessing is handled by SLURM.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type           | Params
----------------------------------------------
0 | encoder    | ByteNet        | 603 K 
1 | classifier | ESMAttention1d | 16.8 K
----------------------------------------------
16.8 K    Trainable params
603 K     Non-trainable params
620 K     Total params
2.481     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 6:  67%|██████▋   | 4/6 [00:12<00:06,  3.16s/it, loss=0.151, v_num=1.79e+7]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


TypeError: cannot unpack non-iterable NoneType object

In [None]:
df = pd.DataFrame(aucs,index=[0])
df.columns.names = ['dataset','model']
df = df.T
df = df.pivot_table(index='dataset',columns='model')[0]
# df['ms_naive'] = [aucs[('cdc28','ms_naive')],aucs[('mito','ms_naive')],aucs[('signalp','ms_naive')]]
df.round(4).T

In [None]:
# from sklearn.metrics import confusion_matrix

# model = model.cpu()
# model.eval()

# ys = []
# y_preds = []

# for batch in dm.val_dataloader():
#     y_pred = model.predict_step(batch, 0).detach().cpu().numpy()
#     y = batch['y'].cpu().numpy()
#     ys.append(y)
#     y_preds.append(y_pred)
# y = np.concatenate(ys)
# y_pred = np.concatenate(y_preds)

# for k in range(y.shape[1]):
#     plt.figure(figsize=(4,4))
#     sns.heatmap(
#         confusion_matrix(y[:,k], y_pred[:,k]>0.5),
#         annot=True, fmt='d', cmap='Blues'
#     )