In [1]:
import argparse
import os
import pickle
import random

import esm
import numpy as np
import torch

from data_loader import TestSequencesLoader, TestFASTALoader

In [2]:
from lightning.pytorch import LightningModule
from lightning.pytorch import Trainer
#from torchmetrics import MetricCollection
#from torchmetrics.classification import Precision, Recall, AveragePrecision, ConfusionMatrix, ROC 

from finetune import ESMFinetuner

# Seed every RNG in use (Python, NumPy, PyTorch CPU/CUDA) so repeated runs
# of the notebook are comparable.
seed = 12306
random.seed(seed)
np.random.seed(seed)  # NumPy's global RNG is separate from random/torch
torch.manual_seed(seed)
if torch.cuda.is_available():
    print('cuda is available')
    torch.cuda.manual_seed_all(seed)

# Trade some float32 matmul precision for speed
# (see the torch.set_float32_matmul_precision documentation).
torch.set_float32_matmul_precision('medium')

Current device: cuda
cuda is available


#### Initialize parameters that need to be set for the model to load and predict

In [3]:
class Params:
    """Settings object passed to ``ESMFinetuner`` and the test data loaders.

    All settings are plain instance attributes. ``Params()`` yields the
    defaults below; any of them can be replaced (or new attributes added,
    e.g. ``meta_data_file``) by passing keyword arguments, for example
    ``Params(bs=8, fasta_file="my.fasta")``.

    Args:
        **overrides: attribute name/value pairs applied after the defaults.
            Names are not validated, so a misspelled name creates a new
            attribute instead of raising.
    """

    def __init__(self, **overrides):
        self.test_mode = True
        # NOTE(review): the recorded prediction outputs in this notebook have
        # 20 columns, not 17 -- the loaded checkpoint appears to carry its own
        # class count. Confirm which value is authoritative.
        self.n_classes = 17
        self.granularity = 2
        self.bs = 64  # presumably the batch size -- TODO confirm against the loaders
        self.output_dir = "./outputs"  # directory the prediction pickles are written to
        self.suffix = "seq2symm"  # suffix embedded in the prediction pickle file name
        self.n_epoch = 1  # forwarded to Trainer(max_epochs=...)
        self.num_layers_frozen = 31
        self.fasta_file = "/tmp/seq.fasta"  # presumably the input of TestFASTALoader -- verify
        self.weighted_sampler = False
        self.use_soft_labels = False
        self.chkpt_file = "../../models/jointCoarseFine_wted_oversampled_distilled_marginLoss/epoch=2-step=2037.ckpt"

        # Apply caller-supplied values last so they win over the defaults.
        for name, value in overrides.items():
            setattr(self, name, value)

#### Define a function that aggregates predictions across batches / multiple GPUs

In [4]:
def aggregate_predictions(d):
    """Merge per-batch prediction outputs into a single result dictionary.

    Args:
        d: sequence of per-batch results (as returned by ``trainer.predict``
            in this notebook). Each item is indexed positionally:
            ``item[0]`` is the collection of protein ids for the batch,
            ``item[1]`` the predicted values and ``item[2]`` the true labels.
            Items 1 and 2 must be stackable row-wise with ``np.vstack``.

    Returns:
        dict with keys ``'y_pred'`` (row-stacked predictions), ``'y_true'``
        (row-stacked labels, cast to ``int64`` when they arrive as
        ``float64``) and ``'pdbids'`` (flat list of ids in batch order).

    Raises:
        ValueError: if ``d`` contains no batches.
    """
    if len(d) == 0:
        raise ValueError('aggregate_predictions() needs at least one batch of predictions')

    res_dict = dict()

    # np.vstack is the supported spelling; np.row_stack is a deprecated alias.
    y_pred = np.vstack([batch[1] for batch in d])
    print(y_pred.shape)
    res_dict['y_pred'] = y_pred

    y_true = np.vstack([batch[2] for batch in d])
    if y_true.dtype == np.float64:
        # Convert to integers
        y_true = y_true.astype(np.int64)
    res_dict['y_true'] = y_true

    # extend() is linear overall and accepts lists and tuples of ids alike.
    pdbids = []
    for batch in d:
        ids = batch[0]
        if isinstance(ids, str):
            # A bare string is one id, not a sequence of characters.
            pdbids.append(ids)
        else:
            pdbids.extend(ids)
    res_dict['pdbids'] = pdbids

    print('Size of predicted logits: ',y_pred.shape,' number of proteins: ',len(pdbids))
    return res_dict

#### Initialize the model and load the checkpoint

In [5]:
# ESMFinetuner is already imported in the setup cell near the top of the
# notebook, so it is not imported a second time here.
params = Params()
task = ESMFinetuner(params=params)

Number of parameters being optimized:  39687697


In [6]:
# load_from_checkpoint is a classmethod that builds and returns a new module,
# so it is called on the class rather than on an existing instance.
task = ESMFinetuner.load_from_checkpoint(params.chkpt_file)
# Put the module in evaluation mode before predicting.
task.eval()

Number of parameters being optimized:  39688468


ESMFinetuner(
  (model): ESM2(
    (embed_tokens): Embedding(33, 1280, padding_idx=1)
    (layers): ModuleList(
      (0-32): 33 x TransformerLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (rot_emb): RotaryEmbedding()
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (contact_head): ContactPredictionHead(
      (regression): Linear(in_features=660, out_features=1, bias=True)
      (activation): S

#### Initialize trainer object

In [7]:
# Use the GPU when one is visible and fall back to CPU otherwise, mirroring
# the torch.cuda.is_available() guard used when seeding.
trainer = Trainer(
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    max_epochs=params.n_epoch,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Use case 1: initialize dataloader for a FASTA file

In [8]:
# Build the FASTA test loader, using the model's own batch converter as the
# collate function. Presumably reads params.fasta_file -- verify in data_loader.
dataloader = TestFASTALoader(params=params, collater=task.batch_converter)

In [9]:
# Run inference, then merge the per-batch outputs into one dict
# with keys 'y_pred', 'y_true' and 'pdbids'.
batch_outputs = trainer.predict(model=task, dataloaders=[dataloader.test_dataloader()])
predictions = aggregate_predictions(batch_outputs)

# Create the output directory on first use so open() cannot fail on a missing folder.
os.makedirs(params.output_dir, exist_ok=True)
predictions_path = '%s/test_predictions_%s.pkl' % (params.output_dir, params.suffix)
with open(predictions_path, 'wb') as fout:
    pickle.dump(predictions, fout)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
(25, 20)
Size of predicted logits:  (25, 20)  number of proteins:  25


### Use case 2: initialize dataloader for a CSV file containing sequences and labels
##### The CSV file needs to have at least these columns: PDBID, SEQUENCE, SYMM

In [10]:
# Path of the CSV metadata file for the sequence loader. Params.__init__ sets
# no default for this attribute, so it is attached to the instance here.
params.meta_data_file = '/tmp/metadata.csv'

In [11]:
# Build the CSV-backed test loader with the model's batch converter as the
# collate function. This rebinds `dataloader`, replacing the FASTA loader.
dataloader = TestSequencesLoader(params=params, collater=task.batch_converter)

In [12]:
# Run inference on the CSV sequences, then merge the per-batch outputs into
# one dict with keys 'y_pred', 'y_true' and 'pdbids'.
batch_outputs = trainer.predict(model=task, dataloaders=[dataloader.test_dataloader()])
predictions = aggregate_predictions(batch_outputs)

# NOTE: the file name depends only on params.output_dir and params.suffix, so
# this overwrites the pickle written by the FASTA use case unless params.suffix
# is changed beforehand.
os.makedirs(params.output_dir, exist_ok=True)
predictions_path = '%s/test_predictions_%s.pkl' % (params.output_dir, params.suffix)
with open(predictions_path, 'wb') as fout:
    pickle.dump(predictions, fout)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


LazyFileLoader Reading:  /tmp/metadata.csv
....with header: Index(['PDBID', 'DEPOSITION', 'RESOLUTION', 'HASH', 'CLUSTER', 'SEQUENCE',
       'SYMM'],
      dtype='object')  and length: 99
Predicting DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.90it/s]
(99, 20)
Size of predicted logits:  (99, 20)  number of proteins:  99


In [13]:
# Show which entries the aggregated result dictionary contains.
predictions.keys()

dict_keys(['y_pred', 'y_true', 'pdbids'])