In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES']='0'

import argparse
import torch
import esm
from data_loader import TestSequencesLoader, TestFASTALoader
import random
import numpy as np
import pickle

In [2]:
from lightning.pytorch import LightningModule
from lightning.pytorch import Trainer
from finetune import ESMFinetuner

seed = 12306
# Seed every RNG the notebook has imported (Python, NumPy, PyTorch), not only
# `random`, so repeated runs start from the same state.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Query the active device only when CUDA is present:
# torch.cuda.current_device() raises on a CPU-only machine.
if torch.cuda.is_available():
    print('cuda is available')
    print(torch.cuda.current_device())
else:
    print('cuda is not available')

# optional, if you have an A100 gpu
#torch.set_float32_matmul_precision('high')

Current device: cuda
cuda is available
0


### Initialize parameters that need to be set for the model to load and predict
##### 1. Change the "chkpt_file" parameter here to the full absolute path to the location where the model file is saved
##### 2. Change the "output_dir" parameter to point to the full absolute path where you want the results saved. If you have an experiment specific suffix to add, modify the "suffix" parameter.
##### 3. Change the batch size, "bs", based on what fits in your GPU memory

In [3]:
class Params:
    """Plain settings holder passed to the model and to the data loaders.

    Edit the values in ``defaults`` before running the notebook, in particular:
      * ``chkpt_file`` - location of the saved model checkpoint,
      * ``output_dir`` / ``suffix`` - where results go and how files are tagged,
      * ``bs`` - batch size.

    The remaining fields are consumed by project code that is not part of this
    notebook (presumably the fine-tuning module and the loaders - verify there
    before changing them).
    """

    def __init__(self):
        # Insertion order matches the attribute order users see via vars(params).
        defaults = {
            "test_mode": True,
            "n_classes": 20,
            "granularity": 3,
            "bs": 64,
            "output_dir": "./outputs",
            "suffix": "seq2symm",
            "n_epoch": 1,
            "num_layers_frozen": 31,
            "weighted_sampler": False,
            "use_soft_labels": False,
            "chkpt_file": "../../models/ESM2_model.ckpt",
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)

#### Define a function that aggregates predictions across batches / multiple GPUs

In [4]:
def aggregate_predictions(d):
    """Merge per-batch prediction outputs into a single result dictionary.

    Args:
        d: sequence of per-batch results. Each entry is indexed positionally:
            ``entry[0]`` is a list of protein identifiers, ``entry[1]`` the
            predicted logits for the batch and ``entry[2]`` the true labels.
            Entries 1 and 2 must be stackable row-wise (array-likes with the
            same number of columns).

    Returns:
        dict with three keys:
            ``'y_pred'``  - all logits stacked row-wise into one array,
            ``'y_true'``  - all labels stacked row-wise (float64 labels are
                            cast to int64),
            ``'pdbids'``  - the identifier lists concatenated in batch order.

    Raises:
        ValueError: if ``d`` contains no batches.
    """
    if len(d) == 0:
        raise ValueError("aggregate_predictions received no prediction batches")

    # np.vstack is the supported spelling; the np.row_stack alias is deprecated.
    y_pred = np.vstack([batch[1] for batch in d])
    print(y_pred.shape)

    y_true = np.vstack([batch[2] for batch in d])
    if y_true.dtype == np.float64:
        # Convert to integers
        y_true = y_true.astype(np.int64)

    # extend() keeps concatenation linear in the total number of ids
    # (repeated `list + list` copies the accumulated list every time).
    pdbids = []
    for batch in d:
        pdbids.extend(batch[0])

    print('Size of predicted logits: ', y_pred.shape, ' number of proteins: ', len(pdbids))
    return {'y_pred': y_pred, 'y_true': y_true, 'pdbids': pdbids}

#### Initialize the model, load model

In [5]:
from finetune import ESMFinetuner

# Build the run configuration and construct the fine-tuning module from it.
# Note: `task` is re-assigned from the checkpoint in the following cell.
params = Params()
task = ESMFinetuner(params=params)

In [6]:
# load_from_checkpoint is a classmethod that builds and returns a new module, so
# call it on the class (Lightning 2.x is believed to raise a TypeError when it
# is invoked on an instance).
task = ESMFinetuner.load_from_checkpoint(params.chkpt_file)
# Put the module in evaluation mode; assigning the result also keeps the
# notebook from echoing the module repr.
task = task.eval()

#### Initialize trainer object

In [7]:
# Trainer used for the trainer.predict(...) calls below; accelerator="cuda" means a CUDA device is required.
trainer = Trainer(accelerator="cuda", max_epochs=params.n_epoch)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Usecase 1: initialize dataloader for a FASTA file

In [8]:
## Set the "fasta_file" parameter to the path of your input FASTA file
## (an absolute path is recommended; the example value below is a relative path)
params.fasta_file = "all_seqs.fasta"

## Create the data loader; the model's batch converter is passed in as the collater
dataloader = TestFASTALoader(params=params, collater=task.batch_converter)

In [18]:
# Run inference, then merge the per-batch outputs into one dict
# with keys 'y_pred', 'y_true' and 'pdbids'.
predictions = trainer.predict(model=task, dataloaders=[dataloader.test_dataloader()])
predictions = aggregate_predictions(predictions)

## get probabilities if needed
probs = torch.sigmoid(torch.tensor(predictions['y_pred']))
predictions['probabilities'] = probs

## save predictions to a pickle
# Create the output directory first so open() does not fail when it is missing.
os.makedirs(params.output_dir, exist_ok=True)
predictions_pkl_path = '%s/test_predictions_%s.pkl' % (params.output_dir, params.suffix)
with open(predictions_pkl_path, 'wb') as fout:
    pickle.dump(predictions, fout)


You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


LazyFileLoader Reading:  metadata.csv
....with header: Index(['PDBID', 'SEQUENCE', 'SYMM'], dtype='object')  and length: 151


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 3/3 [00:07<00:00,  2.38s/it]
(151, 20)
Size of predicted logits:  (151, 20)  number of proteins:  151


#### save predictions to a text file

In [19]:
## set up label map for 20 classes: 17 classes + 3 coarse-grained classes that indicate
# high order C symmetry CX, high order D symmetry DX or other high order symmetry

symm_to_label_map = {
    'C1': 0, 'C2': 1, 'C3': 2, 'C4': 3, 'C5': 4, 'C6': 5,
    'C7': 6, 'C8': 6, 'C9': 6,
    'C10': 7, 'C11': 7, 'C12': 7, 'C13': 7, 'C14': 7, 'C15': 7, 'C16': 7, 'C17': 7,
    'D2': 8, 'D3': 9, 'D4': 10, 'D5': 11,
    'D6': 12, 'D7': 12, 'D8': 12, 'D9': 12, 'D10': 12, 'D11': 12, 'D12': 12,
    'H': 13, 'O': 14, 'T': 15, 'I': 16,
}
symm_to_label_map['CX'] = 17
symm_to_label_map['DX'] = 18
symm_to_label_map['HOTI'] = 19

# Invert the map. Labels shared by several symmetries get a "first-last" range
# name derived from the forward map itself, so the display names cannot drift
# from the actual grouping (label 6 covers C7..C9; C6 has its own label 5).
_symms_by_label = {}
for _symm, _label in symm_to_label_map.items():
    _symms_by_label.setdefault(_label, []).append(_symm)

label_to_symm_map = {
    label: symms[0] if len(symms) == 1 else f"{symms[0]}-{symms[-1]}"
    for label, symms in _symms_by_label.items()
}

In [20]:
# get probabilities for each class (note: this is multi-label classification, so the sum of all probabilities for a protein will not be equal to 1)
probs = torch.sigmoid(torch.tensor(predictions['y_pred']))

num_examples, num_classes = probs.shape

# A class is reported for a protein when its probability reaches this cut-off.
prob_threshold = 0.5
positive_mask = (probs >= prob_threshold).numpy()

# One string per protein: " <symmetry>:<probability>" for every positive class,
# in increasing class-index order (empty string when no class passes the cut-off).
positive_class_strings = [
    ''.join(
        " %s:%.3f" % (label_to_symm_map[int(class_idx)], probs[i, int(class_idx)].item())
        for class_idx in np.flatnonzero(positive_mask[i])
    )
    for i in range(num_examples)
]

## save to a text file
# Create the output directory first so open() does not fail when it is missing.
os.makedirs(params.output_dir, exist_ok=True)
with open('%s/predictions_%s.txt' % (params.output_dir, params.suffix), 'w') as fout:
    for (protid, labels) in zip(predictions['pdbids'], positive_class_strings):
        fout.write(f"{protid},{labels}\n")
        
    

### Usecase 2: Initialize dataloader for a CSV file with sequence and labels in it
##### the CSV file needs to have at least these columns:  PDBID, SEQUENCE, SYMM

In [21]:
## path to your CSV file (an absolute path is recommended; the example value below is a relative path)
params.meta_data_file = 'metadata.csv'

In [22]:
# Data loader for the CSV use case; note this re-binds `dataloader`, replacing the FASTA loader created earlier.
dataloader = TestSequencesLoader(params=params, collater=task.batch_converter)

In [23]:
# Run inference on the CSV sequences, then merge the per-batch outputs into one
# dict with keys 'y_pred', 'y_true' and 'pdbids'.
predictions = trainer.predict(model=task, dataloaders=[dataloader.test_dataloader()])
predictions = aggregate_predictions(predictions)

## get probabilities if needed
probs = torch.sigmoid(torch.tensor(predictions['y_pred']))
predictions['probabilities'] = probs

# Create the output directory first so open() does not fail when it is missing.
# NOTE: this is the same file path the FASTA use case writes to, so running both
# use cases overwrites the earlier pickle; change params.suffix to keep both.
os.makedirs(params.output_dir, exist_ok=True)
predictions_pkl_path = '%s/test_predictions_%s.pkl' % (params.output_dir, params.suffix)
with open(predictions_pkl_path, 'wb') as fout:
    pickle.dump(predictions, fout)


You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


LazyFileLoader Reading:  metadata.csv
....with header: Index(['PDBID', 'SEQUENCE', 'SYMM'], dtype='object')  and length: 151


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 3/3 [00:07<00:00,  2.38s/it]
(151, 20)
Size of predicted logits:  (151, 20)  number of proteins:  151


In [24]:
# Show which entries the aggregated prediction dict contains.
predictions.keys()

dict_keys(['y_pred', 'y_true', 'pdbids', 'probabilities'])