In [1]:
#Import necessary libraries
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback
from sklearn.metrics import average_precision_score,matthews_corrcoef,f1_score, precision_score, recall_score, balanced_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Expose only GPU 0 to this process; set the value to "-1" to run on CPU instead
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# Load the ProtGPT2 tokenizer and declare the start/end/padding special tokens
tokenizer = AutoTokenizer.from_pretrained(
    'nferruz/ProtGPT2',
    bos_token='<startoftext>',
    eos_token='<endoftext>',
    pad_token='<PAD>',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Add the two class-label words to the tokenizer vocabulary
# (the cell output is the number of tokens that were added)
tokenizer.add_tokens(['POSITIVE', 'NEGATIVE'])

2

In [5]:
#Map positive/negative labels and prepare prompt for inference
class SequenceClassificationDataset(Dataset):
    """Dataset that turns each sequence into a tokenized inference prompt.

    Every item is the prompt ``<startoftext>{sequence}\\n`` tokenized with the
    supplied tokenizer, together with the untouched label of that sequence.

    Args:
        sequences: Indexable collection of sequence strings (list, array or
            pandas Series).
        labels: Indexable collection of labels, aligned with ``sequences``.
        tokenizer: Callable accepting ``(text, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``
            tensors that carry a leading batch dimension of size 1.
    """

    def __init__(self, sequences, labels, tokenizer):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.map_label = {0: 'NEGATIVE', 1: 'POSITIVE'}

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    @staticmethod
    def _item_at(container, idx):
        """Return the element at integer position ``idx``.

        pandas objects are read through ``.iloc`` so that a filtered,
        non-contiguous index cannot cause a KeyError or fetch the wrong row.
        """
        positional = getattr(container, 'iloc', None)
        return container[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        sequence = self._item_at(self.sequences, idx)
        label = self._item_at(self.labels, idx)
        prep_txt = f'<startoftext>{sequence}\n'
        encoding = self.tokenizer(prep_txt, return_tensors='pt')
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse a one-token prompt into a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label,
        }

In [6]:
# Read the benchmark table from the CSV file in the working directory
data = pd.read_csv('benchmark.csv')

In [7]:
# Remove newline characters from the sequences (literal match, not a regex)
data['Seq'] = data['Seq'].str.replace('\n', '', regex=False)

In [8]:
# Remove '-' characters from the sequences (literal match, not a regex)
data['Seq'] = data['Seq'].str.replace('-', '', regex=False)

In [9]:
# Display the benchmark table (pandas truncates the middle rows in the output)
data

Unnamed: 0,Uniprot,Seq,Label
0,B7Z2C9_HUMAN_234,AYDSEVPSCHSIYMRQEGFLA,0
1,HDAC5_MOUSE_790,DTVWNEMHSSSAVRMAVGCLV,0
2,B3KS38_HUMAN_94,GFVFKMPWKPTHPSSTHALAE,0
3,IP3KA_MOUSE_156,RSRGNVQLETSEDVGQKSHWQ,0
4,MDC1_HUMAN_1314,KPTSRTTRSRTNMSSVKTPET,0
...,...,...,...
55703,ICP0_HHV11_67,DSASEADSTDTELFETGLLGP,1
55704,CAC1C_RABIT_1842,VRAQEAAWKLSSKRCHSQESQ,1
55705,Q9PU52_CHICK_46,LARSSSGSAFSPYPGSAAFTA,1
55706,MDM2_HUMAN_351,ISEKAKLENSTQAEEGFDVPD,1


In [10]:
# Keep only the rows whose middle character (index len // 2) is one of the
# residues listed in `check`.
# NOTE(review): the middle is taken from the current contents of `Seq`; if
# characters such as '-' were stripped beforehand, this may no longer be the
# centre of the original window -- confirm this is intended.
check = ['Y']
mask = data['Seq'].map(lambda seq: seq[len(seq) // 2] in check)
data = data.loc[mask]
data

Unnamed: 0,Uniprot,Seq,Label
86,MELK_HUMAN_88,TANKIFMVLEYCPGGELFDYI,0
243,DYRK3_MOUSE_189,GVIGGPNNGGYDDADGAYIHV,0
261,DYR1A_RAT_145,VYNDGYDDDNYDYIVKNGEKW,0
272,CDK1_MOUSE_270,GLDLLSKMLVYDPAKRISGKM,0
291,MMP14_HUMAN_497,NNQKLKVEPGYPKSALRDWMG,0
...,...,...,...
55518,CDK1_HUMAN_15,TKIEKIGEGTYGVVYKGRHKT,1
55558,MELK_HUMAN_367,LEDVTASDKNYVAGLIDYDWC,1
55589,A0A024R0M8_HUMAN_271,GSSCQLGQRIYQYIQSRFYRS,1
55625,NOXA1_MOUSE_252,TTGGHTSPGLYDSLLASRRPG,1


In [11]:
# Count how many rows carry each label value
data['Label'].value_counts()

0    441
1     40
Name: Label, dtype: int64

In [12]:
# Discard the old index so rows are numbered 0..n-1 again
# (the dataset below is addressed by integer position)
data = data.reset_index(drop=True)

In [13]:
# Sequences and their labels used for evaluation
test_texts, test_labels = data['Seq'], data['Label']

In [14]:
# Wrap the evaluation sequences/labels in the prompt-building dataset
test_dataset = SequenceClassificationDataset(
    sequences=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
)

In [15]:
# Iterate over the evaluation set one example per batch
test_data_loder = DataLoader(
    test_dataset,
    batch_size=1,
    collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [16]:
# Integer class id -> label word used in the prompts
map_label = {
    0: 'NEGATIVE',
    1: 'POSITIVE',
}

In [17]:
def get_score(mdl_path):
    """Evaluate one fine-tuned checkpoint on the benchmark data loader.

    Every batch of the module-level ``test_data_loder`` is passed to greedy
    generation (at most two new tokens). The text after the last newline of
    the decoded output is taken as the predicted label word.

    Args:
        mdl_path: Checkpoint location accepted by ``from_pretrained``.

    Returns:
        Tuple ``(f1, mcc, precision, recall, average_precision)`` computed with
        scikit-learn from the hard 0/1 labels.
    """
    # Use the GPU when one is visible, otherwise fall back to CPU so the
    # notebook's CUDA_VISIBLE_DEVICES="-1" option actually works.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_config = GPT2Config.from_pretrained(mdl_path)
    model = GPT2LMHeadModel.from_pretrained(mdl_path, config=model_config, ignore_mismatched_sizes=True)
    model = model.to(device).eval()

    actual = []
    pred = []
    for batch in test_data_loder:
        # Greedy decoding yields exactly one sequence per input. Sampling-only
        # settings (top_k / top_p / temperature) have no effect when
        # do_sample=False, so they are not passed.
        sample_outputs = model.generate(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            do_sample=False,
            max_new_tokens=2,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        predicted_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        predicted_word = predicted_text.split('\n')[-1]

        actual_word = map_label[int(batch['label'])]
        # Anything that is not exactly 'NEGATIVE' is counted as class 1.
        actual.append(0 if actual_word == 'NEGATIVE' else 1)
        pred.append(0 if predicted_word == 'NEGATIVE' else 1)

    actual = np.asarray(actual)
    pred = np.asarray(pred)
    # NOTE(review): average precision is computed from hard 0/1 predictions,
    # not from model scores -- confirm this is the intended metric.
    return (
        f1_score(actual, pred),
        matthews_corrcoef(actual, pred),
        precision_score(actual, pred),
        recall_score(actual, pred),
        average_precision_score(actual, pred),
    )

In [None]:
# Score a single checkpoint; replace the path with the best-performing checkpoint.
# The displayed tuple is (F1, MCC, precision, recall, average precision).
get_score('checkpoint-22500/')

# Evaluate every saved checkpoint on the benchmark dataset to find the best-performing one

In [19]:
# Empty container for collected results
results = []

In [None]:
#Replace the path with the output directory used during model training
results_dir = '/media/8TB_hardisk/results-Prompt3/'
# Sorted so the sweep visits checkpoints in the same order on every run
for mdl in sorted(os.listdir(results_dir)):
    mdl_path = os.path.join(results_dir, mdl)
    # Only checkpoint directories can be loaded; skip stray files whose name
    # merely contains "checkpoint".
    if 'checkpoint' not in mdl or not os.path.isdir(mdl_path):
        continue
    f1, mcc, prc, rec, avg = get_score(mdl_path)
    # Append after every checkpoint so finished results survive an interrupted run
    with open('results-Prompt3.csv', 'a') as f:
        f.write(f'{mdl},{f1},{mcc},{prc},{rec},{avg}\n')