In [None]:
#Import necessary libraries
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback
from sklearn.metrics import average_precision_score,matthews_corrcoef,f1_score, precision_score, recall_score, balanced_accuracy_score

In [None]:
#Expose only GPU 0 to this process; specify "-1" instead to run on CPU
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
#Load the ProtGPT2 tokenizer and register the start/end/padding tokens used by the prompts
tokenizer = AutoTokenizer.from_pretrained(
    'nferruz/ProtGPT2',
    bos_token='<startoftext>',
    eos_token='<endoftext>',
    pad_token='<PAD>',
)

In [3]:
#Add the two class-label words as extra vocabulary entries (the cell output is the number of tokens added)
tokenizer.add_tokens(['POSITIVE','NEGATIVE'])

2

In [5]:
#Map positive/negative labels and prepare prompt for inference
class SequenceClassificationDataset(Dataset):
    """Dataset that turns each sequence into a tokenized inference prompt.

    Every item is the prompt ``<startoftext>`` + sequence + newline, tokenized
    with the supplied tokenizer, together with the untouched label for that
    position.

    Args:
        sequences: Indexable collection of sequence strings.
        labels: Indexable collection of labels, aligned with ``sequences``.
        tokenizer: Callable accepting ``(text, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, sequences, labels, tokenizer):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        # Integer class id -> label word
        self.map_label={0:'NEGATIVE',1:'POSITIVE'}

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        sequence = self.sequences[idx]
        label = self.labels[idx]
        prep_txt= f'<startoftext>{sequence}\n'
        encoding = self.tokenizer(prep_txt,return_tensors='pt')
        # squeeze(0) removes only the leading batch axis; a bare squeeze() would
        # also collapse a one-token prompt into a 0-dim tensor that cannot be padded
        return  {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }

In [6]:
#Load the benchmark table; later cells use its 'Seq-51' (sequence window) and 'Label' (0/1) columns
data=pd.read_csv('benchmark.csv')

In [7]:
#Keep only the windows whose central residue is one of the chosen amino acids
check = ['Y']
mask = data['Seq-51'].apply(lambda s: s[len(s)//2] in check)
#.copy() makes the subset an independent frame, so the column assignments in
#the following cells do not write into a slice (SettingWithCopyWarning)
data = data[mask].copy()
data

Unnamed: 0,Uniprot,Res,Sequence,Label,Seq-51
1861,HIPK2_HUMAN,675,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,0,PFQQALIVCPPGFQGLQASPSKHAGYSVRMENAVPIVTQAPGAQPL...
1862,HIPK2_HUMAN,285,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,0,FVRAYECFQHKNHTCLVFEMLEQNLYDFLKQNKFSPLPLKYIRPVL...
1864,HIPK2_HUMAN,558,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,0,LDFPHSTHVKSCFQNMEICKRRVNMYDTVNQSKTPFITHVAPSTST...
1865,HIPK2_HUMAN,1061,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,0,QPLNLSQAQQHITTDRTGSHRRQQAYITPTMAQAPYSFPHNSPSHG...
1871,HIPK2_HUMAN,264,MAPVYEGMASHVQVFSPHTLQSSAFCSVKKLKIEPSSNWDMTGYGS...,0,QGQIEVSILARLSTESADDYNFVRAYECFQHKNHTCLVFEMLEQNL...
...,...,...,...,...,...
53749,SYUB_HUMAN,39,MDVFMKGLSMAKEGVVAAAEKTKQGVTEAAEKTKEGVLYVGSKTRE...,0,GVVAAAEKTKQGVTEAAEKTKEGVLYVGSKTREGVVQGVASVAEKT...
53750,SYUB_HUMAN,119,MDVFMKGLSMAKEGVVAAAEKTKQGVTEAAEKTKEGVLYVGSKTRE...,0,KPEEVAQEAAEEPLIEPLMEPEGESYEDPPQEEYQEYEPEA-----...
54035,B4DDL9_HUMAN,209,MENFQKVERIGEGTYGVVYKARNKLTGEVVALKKIRLDTLLDVIHT...,0,ARQDFSKVVPPLDEDGRSLLSQTLHYDPNKRISAKAALAHPFFQDV...
54036,B4DDL9_HUMAN,19,MENFQKVERIGEGTYGVVYKARNKLTGEVVALKKIRLDTLLDVIHT...,1,-------MENFQKVERIGEGTYGVVYKARNKLTGEVVALKKIRLDT...


In [8]:
#Remove any newline characters embedded in the sequence windows
data['Seq-51'] = data['Seq-51'].str.replace(pat='\n', repl='')

In [9]:
#Remove the '-' characters seen at the ends of some windows in the table above
data['Seq-51'] = data['Seq-51'].str.replace(pat='-', repl='')

In [10]:
#Class balance of the filtered benchmark (the output shows far more 0 labels than 1 labels)
data['Label'].value_counts()

0    431
1     37
Name: Label, dtype: int64

In [11]:
#Renumber rows 0..n-1: the filtered frame kept its original row labels, and the
#Dataset looks items up with self.sequences[idx] / self.labels[idx]
data = data.reset_index(drop=True)

In [12]:
#Sequence windows are the model inputs; the 0/1 labels are the ground truth
test_texts, test_labels = data['Seq-51'], data['Label']

In [13]:
#Wrap the benchmark sequences and labels in the prompt-building dataset
test_dataset = SequenceClassificationDataset(
    sequences=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
)

In [14]:
#Serve one prompt per batch; the collator assembles each item into a batch
test_data_loder = DataLoader(
    test_dataset,
    batch_size=1,
    collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [15]:
#Class id -> label word, listed in id order (0 then 1)
map_label = dict(enumerate(['NEGATIVE', 'POSITIVE']))

In [16]:
def get_score(mdl_path):
    """Score one fine-tuned checkpoint on the notebook-level benchmark loader.

    Loads the GPT-2 LM-head model stored at ``mdl_path``, greedily generates up
    to two new tokens after every prompt yielded by ``test_data_loder``, and
    takes the text after the last newline of the decoded output as the
    predicted label word.

    Args:
        mdl_path: Checkpoint location accepted by ``from_pretrained``.

    Returns:
        Tuple ``(f1, mcc, precision, recall, average_precision)`` computed from
        the true labels and the hard 0/1 predictions.

    Note:
        Any predicted text other than exactly ``'NEGATIVE'`` is counted as a
        positive prediction.
    """
    # Fall back to CPU when no GPU is visible (e.g. CUDA_VISIBLE_DEVICES="-1")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_config = GPT2Config.from_pretrained(mdl_path)
    model = GPT2LMHeadModel.from_pretrained(mdl_path,config=model_config,ignore_mismatched_sizes=True)
    model = model.to(device).eval()

    predictions = []
    for x in test_data_loder:
        # Greedy decoding (do_sample=False) returns exactly one sequence per
        # prompt, so num_return_sequences is 1; the sampling-only knobs
        # (top_k / top_p / temperature) have no effect here and are omitted.
        sample_outputs = model.generate(
            x['input_ids'].to(device),
            attention_mask=x['attention_mask'].to(device),
            do_sample=False,
            max_new_tokens=2,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        predicted_text = tokenizer.decode(sample_outputs[0],skip_special_tokens=True)
        # The prompt ends with a newline, so the last line holds the generated text
        predictions.append([map_label[int(x['label'])], predicted_text.split('\n')[-1]])

    # Column 0: true class, column 1: predicted class (0 only for 'NEGATIVE')
    labels = np.asanyarray([[0 if word=='NEGATIVE' else 1 for word in pair] for pair in predictions])
    actual = labels[:,0]
    pred = labels[:,1]
    return (
        f1_score(actual,pred),
        matthews_corrcoef(actual,pred),
        precision_score(actual,pred),
        recall_score(actual, pred),
        average_precision_score(actual,pred),
    )

In [None]:
#Score a single checkpoint; replace the path with the best-performing checkpoint directory
#The result is the tuple (F1, MCC, precision, recall, average precision)
get_score('checkpoint-22500/')

# Find the best-performing checkpoint on the benchmark dataset

In [21]:
#NOTE(review): this list is not used by any visible cell; the loop below appends its scores to a CSV file instead
results=[]

In [None]:
#Replace the path with the output directory used during model training
results_dir='/media/8TB_hardisk/results-Prompt4/'
#sorted() makes the evaluation order (and the CSV row order) reproducible
for mdl in sorted(os.listdir(results_dir)):
    mdl_path=os.path.join(results_dir,mdl)
    #Only checkpoint directories can be loaded; skip stray files whose name contains 'checkpoint'
    if 'checkpoint' in mdl and os.path.isdir(mdl_path):
        f1,mcc,prc,rec,avg=get_score(mdl_path)
        #Append after every checkpoint so finished scores survive an interruption
        #Row format: checkpoint name, F1, MCC, precision, recall, average precision
        with open('results-Prompt4.csv','a') as f:
            f.write(f'{mdl},{f1},{mcc},{prc},{rec},{avg}\n')