In [1]:
#Import necessary libraries
import os
import torch
import evaluate
import numpy as np
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback
from sklearn.metrics import average_precision_score,matthews_corrcoef,f1_score, precision_score, recall_score, balanced_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Expose only GPU 0 to CUDA; change the value to '-1' to run on CPU instead.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
# Load the ProtGPT2 tokenizer and register explicit BOS / EOS / PAD tokens.
tokenizer = AutoTokenizer.from_pretrained(
    'nferruz/ProtGPT2',
    bos_token='<startoftext>',
    eos_token='<endoftext>',
    pad_token='<PAD>',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Register the prompt keywords and the two class names as extra vocabulary entries.
# The call returns how many tokens were newly added.
tokenizer.add_tokens(
    [
        'SEQUENCE:',
        'LABEL:',
        'POSITIVE',
        'NEGATIVE',
    ]
)

4

In [4]:
#Map positive/negative labels and prepare prompt for inference
class SequenceClassificationDataset(Dataset):
    """Serve sequences as tokenized inference prompts together with their labels.

    Every item is built from the prompt ``<startoftext>SEQUENCE:{sequence}``
    followed by a newline and ``LABEL:``, i.e. the prompt stops right where the
    model is expected to generate the class name.

    Args:
        sequences: Indexable collection of sequence strings (list, array or
            pandas Series).
        labels: Indexable collection of labels aligned with ``sequences``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``
            that returns a mapping with ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, sequences, labels, tokenizer):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        # Not used inside this class; kept as a public attribute for callers.
        self.map_label={0:'NEGATIVE',1:'POSITIVE'}

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    @staticmethod
    def _item_at(container, position):
        """Return the element at integer ``position`` of ``container``.

        A DataLoader asks for positions ``0..len-1``. Plain ``series[idx]`` on a
        pandas Series is label-based and fails once the index is no longer
        ``0..n-1`` (e.g. after filtering or a shuffle split), so ``.iloc`` is
        used whenever the container provides it.
        """
        if hasattr(container, 'iloc'):
            return container.iloc[position]
        return container[position]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` (both 1-D) and ``label`` for item ``idx``."""
        sequence = self._item_at(self.sequences, idx)
        label = self._item_at(self.labels, idx)
        prep_txt= f'<startoftext>SEQUENCE:{sequence}\nLABEL:'
        encoding = self.tokenizer(prep_txt,return_tensors='pt')
        return  {
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse a one-token encoding to a 0-d tensor.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }

In [5]:
# Read the benchmark table; later cells use its 'Seq' and 'Label' columns.
data = pd.read_csv(filepath_or_buffer='benchmark.csv')

In [6]:
# Remove embedded line breaks from the sequences. regex=False makes this a literal
# replacement regardless of the installed pandas version's default for `regex`.
data['Seq']=data['Seq'].str.replace('\n','',regex=False)

In [7]:
# Remove '-' characters from the sequences. regex=False makes this a literal
# replacement regardless of the installed pandas version's default for `regex`.
data['Seq']=data['Seq'].str.replace('-','',regex=False)

In [8]:
# Display the cleaned benchmark frame (the notebook renders a truncated preview).
data

Unnamed: 0,Uniprot,Seq,Label
0,RPC5_HUMAN_693,QEVDKVLKDCCVSYGGMWYLK,0
1,LRP1_MOUSE_4237,YTGDKCELDQCWEYCHNGGTC,0
2,STAB1_MOUSE_909,DGRICVAIDECGLDTRGGCHA,0
3,FLNC_HUMAN_2454,VHTPSGAVEECYVSELDSDKH,0
4,LAS1L_HUMAN_173,MPHINDCRRGCYFVLDWLQKT,0
...,...,...,...
2233,DHAR1_ARATH_20,AVGAPDHLGDCPFSQRALLTL,1
2234,HNRPF_HUMAN_290,EFTVQSTTGHCVHMRGLPYKA,1
2235,HS90B_MOUSE_589,TISNRLVSSPCCIVTSTYGWT,1
2236,RYR2_MOUSE_822,KFLPPPGYAACYEAVLPKEKL,1


In [9]:
# Count the rows per class to check the label balance of the benchmark.
data['Label'].value_counts()

0    1736
1     502
Name: Label, dtype: int64

In [14]:
# The whole benchmark is used as the evaluation set: sequences and their labels.
test_texts, test_labels = data['Seq'], data['Label']

In [15]:
# Wrap the benchmark sequences as tokenized prompts.
test_dataset = SequenceClassificationDataset(
    sequences=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
)

In [16]:
# One prompt per batch, collated with DataCollatorForSeq2Seq.
test_data_loder = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [14]:
# Class id -> class name as it appears after 'LABEL:' in the prompt format.
map_label = dict(enumerate(('NEGATIVE', 'POSITIVE')))

In [None]:
def _label_from_generated_text(predicted_text):
    """Turn decoded model output into a class id (0 = NEGATIVE, 1 = POSITIVE).

    Only the text after the last 'LABEL:' is inspected. Surrounding whitespace is
    ignored and a prefix match is used, so 'NEGATIVE' followed by a second
    generated token is still read as negative. Anything that does not start with
    'NEGATIVE' is scored as positive, as in the original exact-match rule.
    """
    answer = predicted_text.split('LABEL:')[-1].strip()
    return 0 if answer.startswith('NEGATIVE') else 1


def get_score(mdl_path):
    """Evaluate one checkpoint on the module-level ``test_data_loder``.

    The checkpoint is loaded as a ``GPT2LMHeadModel``. For every prompt the model
    greedily generates up to two new tokens, and the text after 'LABEL:' is
    mapped to a class id. Relies on the notebook globals ``test_data_loder`` and
    ``tokenizer``.

    Args:
        mdl_path: Directory (or identifier) of the checkpoint to load.

    Returns:
        Tuple ``(f1, mcc, precision, recall, average_precision)``. Note that
        average precision is computed from hard 0/1 predictions, not from scores.

    Raises:
        KeyError: If a batch carries a label other than 0 or 1.
    """
    # Run on the GPU when one is visible, otherwise on the CPU, so the
    # CUDA_VISIBLE_DEVICES='-1' option offered by the notebook actually works.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_config = GPT2Config.from_pretrained(mdl_path)
    # NOTE(review): ignore_mismatched_sizes=True lets loading succeed even when a
    # weight shape differs from the config; confirm that no layer is silently
    # re-initialised for the checkpoints being scored.
    model = GPT2LMHeadModel.from_pretrained(mdl_path,config=model_config,ignore_mismatched_sizes=True)
    model = model.to(device).eval()

    actual = []
    pred = []
    for batch in test_data_loder:
        true_label = int(batch['label'])
        if true_label not in (0, 1):
            raise KeyError(true_label)

        # Greedy decoding: sampling-only knobs (top_k / top_p / temperature) have
        # no effect with do_sample=False and are omitted; exactly one sequence is
        # returned per prompt.
        generated_ids = model.generate(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            do_sample=False,
            max_new_tokens=2,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        # batch_size is 1, so the first row is the only generated sequence.
        predicted_text = tokenizer.decode(generated_ids[0],skip_special_tokens=True)

        actual.append(true_label)
        pred.append(_label_from_generated_text(predicted_text))

    actual = np.asarray(actual)
    pred = np.asarray(pred)
    return f1_score(actual,pred),matthews_corrcoef(actual,pred), precision_score(actual,pred), recall_score(actual, pred), average_precision_score(actual,pred)

In [None]:
# Score a single checkpoint; replace the path with the best-performing checkpoint.
get_score(mdl_path='checkpoint-22500/')

# Find the best-performing checkpoint on the benchmark dataset

In [18]:
# Empty accumulator; the sweep cell below writes its rows straight to a CSV file instead.
results = list()

In [None]:
#Replace the path with the output directory used during model training
results_dir='/media/8TB_hardisk/results-Prompt1/'

def _checkpoint_step(name):
    """Sort key: (step parsed from the text after the last '-', name); step is -1 if not numeric."""
    suffix=name.rsplit('-',1)[-1]
    return (int(suffix) if suffix.isdigit() else -1, name)

# Keep only checkpoint *directories*; stray files whose name contains
# 'checkpoint' cannot be loaded as a model.
checkpoint_dirs=[
    name for name in os.listdir(results_dir)
    if 'checkpoint' in name and os.path.isdir(os.path.join(results_dir,name))
]

# os.listdir returns entries in arbitrary order; sort by training step so the
# CSV rows come out in a reproducible order.
for mdl in sorted(checkpoint_dirs,key=_checkpoint_step):
    mdl_path=os.path.join(results_dir,mdl)
    f1,mcc,prc,rec,avg=get_score(mdl_path)
    # Append after every checkpoint so finished rows survive an interrupted run.
    # Columns: checkpoint, f1, mcc, precision, recall, average precision.
    with open('results-Prompt1.csv','a') as f:
        f.write(f'{mdl},{f1},{mcc},{prc},{rec},{avg}\n')