In [None]:
#Import necessary libraries
import os
import torch
import evaluate
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback

In [None]:
# Expose only GPU "0" to CUDA; set the value to "-1" instead to run on CPU.
visible_gpus = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpus

In [2]:
# Load the labelled sequences from 'training.csv' (path is relative to the
# working directory). The displayed frame has the columns 'Unnamed: 0', 'Seq' and 'Label'.
data=pd.read_csv('training.csv')

In [3]:
# Display the freshly loaded DataFrame as the cell output
data

Unnamed: 0,Seq,Label
0,RTPPPSQGKGRGLSLSRFSWG,1
1,TSLALDESLFRGRQIKVIPKR,1
2,TNRPGISTTDRGFPRARYRAR,1
3,GISTTDRGFPRARYRARTTNY,1
4,TDRGFPRARYRARTTNYNSSR,1
...,...,...
126029,LMEHHSSHGGRKRYACQGCWK,0
126030,KEKSYALGGARGPQPSTREAQ,0
126031,GGARGPQPSTREAQAGARAGG,0
126032,PSTREAQAGARAGGPPESVEG,0


In [4]:
# Count the rows per Label value (1 = positive, 0 = negative) to see how imbalanced the classes are
data['Label'].value_counts()

0    118466
1      7568
Name: Label, dtype: int64

In [5]:
# Strip '-' characters and embedded newlines from every sequence
seq_without_dashes = data['Seq'].str.replace('-','')
data['Seq'] = seq_without_dashes.str.replace('\n','')

In [6]:
# Function to extract parts of the sequence
def extract_seq_parts(seq):
    """Split a sequence into its leading, central and trailing parts.

    Args:
        seq: the sequence string to split.

    Returns:
        A 3-tuple ``(first_ten, center, last_ten)`` where ``first_ten`` is
        ``seq[:10]``, ``last_ten`` is ``seq[-10:]`` and ``center`` is the
        middle character for odd-length input or ``''`` for even-length input.
    """
    length = len(seq)
    if length % 2 == 0:
        # Even-length sequences have no single middle character.
        middle = ''
    else:
        middle = seq[length // 2]
    return seq[:10], middle, seq[-10:]

# Apply extract_seq_parts to every sequence and store the three parts as new columns.
# The parts are collected as plain tuples and turned into one DataFrame, instead of
# constructing a pd.Series per row inside apply(), which is very slow on >100k rows.
seq_part_columns = ['First-Ten', 'Center', 'Last-Ten']
data[seq_part_columns] = pd.DataFrame(
    [extract_seq_parts(seq) for seq in data['Seq']],
    index=data.index,  # keep row alignment with `data`
    columns=seq_part_columns,
)

In [7]:
# Display the frame to verify the new 'First-Ten', 'Center' and 'Last-Ten' columns
data

Unnamed: 0,Seq,Label,First-Ten,Center,Last-Ten
0,RTPPPSQGKGRGLSLSRFSWG,1,RTPPPSQGKG,R,GLSLSRFSWG
1,TSLALDESLFRGRQIKVIPKR,1,TSLALDESLF,R,GRQIKVIPKR
2,TNRPGISTTDRGFPRARYRAR,1,TNRPGISTTD,R,GFPRARYRAR
3,GISTTDRGFPRARYRARTTNY,1,GISTTDRGFP,R,ARYRARTTNY
4,TDRGFPRARYRARTTNYNSSR,1,TDRGFPRARY,R,ARTTNYNSSR
...,...,...,...,...,...
126029,LMEHHSSHGGRKRYACQGCWK,0,LMEHHSSHGG,R,KRYACQGCWK
126030,KEKSYALGGARGPQPSTREAQ,0,KEKSYALGGA,R,GPQPSTREAQ
126031,GGARGPQPSTREAQAGARAGG,0,GGARGPQPST,R,EAQAGARAGG
126032,PSTREAQAGARAGGPPESVEG,0,PSTREAQAGA,R,AGGPPESVEG


In [8]:
# Keep only the rows whose central amino acid is 'R'
center_is_r = data['Center'] == 'R'
data = data[center_is_r]

In [9]:
# Display the frame after keeping only the rows with Center == 'R'
data

Unnamed: 0,Seq,Label,First-Ten,Center,Last-Ten
0,RTPPPSQGKGRGLSLSRFSWG,1,RTPPPSQGKG,R,GLSLSRFSWG
1,TSLALDESLFRGRQIKVIPKR,1,TSLALDESLF,R,GRQIKVIPKR
2,TNRPGISTTDRGFPRARYRAR,1,TNRPGISTTD,R,GFPRARYRAR
3,GISTTDRGFPRARYRARTTNY,1,GISTTDRGFP,R,ARYRARTTNY
4,TDRGFPRARYRARTTNYNSSR,1,TDRGFPRARY,R,ARTTNYNSSR
...,...,...,...,...,...
126028,GPGADEPGLSRGKPYACGECG,0,GPGADEPGLS,R,GKPYACGECG
126029,LMEHHSSHGGRKRYACQGCWK,0,LMEHHSSHGG,R,KRYACQGCWK
126030,KEKSYALGGARGPQPSTREAQ,0,KEKSYALGGA,R,GPQPSTREAQ
126031,GGARGPQPSTREAQAGARAGG,0,GGARGPQPST,R,EAQAGARAGG


In [10]:
# Renumber the rows 0..n-1 after filtering so positional and label lookups agree.
# Rebinding (rather than inplace=True) keeps the cell idempotent and avoids
# mutating a frame that was produced by boolean filtering.
data = data.reset_index(drop=True)

In [11]:
# Load the ProtGPT2 tokenizer and override its BOS / EOS / PAD special tokens
special_token_overrides = {
    'bos_token': '<startoftext>',
    'eos_token': '<endoftext>',
    'pad_token': '<PAD>',
}
tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2', **special_token_overrides)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Register the two class-label words as additional vocabulary entries;
# the call returns the number of tokens that were actually added.
label_tokens = ['POSITIVE', 'NEGATIVE']
tokenizer.add_tokens(label_tokens)

2

In [13]:
# Show the tokenizer's special-token mapping to confirm the custom BOS / EOS / PAD tokens
tokenizer.special_tokens_map

{'bos_token': '<startoftext>',
 'eos_token': '<endoftext>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<PAD>'}

In [15]:
#Map positive/negative labels and prepare prompt for training
class SequenceClassificationDataset(Dataset):
    """Prompt-formatted dataset for fine-tuning a causal LM on labelled sequences.

    Every item is the tokenization of the text
    ``<startoftext>{first_ten} {center} {last_ten}`` + newline +
    ``{POSITIVE|NEGATIVE}<endoftext>``, returned with ``labels`` equal to
    ``input_ids`` (language-modelling objective over the whole prompt).

    Args:
        first_ten: indexable collection of the leading sequence parts.
        center: indexable collection of the central residues.
        last_ten: indexable collection of the trailing sequence parts.
        labels: indexable collection of integer labels (1 or 0).
        tokenizer: callable used as ``tokenizer(text, return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        dtype: free-form tag for the split (e.g. ``'Train'``); stored on the
            instance, not otherwise used by this class.
    """

    def __init__(self,first_ten, center, last_ten, labels, tokenizer,dtype='Train'):
        self.first_ten = first_ten
        self.center = center
        self.last_ten = last_ten
        self.labels = labels
        self.tokenizer = tokenizer
        self.map_label={1:'POSITIVE',0:'NEGATIVE'}
        # Fix: store the caller-supplied tag (it was hard-coded to 'Train' before).
        self.dtype = dtype

    def __len__(self):
        """Return the number of examples."""
        return len(self.first_ten)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` for pandas objects and plain sequences."""
        # pandas `series[idx]` is label-based and fails on a non-default index;
        # `.iloc` is always positional, which is what a Dataset index means.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Build and tokenize the prompt for example ``idx``.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
            each has the leading batch dimension removed.

        Raises:
            KeyError: if the label is not 0 or 1.
        """
        first_ten = self._item_at(self.first_ten, idx)
        center = self._item_at(self.center, idx)
        last_ten = self._item_at(self.last_ten, idx)
        label = self._item_at(self.labels, idx)
        prep_txt1= f'<startoftext>{first_ten} {center} {last_ten}\n{self.map_label[label]}<endoftext>'
        encoding1 = self.tokenizer(prep_txt1,return_tensors='pt')
        # The tokenizer output is expected to be (1, seq_len) for a single string;
        # squeeze only dim 0 so a one-token sequence is not collapsed to a 0-d tensor.
        input_ids = encoding1['input_ids'].squeeze(0)
        attention_mask = encoding1['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # Independent copy: labels must not share storage with input_ids.
            'labels': input_ids.clone()
        }

In [18]:
# Pull out the prompt parts and the labels as separate, freshly 0-indexed Series
first_train_texts, center_train_texts, last_train_texts, train_labels = (
    data[column].reset_index(drop=True)
    for column in ('First-Ten', 'Center', 'Last-Ten', 'Label')
)

In [19]:
# Wrap the training columns in the prompt-building dataset
train_dataset = SequenceClassificationDataset(
    first_ten=first_train_texts,
    center=center_train_texts,
    last_ten=last_train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    dtype='Train',
)

In [21]:
# Load only the configuration of the pre-trained ProtGPT2 checkpoint;
# the model weights themselves are loaded in a later cell using this config.
model_config = GPT2Config.from_pretrained('nferruz/ProtGPT2')

In [22]:
# Hyper-parameters for the fine-tuning run.
# NOTE(review): output_dir is a machine-specific absolute path — adjust before running elsewhere.
training_args = TrainingArguments(
    output_dir='/media/8TB_hardisk/results-Prompt2/',  # output directory
    num_train_epochs=200,  # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='logs/',  # directory for the training logs
    save_steps=500,  # checkpoint interval, in steps
    logging_steps=500,  # logging interval, in steps (the missing comma here was a SyntaxError)
    save_total_limit=10,  # no. of models to save in the output directory
)

In [23]:
# Load the pre-trained ProtGPT2 weights with the config loaded earlier, then
# resize the token embeddings to the tokenizer's current vocabulary size.
model = GPT2LMHeadModel.from_pretrained(
    'nferruz/ProtGPT2',
    config=model_config,
    ignore_mismatched_sizes=True,
)
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(50262, 1280)

In [24]:
# Collator that pads every batch to the length of its longest example
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding='longest')

# Trainer wiring: model, hyper-parameters, training data and batch collation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=batch_collator,
)

In [None]:
# Start fine-tuning with the settings held in training_args
trainer.train()