In [1]:
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback
import torch
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import DataLoader,Dataset
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
import os
import torch.nn.functional as F
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data=pd.read_csv('/home/jeevan/succi/Glutathionylation/cd_hit_data-Glutathionylation.csv')

In [3]:
data

Unnamed: 0,Seq,Label
0,AYAREEFASTCPDDEEIELAY,1
1,YEVFGRVQGVCFRMYAEDEAR,1
2,AWIGYEHTSFCGQQFILERGE,1
3,IERLMSFRPICSANHKESKMT,1
4,EAPFTKFDPSCLFPACRDYWT,1
...,...,...
15282,PAKDTSSLESCLQTELHLCTE,0
15283,ESCLQTELHLCTEQPEKEDMT,0
15284,IERQPSSVSICFESLTTDLEH,0
15285,IHNTVKIGPDCKEALPDLPSP,0


In [4]:
data['Label'].value_counts()

0    12134
1     3153
Name: Label, dtype: int64

In [5]:
data['Seq']=data['Seq'].str.replace('-','')
data['Seq']=data['Seq'].str.replace('\n','')

In [6]:
data

Unnamed: 0,Seq,Label
0,AYAREEFASTCPDDEEIELAY,1
1,YEVFGRVQGVCFRMYAEDEAR,1
2,AWIGYEHTSFCGQQFILERGE,1
3,IERLMSFRPICSANHKESKMT,1
4,EAPFTKFDPSCLFPACRDYWT,1
...,...,...
15282,PAKDTSSLESCLQTELHLCTE,0
15283,ESCLQTELHLCTEQPEKEDMT,0
15284,IERQPSSVSICFESLTTDLEH,0
15285,IHNTVKIGPDCKEALPDLPSP,0


In [7]:
benchmark = pd.read_csv('benchmark-unique.csv')
benchmark

Unnamed: 0,Uniprot,Seq,Label
0,RPC5_HUMAN_693,QEVDKVLKDCCVSYGGMWYLK,0
1,LRP1_MOUSE_4237,YTGDKCELDQCWEYCHNGGTC,0
2,STAB1_MOUSE_909,DGRICVAIDECGLDTRGGCHA,0
3,FLNC_HUMAN_2454,VHTPSGAVEECYVSELDSDKH,0
4,LAS1L_HUMAN_173,MPHINDCRRGCYFVLDWLQKT,0
...,...,...,...
2233,DHAR1_ARATH_20,AVGAPDHLGDCPFSQRALLTL,1
2234,HNRPF_HUMAN_290,EFTVQSTTGHCVHMRGLPYKA,1
2235,HS90B_MOUSE_589,TISNRLVSSPCCIVTSTYGWT,1
2236,RYR2_MOUSE_822,KFLPPPGYAACYEAVLPKEKL,1


In [8]:
benchmark['Seq']=benchmark['Seq'].str.replace('-','')
benchmark['Seq']=benchmark['Seq'].str.replace('\n','')

common_seqs = data['Seq'].isin(benchmark['Seq'])
data = data[common_seqs]
data

Unnamed: 0,Seq,Label


In [7]:
# tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2')
tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2',bos_token='<startoftext>',eos_token='<endoftext>',pad_token='<PAD>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
tokenizer.add_tokens(['SEQUENCE:','LABEL:','POSITIVE','NEGATIVE'])

4

In [9]:
tokenizer.special_tokens_map

{'bos_token': '<startoftext>',
 'eos_token': '<endoftext>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<PAD>'}

In [10]:
tokenizer.encode('<|endoftext|>ESAKTIVDSGKLPSSLLSYFV1<|endoftext|>',add_special_tokens=True)

[0, 44285, 9903, 277, 1047, 265, 570, 44795, 17, 0]

In [11]:
class SequenceClassificationDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer,dtype='Train'):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.map_label={1:'POSITIVE',0:'NEGATIVE'}
        self.dtype='Train'
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        prep_txt1= f'<startoftext>SEQUENCE:{sequence}\nLABEL:{self.map_label[label]}<endoftext>'
        prep_txt2= f'<startoftext>SEQUENCE:{sequence}\nLABEL:<endoftext>'
        encoding1 = self.tokenizer(prep_txt1,return_tensors='pt')
        encoding2 = self.tokenizer(prep_txt2,return_tensors='pt')
        return {
            'input_ids': encoding1['input_ids'].squeeze(), 
            'attention_mask': encoding1['attention_mask'].squeeze(), 
            'labels': encoding1['input_ids'].squeeze()
        } if self.dtype=='Train' else {
            'input_ids': encoding2['input_ids'].squeeze(), 
            'attention_mask': encoding2['attention_mask'].squeeze(), 
            'labels': encoding1['input_ids'].squeeze()
        }

In [12]:
train_texts=data['Seq'].reset_index(drop=True)
train_labels=data['Label'].reset_index(drop=True)

In [13]:
train_dataset=SequenceClassificationDataset(train_texts,train_labels,tokenizer,'Train')

In [14]:
model_config = GPT2Config.from_pretrained('nferruz/ProtGPT2')

In [15]:
training_args = TrainingArguments(
    output_dir='/media/8TB_hardisk/results-glutarylation-dbptm/',  # output directory
    num_train_epochs=200,  # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='logs/',
    save_steps=500,
    # eval_steps=500,
    logging_steps=500,
    # evaluation_strategy='steps',
    # save_total_limit=1,
    # metric_for_best_model='eval_loss',
    # load_best_model_at_end=True
)

In [16]:
import torch

In [17]:
model = GPT2LMHeadModel.from_pretrained('nferruz/ProtGPT2',config=model_config,ignore_mismatched_sizes=True)
model.resize_token_embeddings(len(tokenizer))

Embedding(50264, 1280)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=test_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,padding='longest'),
    # callbacks=[early_stop],
    # model_init=model_init
)

In [19]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,5.9416
1000,2.6865
1500,1.2299
2000,0.8623
2500,0.7823
3000,0.744
3500,0.7269
4000,0.7169
4500,0.7106
5000,0.7068


TrainOutput(global_step=5600, training_loss=1.4243123463221958, metrics={'train_runtime': 21061.9244, 'train_samples_per_second': 33.435, 'train_steps_per_second': 0.266, 'total_flos': 4.49621025667584e+16, 'train_loss': 1.4243123463221958, 'epoch': 200.0})