## Libraries

In [1]:
#Import necessary libraries
import os
import torch
import evaluate
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader,Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config
from sklearn.metrics import average_precision_score,matthews_corrcoef,f1_score, precision_score, recall_score, balanced_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


## Example Sequence

In [2]:
#Example input sequence passed to inference() below (a single string literal; no pandas dataset is built here)
sequence='MASKSVVVLLFLALIASSAIAQAPGPAPTRSPLPSPAQPPRTAAPTPSITPTPTPTPSATPTAAPVSPPAGSPLPSSASPPAPPTSLTPDGAPVAGPTGSTPVDNNNAATLAAGSLAGFVFVASLLL'

## Subsequence breakdown (Data pre-processing)

In [3]:
def find_subsequences(sequence:str, chars:list, left=10, right=10):
    """Cut a context window around every occurrence of a target character.

    Args:
        sequence: Full input sequence to scan.
        chars: Characters of interest; each position whose character is in
            this collection yields one window.
        left: Maximum number of characters kept before the hit.
        right: Maximum number of characters kept after the hit.

    Returns:
        A list of dicts, one per hit and in order of occurrence, with keys:
        ``'Seq'`` (the window, clipped at the sequence boundaries),
        ``'Pos'`` (1-based position of the hit in ``sequence``) and
        ``'text'`` (the prompt string built from the window).
    """
    total = len(sequence)
    windows = []
    for idx in range(total):
        # Skip positions that do not hold one of the target characters.
        if sequence[idx] not in chars:
            continue
        lo = max(0, idx - left)               # clip at the start of the sequence
        hi = min(total, idx + right + 1)      # clip at the end (slice end is exclusive)
        window = sequence[lo:hi]
        windows.append({
            'Seq': window,
            'Pos': idx + 1,
            'text': f'<startoftext>SEQUENCE:{window}\nLABEL:',
        })
    return windows

## Load model | Load tokenizer | Prediction API call

In [4]:
def load_model(mdl_pth):
    """Load a GPT-2 LM-head model from ``mdl_pth`` for CPU inference.

    The configuration is read from the same location as the weights.

    NOTE(review): ``ignore_mismatched_sizes=True`` lets loading proceed even
    if some checkpoint tensors do not match the configured shapes; confirm
    this is intended for an inference-only checkpoint.

    Args:
        mdl_pth: Path or identifier accepted by ``from_pretrained``.

    Returns:
        The model moved to CPU and switched to evaluation mode.
    """
    config = GPT2Config.from_pretrained(mdl_pth)
    network = GPT2LMHeadModel.from_pretrained(
        mdl_pth,
        config=config,
        ignore_mismatched_sizes=True,
    )
    network = network.cpu()
    return network.eval()

def tokenize(sub_sequences,tokenizer):
    """Encode the prompt text of every sub-sequence record as one padded batch.

    Args:
        sub_sequences: Iterable of dicts that each carry a ``'text'`` entry
            (as produced by ``find_subsequences``).
        tokenizer: Callable tokenizer; it is invoked with the list of prompt
            strings, ``return_tensors='pt'`` and ``padding='longest'``.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    prompts = []
    for record in sub_sequences:
        prompts.append(record['text'])
    return tokenizer(prompts, return_tensors='pt', padding='longest')

def inference(input_seq,tokenizer_pth,model_pth,chars:list):
    """Predict a label for every target residue in ``input_seq``.

    A window is cut around each occurrence of a character in ``chars``,
    turned into a prompt, and the model greedily generates the text that
    follows ``LABEL:``.

    Note: the tokenizer and model are loaded from disk on every call; for
    repeated predictions consider caching them outside this function.

    Args:
        input_seq: Full sequence to analyse.
        tokenizer_pth: Path or identifier passed to ``AutoTokenizer.from_pretrained``.
        model_pth: Path or identifier passed to ``load_model``.
        chars: Characters whose positions should be classified.

    Returns:
        A dict with keys ``'Sequence'`` (the input), ``'Type'`` (``model_pth``)
        and ``'Results'``: a list of ``{position: label}`` dicts, where
        ``position`` is 1-based. ``'Results'`` is empty when none of
        ``chars`` occurs in ``input_seq``.
    """
    json_results={'Sequence':input_seq,
                'Type':model_pth,
                'Results':[]
                }

    sub_sequences=find_subsequences(input_seq,chars=chars)
    if not sub_sequences:
        # Nothing to classify: avoid loading the model and feeding an empty
        # batch to the tokenizer / generate().
        return json_results

    # Left padding keeps each prompt's final token adjacent to the newly
    # generated tokens in a batched decoder-only generation.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_pth,padding_side='left')
    model=load_model(model_pth)
    inputs_encode=tokenize(sub_sequences=sub_sequences,tokenizer=tokenizer)

    # Prefer the tokenizer's own pad id (the one used to pad the inputs);
    # fall back to the previously hard-coded id if none is configured.
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 50259

    # Greedy decoding: top_k / top_p / temperature only apply when sampling,
    # so they are omitted, and exactly one sequence is returned per prompt.
    # assumes a label fits into at most 2 new tokens — TODO confirm
    predicted=model.generate(
        inputs_encode['input_ids'],
        attention_mask=inputs_encode['attention_mask'],
        do_sample=False,
        max_new_tokens=2,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )
    predicted_text=tokenizer.batch_decode(predicted,skip_special_tokens=True)
    # The label is whatever follows the last 'LABEL:' marker in the decoded text.
    predicted_labels=[x.split('LABEL:')[-1] for x in predicted_text]

    for label,sub_seq in zip(predicted_labels,sub_sequences):
        json_results['Results'].append({sub_seq['Pos']:label})
    return json_results

In [5]:
# Run the end-to-end demo: classify every 'P' position of the example sequence
# using the tokenizer in 'Tokenizer/' and the model in 'Hydroxylation (P) sample model/'.
result = inference(sequence, 'Tokenizer/','Hydroxylation (P) sample model/',['P'])
print(result)

{'Sequence': 'MASKSVVVLLFLALIASSAIAQAPGPAPTRSPLPSPAQPPRTAAPTPSITPTPTPTPSATPTAAPVSPPAGSPLPSSASPPAPPTSLTPDGAPVAGPTGSTPVDNNNAATLAAGSLAGFVFVASLLL', 'Type': 'Hydroxylation (P) sample model/', 'Results': [{24: 'POSITIVE'}, {26: 'POSITIVE'}, {28: 'POSITIVE'}, {32: 'POSITIVE'}, {34: 'POSITIVE'}, {36: 'POSITIVE'}, {39: 'POSITIVE'}, {40: 'POSITIVE'}, {45: 'NEGATIVE'}, {47: 'NEGATIVE'}, {51: 'NEGATIVE'}, {53: 'NEGATIVE'}, {55: 'NEGATIVE'}, {57: 'NEGATIVE'}, {61: 'NEGATIVE'}, {65: 'NEGATIVE'}, {68: 'NEGATIVE'}, {69: 'NEGATIVE'}, {73: 'NEGATIVE'}, {75: 'NEGATIVE'}, {80: 'NEGATIVE'}, {81: 'NEGATIVE'}, {83: 'NEGATIVE'}, {84: 'NEGATIVE'}, {89: 'NEGATIVE'}, {93: 'NEGATIVE'}, {97: 'NEGATIVE'}, {102: 'NEGATIVE'}]}
