In [1]:
## Dependency Imports
# Compute protein descriptors
from propy import PyPro
from propy import AAComposition
from propy import CTD

# Build Sequence Object
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Read Fasta File
from pyfaidx import Fasta

# Grouping iterable
from itertools import chain

# Return file path
import glob

# Unpack Files
import json
import pickle

# Dataframes
import pandas as pd
import numpy as np

In [2]:
## Classification Model Imports:
# NOTE: pickle.load can run arbitrary code stored inside the file, so the
# .pkl artefacts below must only ever come from a trusted source.

# Scaler object whose .transform() is applied to the features before scoring
with open('Classification Dependencies/Scaler Classification.pkl', mode='rb') as fh:
    scaler_for_classification = pickle.load(fh)

# First model (stored as "SVM Linear Classification trained")
with open('Classification Dependencies/SVM Linear Classification trained.pkl', mode='rb') as fh:
    classification_model_1 = pickle.load(fh)

# Second model (stored as "SVM RBF Classification trained")
with open('Classification Dependencies/SVM RBF Classification trained.pkl', mode='rb') as fh:
    classification_model_2 = pickle.load(fh)

# Names of the feature columns that are handed to the scaler and both models
with open('Classification Dependencies/Features for Classification Model.json') as fh:
    classification_features = json.load(fh)

In [86]:
## Inference Function for .csv Files:
def inference_csv(csv, ensemble=True):
    """Score every peptide sequence listed in a CSV file.

    For each entry of the ``Seq`` column the function computes primary
    descriptors (via ``ProteinAnalysis``), amino-acid composition, CTD and
    dipeptide composition (via ``propy``), scales the columns named in
    ``classification_features`` with ``scaler_for_classification`` and
    averages the ``decision_function`` outputs of ``classification_model_1``
    and ``classification_model_2``.

    Parameters
    ----------
    csv : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``; must contain a ``Seq`` column.
    ensemble : bool, default True
        Kept for backward compatibility. It is not consulted: the score is
        always the mean of both models.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` (the sequence as read, with leading/trailing spaces
        stripped), ``Seq`` (the sanitised sequence that was actually scored)
        and ``Confidence Score``, sorted by score in descending order. The
        index is the position of each sequence among the scored sequences.
        When no usable sequence is present the frame is empty but still has
        these three columns.

    Raises
    ------
    KeyError
        If the CSV has no ``Seq`` column, or a name listed in
        ``classification_features`` was not produced as a feature.
    """
    # Only the 'Seq' column is used. Missing cells are dropped up front so a
    # blank row cannot end up being scored as the literal text "nan".
    sequences = pd.read_csv(csv)['Seq'].dropna()
    # Strip leading/trailing spaces (same pattern the notebook always used).
    sequences = sequences.replace(r"^ +| +$", r"", regex=True)

    # Ambiguous / non-standard residue codes are substituted by alanine.
    # Lowercase forms of all four codes are covered, not only 'x'.
    ambiguous_to_alanine = str.maketrans(dict.fromkeys('XxUuZzBb', 'A'))

    records = []
    for seq in sequences:
        # Make sure the sequence is a string before sanitising it
        s = str(seq).translate(ambiguous_to_alanine)
        if not s:
            # A cell that held only spaces has nothing to analyse.
            continue

        # Primary features
        analysed_seq = ProteinAnalysis(s)
        record = {
            'Name': seq,
            'Seq': s,
            # Measured on the string form; the raw cell may not be a str.
            'SeqLength': len(s),
            'Weight': analysed_seq.molecular_weight(),
            'Aromaticity': analysed_seq.aromaticity(),
            'Instability': analysed_seq.instability_index(),
            'IsoelectricPoint': analysed_seq.isoelectric_point(),
        }

        # Amino Acid Composition (AAC), Composition Transition Distribution
        # (CTD) and Dipeptide Composition (DPC). Each call returns a
        # name -> value mapping, so columns are aligned by name rather than
        # by position.
        record.update(AAComposition.CalculateAAComposition(s))
        record.update(CTD.CalculateCTD(s))
        record.update(AAComposition.CalculateDipeptideComposition(s))
        records.append(record)

    if not records:
        # Nothing to score: return an empty, correctly shaped result instead
        # of failing while looking up feature names.
        return pd.DataFrame(columns=['Name', 'Seq', 'Confidence Score'])

    # One row per sequence, one column per extracted feature
    sequence_data = pd.DataFrame(records)

    # Apply the preprocessing (scaling) step to the selected features
    sequence_data_scaled = pd.DataFrame(
        scaler_for_classification.transform(sequence_data[classification_features]),
        columns=classification_features,
    )

    # Decision-function value of each model, averaged into a single score
    model_1_score = classification_model_1.decision_function(sequence_data_scaled)
    model_2_score = classification_model_2.decision_function(sequence_data_scaled)

    result = sequence_data[['Name', 'Seq']].copy()
    result['Confidence Score'] = (model_1_score + model_2_score) / 2

    return result.sort_values(by=['Confidence Score'], ascending=False)

In [87]:
# Lift the row limit so the whole ranked table is rendered below
pd.options.display.max_rows = None

# Rank every peptide in combined_hits.csv by its averaged confidence score
sorted_peptides = inference_csv(csv='combined_hits.csv')
sorted_peptides

Unnamed: 0,Name,Seq,Confidence Score
362,DPKPHRSKQRHDNEDDNDDNNDNDDNDDNDDNDNDDNDDNDDNDDN...,DPKPHRSKQRHDNEDDNDDNNDNDDNDDNDDNDNDDNDDNDDNDDN...,1.264969
361,YGEYEEHDEYEEQGESEEHGEYAEHDDLEENGESEDYGESEEHDES...,YGEYEEHDEYEEQGESEEHGEYAEHDDLEENGESEDYGESEEHDES...,1.119958
272,GDDTSGGDNSGGDDTSGGDNSGGDDTSGGDYSGGDDTSGGDDYSGG...,GDDTSGGDNSGGDDTSGGDNSGGDDTSGGDYSGGDDTSGGDDYSGG...,1.083797
275,PTESEMEDPETAESEPDEESAEPEAEEPETEVPEAETDGEMTEPDE...,PTESEMEDPETAESEPDEESAEPEAEEPETEVPEAETDGEMTEPDE...,1.079723
87,ITYTDCTESGQNLCLCEGSNVCGNGNKCKLGSDGEENQCVTGEGTP...,ITYTDCTESGQNLCLCEGSNVCGNGNKCKLGSDGEENQCVTGEGTP...,0.999266
101,CDCGEKICLYGQSCNDGQCSGDPKPSSEFEEFEIDEEEK,CDCGEKICLYGQSCNDGQCSGDPKPSSEFEEFEIDEEEK,0.966279
83,VVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,VVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,0.948248
543,MCTETEYTCKMCNQYTDNDWEICAEGGGPACPNPQPKIVEPDPETD...,MCTETEYTCKMCNQYTDNDWEICAEGGGPACPNPQPKIVEPDPETD...,0.936942
98,VVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,VVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,0.920917
563,MRDYGQSGDERIDDRQSDNGQGDGAQRNDEQSDGEPQSDGEPQSGG...,MRDYGQSGDERIDDRQSDNGQGDGAQRNDEQSDGEPQSDGEPQSGG...,0.905575
