In [1]:
# Importing Dependencies
import argparse
import csv

# Write to a log file
import logging
import os
import sys
import json
from operator import add

# Plotter
import matplotlib.pyplot as plt
import numpy as np

# Standard libraries
import pandas as pd

# Model Persistence
from joblib import dump, load

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Models
from sklearn.linear_model import Lasso

# Metrics
from sklearn.metrics import (accuracy_score, make_scorer, matthews_corrcoef,
                             mean_squared_error)

# Preprocessing
from sklearn.model_selection import (GridSearchCV, cross_val_score, StratifiedKFold,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier

## Packages to use the .fasta file.
# Compute protein descriptors
from propy import PyPro
from propy import AAComposition
from propy import CTD

# Build Sequence Object
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Read Fasta File
from pyfaidx import Fasta

# Grouping iterable
from itertools import chain

# Return file path
import glob

## Global Variables
# Notebook-wide configuration. None of these names is referenced in the cells
# visible alongside this one, so the notes below are inferred from the names
# alone -- TODO confirm against the cells that actually use them.

# Working directory at the moment this cell is executed.
PATH = os.getcwd()
# Presumably the number of cross-validation folds -- TODO confirm.
FOLDS = 5
# Presumably the random seed / `random_state` shared by the models -- TODO confirm.
RAND = 42
# Purpose not visible from here -- TODO document what this cut-off applies to.
THRESHOLD = 0.01
# Presumably human-readable labels for the classifier and regressor -- TODO confirm.
CLF_NAME = 'SVC with RBF Kernel'
REG_NAME = 'SVR with RBF Kernel'

In [3]:
# Element-wise sum of two equally long lists (scratch check).
[left + right for left, right in zip([1, 3, 5], [3, 5, 7])]

[4, 8, 12]

In [1]:
def inferenceSingleSeqence(seq):
    """Build the one-row descriptor table for a single peptide sequence.

    The input is converted to a string and the residue codes ``X``, ``x``,
    ``U``, ``Z`` and ``B`` are replaced by ``A``. The cleaned sequence is then
    passed to Biopython's ``ProteinAnalysis`` and to propy's
    ``AAComposition`` / ``CTD`` calculators. This function only extracts
    features: it takes no model, applies no preprocessing and makes no
    prediction.

    Parameters
    ----------
    seq : str, or any object supporting ``str()`` and ``len()``
        The raw peptide sequence.

    Returns
    -------
    pandas.DataFrame
        A single-row frame. The first ten columns are ``Name`` (the input
        exactly as given), ``Seq`` (the cleaned sequence), ``SeqLength``
        (``len(seq)``), ``Weight``, ``Aromaticity``, ``Instability``,
        ``IsoelectricPoint``, ``Helix``, ``Turn`` and ``Sheet``. They are
        followed by one column per key returned by
        ``CalculateAAComposition``, ``CalculateCTD`` and
        ``CalculateDipeptideComposition``, in that order.
    """
    # Map the ambiguous / non-standard residue codes to alanine in one pass
    # (presumably because the descriptor libraries do not accept them).
    cleaned = str(seq).translate(str.maketrans('XxUZB', 'AAAAA'))

    # Whole-sequence physicochemical properties; order matches `columns` below.
    analysis = ProteinAnalysis(cleaned)
    values = [
        seq,
        cleaned,
        len(seq),
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.isoelectric_point(),
    ]
    # The three fractions are labelled Helix, Turn, Sheet in that order.
    values.extend(analysis.secondary_structure_fraction())

    columns = ['Name', 'Seq', 'SeqLength', 'Weight', 'Aromaticity',
               'Instability', 'IsoelectricPoint', 'Helix', 'Turn', 'Sheet']

    # Amino Acid Composition (AAC), Composition/Transition/Distribution (CTD)
    # and Dipeptide Composition (DPC). Each calculator returns a mapping of
    # descriptor name -> value; keys and values are appended pairwise so the
    # columns stay aligned with the row.
    descriptor_sets = (
        AAComposition.CalculateAAComposition(cleaned),
        CTD.CalculateCTD(cleaned),
        AAComposition.CalculateDipeptideComposition(cleaned),
    )
    for descriptors in descriptor_sets:
        columns.extend(descriptors.keys())
        values.extend(descriptors.values())

    return pd.DataFrame([values], columns=columns)

In [5]:
# Importing the test data
def test_data():
    """
    Import the full test dataset from the current path.

    Parameters
    ----------
    None

    Returns
    -------
    x_test: DataFrame containing the test dataset.
    """
    # Import, format, and drop duplicates.
    peptide_sequences = pd.read_csv('combined_hits.csv')
    peptide_sequences = peptide_sequences.replace(r"^ +| +$", r"", regex=True)
    name_index = peptide_sequences.columns.get_loc('Seq')
    peptide_sequences.rename(columns={'Seq':'Name'}, inplace=True)
    peptide_sequences = peptide_sequences.drop_duplicates(subset=['Name'])

    # Create a dataframe for the extracted features for the peptide sequences.
    df = pd.DataFrame()
    for i in range(len(peptide_sequences)):
        df = pd.concat([df, inferenceSingleSeqence(peptide_sequences.iloc[i][name_index])])
    df = df.drop(columns=['Seq','Helix','Turn','Sheet'])

    return df

In [8]:
# Import the hits, strip leading/trailing spaces from string cells, and keep
# the first occurrence of each sequence.
peptide_sequences = pd.read_csv('combined_hits.csv')
peptide_sequences = peptide_sequences.replace(r"^ +| +$", r"", regex=True)
# Column position of the sequences, recorded before the column is renamed.
name_index = peptide_sequences.columns.get_loc('Seq')
peptide_sequences = peptide_sequences.rename(columns={'Seq': 'Name'})
peptide_sequences = peptide_sequences.drop_duplicates(subset=['Name'])

# Extract the features of the first peptide sequence only.
# `.iloc[0, name_index]` is a purely positional lookup; the earlier
# `.iloc[0][name_index]` indexed a label-indexed row Series with an integer,
# which pandas deprecates.
data = inferenceSingleSeqence(peptide_sequences.iloc[0, name_index])
data = data.drop(columns=['Seq', 'Helix', 'Turn', 'Sheet'])