In [1]:
# Mount Google Drive at /content/drive (Colab only). The model, organism and
# result paths used by this notebook all live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing packages, downloading necessary files and features

In [2]:
!pip install pyteomics
!pip install biopython
!pip install modlamp
!pip install propy3
!pip install PyPro

Collecting pyteomics
  Downloading pyteomics-4.6.3-py2.py3-none-any.whl (236 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/236.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/236.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m235.5/236.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.2/236.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyteomics
Successfully installed pyteomics-4.6.3
Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Collecting 

In [3]:
import csv
import re
import time

import joblib
import pandas as pd
import pyteomics.parser as parser
import sklearn
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from modlamp.database import query_database
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor
from modlamp.sequences import Helices
from propy import CTD
from propy import PyPro
from pyteomics.parser import cleave, expasy_rules
from sklearn import svm
from sklearn.metrics import mean_squared_error, make_scorer, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [5]:
#File paths (all on the mounted Google Drive)
Models_path = '/content/drive/MyDrive/TriplEP-CPP/Models/'
Organisms_path = '/content/drive/MyDrive/TriplEP-CPP/Organisms/'
Results_path = '/content/drive/MyDrive/TriplEP-CPP/Results/'

#Frequently used variables
#Training sample; purify_peptides() reads its 'Sequence' column
CPP_sample_path = '/content/drive/MyDrive/TriplEP-CPP/Training sample/df_TRAINING_SAMPLE.csv'
#animals, proteome, animals_file and animals_result are parallel lists:
#position i describes the same organism in each of them
animals = ['Spider', 'Jellyfish', 'Red ant', 'Honeybee', 'Snake', 'Leech']
#True -> CPP_search() first splits the input sequences into peptides
proteome = [True, True, False, False, False, False]
#NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
#only load model files from a trusted source
TriplEP = joblib.load(Models_path + 'TriplEP_CPP.pkl')

#Organisms: input file names, resolved relative to Organisms_path
spider_file = 'SPIDER.fasta'
jellyfish_file = 'JELLYFISH.xlsx'
red_ant_file = 'RED_ANT.xlsx'
honeybee_file = 'HONEYBEE.xlsx'
snake_file = 'SNAKE.xlsx'
leech_file = 'LEECH.fasta'
animals_file = [spider_file, jellyfish_file, red_ant_file,
                honeybee_file,snake_file, leech_file]

#Output file names, resolved relative to Results_path
spider_result = 'spider.csv'
jellyfish_result = 'jellyfish.csv'
red_ant_result = 'red_ant.csv'
honeybee_result = 'honeybee.csv'
snake_result = 'snake.csv'
leech_result = 'leech.csv'
animals_result = [spider_result, jellyfish_result, red_ant_result,
                  honeybee_result, snake_result, leech_result]

In [34]:
#File ---> Sequence List
def reading_sequences_from_file(file_name):
    """Load the unique sequences stored in one organism file.

    Parameters
    ----------
    file_name : str
        File name relative to ``Organisms_path``. The extension selects the
        reader (case-insensitively): ``xlsx`` and ``csv`` are read with pandas
        and must contain a ``'Peptide'`` column; ``fasta`` is parsed with
        ``SeqIO``.

    Returns
    -------
    pandas.Series or list of str
        De-duplicated values of the ``'Peptide'`` column (missing values
        removed) for tabular files, or a list of unique sequence strings for
        FASTA files.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    """
    supported_formats = ('xlsx', 'csv', 'fasta')
    file_format = file_name.split('.')[-1].lower()
    # Fail early with a clear message; previously an unknown extension fell
    # through every branch and ended in an UnboundLocalError.
    if file_format not in supported_formats:
        raise ValueError(
            'Unsupported file format for ' + repr(file_name)
            + '; expected one of: ' + ', '.join(supported_formats)
        )

    file_path = Organisms_path + file_name
    if file_format == 'fasta':
        with open(file_path) as file:
            return list({str(record.seq) for record in SeqIO.parse(file, "fasta")})

    read_table = pd.read_excel if file_format == 'xlsx' else pd.read_csv
    # Empty cells would arrive as NaN (a float) and break the string
    # processing applied to every peptide later on, so drop them here.
    return read_table(file_path)['Peptide'].dropna().drop_duplicates()

#Protein List ---> Protein and Peptide Dictionary + Peptide List
def proteins_for_peptides(animal_proteins):
    """Cleave every protein with every rule listed in ``expasy_rules``.

    Parameters
    ----------
    animal_proteins : iterable of str
        Protein sequences.

    Returns
    -------
    tuple (dict, list)
        A dict mapping each protein to the list of fragments obtained from all
        rules (in rule order), and a list of the unique fragments across all
        proteins.
    """
    fragments_by_protein = {}
    all_fragments = []
    for protein in animal_proteins:
        fragments = []
        for rule_name in parser.expasy_rules.keys():
            # regex=True: the rule value is passed to cleave() as a regex
            fragments.extend(cleave(protein, expasy_rules[rule_name], regex=True))
        fragments_by_protein[protein] = fragments
        all_fragments.extend(fragments)
    unique_fragments = list(set(all_fragments))
    return fragments_by_protein, unique_fragments

#Peptides ---> Peptides of a given length with a given aa composition, not previously known as CPPs, cleared of unnecessary symbols
def purify_peptides(peptides, min_len, max_len, max_per, test_file_name):
    """Clean and filter candidate peptides.

    Each peptide is stripped of every character that is not one of the 20
    standard amino-acid letters, then kept only if

    * its cleaned length lies in ``[min_len, max_len]``,
    * it is not listed in the ``'Sequence'`` column of the training sample, and
    * its most frequent amino acid makes up at most ``max_per`` percent of it.

    Parameters
    ----------
    peptides : iterable of str
        Raw peptide sequences.
    min_len, max_len : int
        Inclusive bounds on the cleaned peptide length.
    max_per : float
        Maximum allowed share (in percent) of the most frequent residue.
    test_file_name : str
        Path of a CSV file with a ``'Sequence'`` column of known peptides.

    Returns
    -------
    list of str
        Unique peptides that passed all filters (order is not defined).
    """
    # Build a set of the column *values*. The previous test
    # `pep in test_file['Sequence']` checked the Series index labels instead,
    # so training peptides were never recognised.
    known_sequences = set(pd.read_csv(test_file_name)['Sequence'])
    clean_peptides = set()
    for pep in peptides:
        pep = re.sub(r'[^NDCQEGHIMFSTWYPKARLV]', '', pep)
        # An empty peptide has no composition (and would divide by zero below).
        if not pep or not (min_len <= len(pep) <= max_len):
            continue
        if pep in known_sequences:
            print('Used in training')
            continue
        # pep now holds only the 20 standard letters, so counting characters
        # directly yields the same counts as an amino-acid composition table.
        top_count = max(pep.count(residue) for residue in set(pep))
        per = top_count / (len(pep) * 0.01)
        if per <= max_per:
            clean_peptides.add(pep)
    return list(clean_peptides)

#Auxiliary step for writing descriptors to the dictionary
def create_a_dictionary(feature, parameter):
    """Append one row of descriptor values to a column-oriented dict.

    Parameters
    ----------
    feature : dict
        Maps descriptor name -> list of values collected so far. It is
        modified in place and also returned.
    parameter : dict or 0
        Descriptor name -> value for the new row, or ``0`` to record a row of
        zeros (used when descriptor calculation failed).

    Returns
    -------
    dict
        The updated ``feature`` dict.

    Raises
    ------
    ValueError
        If ``feature`` is still empty and ``parameter`` is ``0``: the column
        names are not known yet, so a zero row cannot be created.
    """
    if not feature:
        if parameter == 0:
            # Previously this path failed with an opaque
            # "'int' object has no attribute 'keys'".
            raise ValueError(
                'Cannot add a zero row: no descriptor columns are known yet'
            )
        for key, value in parameter.items():
            feature[key] = [value]
    elif parameter == 0:
        for column in feature.values():
            column.append(0)
    else:
        for key, value in parameter.items():
            feature[key].append(value)
    return feature

#List of filtered peptides ---> Descriptors
def count_descriptors(peptides):
    """Compute the descriptor table for a list of peptides.

    For every peptide the CTD descriptors are obtained from
    ``PyPro.GetProDes(pep).GetCTD()``; only the columns listed in
    ``desc_names`` are kept. An ``InstabilityInd`` column computed by
    ``GlobalDescriptor.instability_index()`` is appended.

    A peptide whose CTD calculation raises ``ZeroDivisionError`` gets a row of
    zeros, so the rows stay aligned with ``peptides``.

    Parameters
    ----------
    peptides : list of str
        Filtered peptide sequences.

    Returns
    -------
    pandas.DataFrame or dict
        One row per peptide. If no CTD descriptors could be computed at all
        (for example ``peptides`` is empty), a message is printed and an empty
        dict is returned.
    """
    desc_names = ['_ChargeD1025', '_ChargeD1075', '_ChargeD1100',
                  '_ChargeD3001', '_ChargeD3100',
                  '_NormalizedVDWVD1001', '_NormalizedVDWVD2001',
                  '_NormalizedVDWVD2025', '_NormalizedVDWVD2075',
                  '_PolarityD1075', '_PolarityD3025',
                  '_PolarizabilityD1100',
                  '_SecondaryStrD2025', '_SecondaryStrD2075', '_SecondaryStrD2100',
                  '_SecondaryStrD3025', '_SecondaryStrD3100',
                  '_SolventAccessibilityD3025', '_SolventAccessibilityD3100']
    # Named ctd_columns (not CTD) so the imported propy.CTD module is not shadowed.
    ctd_columns = {}
    # Failures seen before the first success: the column names are unknown at
    # that point, so the zero rows are back-filled once they become known.
    leading_failures = 0
    for pep in peptides:
        DesObject = PyPro.GetProDes(pep)
        try:
            ctd = DesObject.GetCTD()
        except ZeroDivisionError:
            ctd = 0
        if not ctd_columns:
            if ctd == 0:
                leading_failures += 1
                continue
            if leading_failures:
                ctd_columns = {key: [0] * leading_failures for key in ctd}
        ctd_columns = create_a_dictionary(ctd_columns, ctd)
    if not ctd_columns:
        print('Something went wrong')
        return ctd_columns
    df_desc = pd.DataFrame(ctd_columns)[desc_names].copy()
    AMP = GlobalDescriptor(peptides)
    AMP.instability_index()
    i_index = pd.DataFrame({"InstabilityInd": AMP.descriptor.flatten().tolist()})
    df_desc = pd.concat([df_desc, i_index], axis = 1)
    return df_desc

#List of peptides with descriptors ---> List of peptides with results of algorithms ---> Write results to file
def launch_ML_search(peptides, descriptors, name_to_save):
    """Score peptides with the TriplEP model and save the result table.

    Parameters
    ----------
    peptides : list of str
        Peptide sequences, in the same order as the rows of ``descriptors``.
    descriptors : pandas.DataFrame
        Descriptor table, one row per peptide.
    name_to_save : str
        Output CSV file name, relative to ``Results_path``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Peptide'``, ``'nonCPP'`` and ``'CPP'``; probabilities are
        rounded to three decimals. The same table is written to the CSV file.
    """
    # NOTE(review): the scaler is fitted on the peptides being scored, not on
    # the data the model was trained on - confirm this matches how TriplEP was
    # trained, or load the training scaler instead.
    scaler = StandardScaler()
    scal_desc = scaler.fit_transform(descriptors)
    df_peptides = pd.DataFrame(peptides).rename(columns={0: 'Peptide'})
    df_tep = pd.DataFrame(TriplEP.predict_proba(scal_desc)).round(3)
    print(df_tep[:1])
    # assumes predict_proba column 0 is the nonCPP class and column 1 the CPP
    # class - TODO confirm against TriplEP.classes_
    df_animal = pd.concat([df_peptides, df_tep], axis = 1).rename(columns={1: 'CPP', 0: 'nonCPP'})
    # Only the full table is written. The earlier write of the CPP >= 0.5
    # subset went to the same path and was immediately overwritten by this one.
    df_animal.to_csv(Results_path + name_to_save)
    return df_animal

#List of peptides with algorithm results ---> List of candidate CPPs
def predict_CPP(df_animal, limit):
    """Select the peptides predicted to be cell-penetrating.

    Parameters
    ----------
    df_animal : pandas.DataFrame
        Result table with a ``'CPP'`` probability column.
    limit : float
        Minimum CPP probability (inclusive) for a peptide to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_animal`` whose ``'CPP'`` value is at least ``limit``.
    """
    start_time = time.time()
    # One mask drives both the printed count and the returned rows. Before,
    # the count used `>` while the filter used `>=`, so the two disagreed
    # whenever a probability was exactly equal to the limit.
    selected = df_animal['CPP'] >= limit
    df_animal = df_animal[selected]
    print('TriplEP-CPP \t' + str(len(df_animal)))
    print('Penetrating peptides are predicted for' + time_to_do(start_time, time.time()))
    return df_animal

#List of peptides with algorithm results ---> List of candidate nonCPPs
def predict_nonCPP(df_animal, limit):
    """Select the peptides predicted to be non-penetrating.

    Parameters
    ----------
    df_animal : pandas.DataFrame
        Result table with a ``'nonCPP'`` probability column.
    limit : float
        Minimum nonCPP probability (inclusive) for a peptide to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_animal`` whose ``'nonCPP'`` value is at least ``limit``.
    """
    start_time = time.time()
    # One mask drives both the printed count and the returned rows. Before,
    # the count used `>` while the filter used `>=`, so the two disagreed
    # whenever a probability was exactly equal to the limit.
    selected = df_animal['nonCPP'] >= limit
    df_animal = df_animal[selected]
    print('TriplEP-CPP \t' + str(len(df_animal)))
    print('Non-penetrating peptides are predicted for' + time_to_do(start_time, time.time()))
    return df_animal

#Function execution time
def time_to_do(start, end):
    """Format an elapsed time as a tab-prefixed ``'<m> min <s> sec'`` string.

    Parameters
    ----------
    start, end : float
        Timestamps in seconds, e.g. values returned by ``time.time()``.

    Returns
    -------
    str
        For example ``'\\t2 min 5 sec'``.
    """
    # Round the whole duration first and split afterwards. Rounding only the
    # remainder could produce "0 min 60 sec" (e.g. for 59.6 seconds).
    total_seconds = int(round(end - start))
    minutes, seconds = divmod(total_seconds, 60)
    return '\t' + str(minutes) + ' min ' + str(seconds) + ' sec'

#Starting with a single function
def CPP_search(file_name, flag, min_len, max_len, max_per, CPP_sample_path, name_to_save):
    """Run the whole pipeline for one organism file and report step timings.

    Steps: read the sequences, optionally split them into peptides, filter the
    peptides, compute descriptors, score them and save the results.

    Parameters
    ----------
    file_name : str
        Input file, passed to ``reading_sequences_from_file``.
    flag : bool
        If true, the sequences are first split by ``proteins_for_peptides``;
        otherwise they are used as peptides directly.
    min_len, max_len, max_per, CPP_sample_path
        Forwarded to ``purify_peptides``.
    name_to_save : str
        Output file name, forwarded to ``launch_ML_search``.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``launch_ML_search``.
    """
    def report(message, started):
        # Print the step message followed by the time elapsed since `started`.
        print(message + time_to_do(started, time.time()))

    started = time.time()
    sequences = reading_sequences_from_file(file_name)
    report('Read in', started)

    if not flag:
        candidates = sequences
    else:
        started = time.time()
        # Only the flat peptide list is needed; the per-protein dict is ignored.
        _per_protein, candidates = proteins_for_peptides(sequences)
        report('Proteins split into peptides in', started)

    started = time.time()
    filtered = purify_peptides(candidates, min_len, max_len, max_per, CPP_sample_path)
    report('The peptides are purified for', started)

    started = time.time()
    descriptor_table = count_descriptors(filtered)
    report('Descriptors counted for', started)

    started = time.time()
    scored = launch_ML_search(filtered, descriptor_table, name_to_save)
    report('The results of the algorithms are obtained for', started)

    return scored

# Search for new cell-penetrating peptides

In [23]:
# Restrict the run to four organisms by rebinding the parallel lists from the
# configuration cell.
# NOTE(review): Spider and Jellyfish are dropped here, so a search loop that
# iterates over these lists will not create df_animals['Spider'] or
# df_animals['Jellyfish'], which later cells read - confirm where those
# results are supposed to come from.
animals = ['Red ant', 'Honeybee', 'Snake', 'Leech']
proteome = [False, False, False, False]
animals_file = [red_ant_file,
                honeybee_file,snake_file, leech_file]

animals_result = [red_ant_result,
                  honeybee_result, snake_result, leech_result]

In [None]:
min_len = 9     #Minimum peptide length
max_len = 35    #Maximum peptide length
max_per = 30    #Maximum share (%) of a peptide taken by its most frequent amino acid
#Result tables keyed by organism name.
#NOTE(review): the dict is rebuilt from scratch on every run of this cell, so
#results from an earlier run with a different organism list are discarded.
df_animals = dict()
organisms = zip(animals, animals_file, animals_result, proteome)
for animal, animal_file, animal_result, prot in organisms:
    print('\n\t', animal)
    df_an = CPP_search(animal_file, prot, min_len, max_len, max_per, CPP_sample_path, animal_result)
    df_animals[animal] = df_an

In [None]:
# Count the CSV rows of every saved results file and the overall total.
al = 0  # running total over all result files
for anim in animals_result:
    # newline='' is the documented way to open a file for the csv module, so
    # that line breaks inside quoted fields are handled by the reader itself.
    with open(Results_path + anim, newline='') as r_file:
        print(anim)
        # Every row yielded by the reader is counted; no row is skipped.
        count = sum(1 for row in csv.reader(r_file))
        al += count
        print(f'Total {count} lines in the file.')

print(f'Total {al} lines.')

## Mygalomorph spider Hadronyche infensa

In [None]:
# Spider peptides whose predicted CPP probability is at least 0.95.
# NOTE(review): requires df_animals['Spider'], which exists only if 'Spider'
# was in `animals` when the search loop ran.
predict_CPP(df_animals['Spider'], 0.95)

XGB model 	80357
KNN model 	30444
RF model 	12
XGB + KNN + RFM 	11
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
59403,TSYNAALVNGGGVKMPDR,0.004,0.996,0.0,1.0,0.047,0.953
252989,QPGCVWILTSTASMGHKAMSAKNSAE,0.01,0.99,0.0,1.0,0.047,0.953
289068,NSPICPALSGKGAPDPDAEDAR,0.006,0.994,0.0,1.0,0.044,0.956
399730,WSPQIGSMLTNSYRPLAEHGR,0.002,0.998,0.0,1.0,0.047,0.953
400888,SQLSTLGNLGGSPKDQADR,0.006,0.994,0.0,1.0,0.047,0.953
415797,SHCRPPHGAEGHR,0.001,0.999,0.0,1.0,0.044,0.956
523602,CHSVSMGLTNQKVHSDR,0.002,0.998,0.0,1.0,0.041,0.959
537441,PQICSINTDAMKSRL,0.004,0.996,0.0,1.0,0.044,0.956
576375,PQPRVTGLAESAGK,0.005,0.995,0.0,1.0,0.047,0.953
824290,SWQGSAVVSIDSKRARAAS,0.003,0.997,0.0,1.0,0.037,0.963


In [None]:
# Look up the prediction for one specific spider peptide. A limit of 0.0 keeps
# every row. The previous limit `0&.0` was a typo and a syntax error.
pred = predict_CPP(df_animals['Spider'], 0.0)
pred[pred['Peptide'] == 'CRYFHYRQKKHWQL']

KNN model 	1075083
XGB model 	1119698
RF model 	1119699
KNN + XGB + RFM 	1119699
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
843269,CRYFHYRQKKHWQL,0.469,0.531,0.222,0.778,0.39,0.61


## Jellyfish Rhopilema esculentum and Sanderia malayensis

In [None]:
# Jellyfish peptides whose predicted CPP probability is at least 0.95.
# NOTE(review): requires df_animals['Jellyfish'], which exists only if
# 'Jellyfish' was in `animals` when the search loop ran.
predict_CPP(df_animals['Jellyfish'], 0.95)

KNN model 	31136
XGB model 	81667
RF model 	6
KNN + XGB + RFM 	5
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
69067,SYQWQIFYRSLDGSGAKE,0.005,0.995,0.0,1.0,0.041,0.959
422503,CQNTQVNISNQHRPAKMDGK,0.005,0.995,0.0,1.0,0.041,0.959
633258,SHSLGKAPDGSGR,0.002,0.998,0.0,1.0,0.034,0.966
643661,TVANFKTNSAAPPAAEPPR,0.003,0.997,0.0,1.0,0.044,0.956
716405,TNHNYVRFHHSHHQQDDGDGK,0.004,0.996,0.0,1.0,0.034,0.966


## Red ant Manica rubida

In [None]:
# Red ant peptides whose predicted CPP probability is at least 0.95.
predict_CPP(df_animals['Red ant'], 0.95)

XGB model 	197
KNN model 	45
RF model 	4
XGB + KNN + RFM 	4
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
1342,HTGSLLAVVSLLLLKLVAPAAAEVLAGGKLP,0.004,0.996,0.0,1.0,0.112,0.888
1481,PTPAQKALTMLTMLLALLPVPACLEAEKYG,0.01,0.99,0.0,1.0,0.108,0.892
2546,PKGVTGAAAAPVVKLLKAAVAPDPLGKAPQ,0.002,0.998,0.0,1.0,0.146,0.854
2643,PQAVLFVLLKLLLKVAPAAAEVKGHS,0.002,0.998,0.0,1.0,0.051,0.949


In [None]:
# Look up the prediction for one specific red ant peptide; a limit of 0.0
# keeps every row so the peptide can be found whatever its score.
pred = predict_CPP(df_animals['Red ant'], 0.0)
#pred[pred['Peptide'] == 'KHLKHTPVWWY']
pred[pred['Peptide'] == 'RGSLLAKAALKRS']

KNN model 	2657
XGB model 	2760
RF model 	2760
KNN + XGB + RFM 	2760
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
1326,RGSLLAKAALKRS,0.609,0.391,0.333,0.667,0.471,0.529


## Honey bee Apis mellifera

In [None]:
# Honeybee peptides whose predicted CPP probability is at least 0.95.
predict_CPP(df_animals['Honeybee'], 0.95)

XGB model 	32
KNN model 	6
RF model 	17
XGB + KNN + RFM 	2
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
4,CLHYTVDKSKPK,0.007,0.993,0.0,1.0,0.203,0.797
358,SVCPPQLLVFDLNTSQLLK,0.002,0.998,0.0,1.0,0.224,0.776


## King cobra Ophiophagus hannah.

In [None]:
# Snake peptides whose predicted CPP probability is at least 0.90.
predict_CPP(df_animals['Snake'], 0.90)

KNN model 	14
XGB model 	19
RF model 	8
KNN + XGB + RFM 	2
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
35,RAVTIFGESAGAASVGMHLLSTQSRA,0.015,0.985,0.056,0.944,0.2,0.8
248,KTWHMVYPGGYDHTRG,0.002,0.998,0.0,1.0,0.19,0.81


In [None]:
# Look up the prediction for one specific snake peptide; a limit of 0.0 keeps
# every row so the peptide can be found whatever its score.
pred = predict_CPP(df_animals['Snake'], 0.0)
pred[pred['Peptide'] == 'REKDLLPRK']

KNN model 	238
XGB model 	252
RF model 	252
KNN + XGB + RFM 	252
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
115,REKDLLPRK,0.972,0.028,0.944,0.056,0.671,0.329


## Medicinal leech Hirudo medicinalis

In [None]:
# Leech peptides whose predicted CPP probability is at least 0.90.
# The key must match the name used in `animals` ('Leech'); the lower-case
# 'leech' raised a KeyError.
predict_CPP(df_animals['Leech'], 0.90)

XGB model 	24
KNN model 	17
RF model 	14
XGB + KNN + RFM 	3
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
35,AIRNDEELNK,0.023,0.977,0.111,0.889,0.2,0.8
140,SKAADESER,0.008,0.992,0.056,0.944,0.18,0.82
264,ALVVDNGSGMCKAGFAGDDAPR,0.007,0.993,0.111,0.889,0.183,0.817


In [None]:
#nonCPP
# The key must match the name used in `animals` ('Leech'); the lower-case
# 'leech' raised a KeyError.
# NOTE(review): a limit of 0.0 returns every row - confirm the intended threshold.
predict_nonCPP(df_animals['Leech'], 0.0)

XGB model 	18
KNN model 	9
RF model 	54
XGB + KNN + RFM 	2
Непроникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
9,DLLSGVLGGVDN,0.96,0.04,1.0,0.0,0.715,0.285
27,DLLSGVLGGVDDLASLDVAG,0.986,0.014,1.0,0.0,0.698,0.302


In [None]:
# Look up the prediction for one specific leech peptide. `070` is not a valid
# Python 3 literal; 0.0 is used as in the other lookup cells so that every row
# is kept. The key is 'Leech', matching the name used in `animals`.
pred = predict_CPP(df_animals['Leech'], 0.0)
pred[pred['Peptide'] == 'HYNKRSTIT']

KNN model 	303
XGB model 	312
RF model 	312
KNN + XGB + RFM 	312
Проникающие пептиды предсказаны за	0 min 0 sec


Unnamed: 0,Peptide,nonCPP_XGB,CPP_XGB,nonCPP_KNN,CPP_KNN,nonCPP_RFM,CPP_RFM
94,HYNKRSTIT,0.951,0.049,1.0,0.0,0.644,0.356
