In [159]:
#generate in-library peptide sequences
import pandas as pd
import numpy as np
import os
from pandas import DataFrame
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import logomaker
import random
from sequence_to_smiles import *

In [2]:
# Linker SMILES fragment passed to getCrosslinkedSeq below
# (Kong et al. Nature biomedical engineering 2020)
linker_7 = 'CC(Cn)=O'

# Amino-acid table from the spreadsheet, indexed by its '1 letter' column.
aa_df = pd.read_excel('AA_smiles_dict.xlsx').set_index('1 letter')

# Lookup: index label ('1 letter') -> value of the 'full smiles' column.
aa_dict = {
    letter: full_smiles
    for letter, full_smiles in zip(aa_df.index, aa_df['full smiles'])
}

In [10]:
# Amino acids available for the randomized positions: every index label of
# aa_df except 'Z' and 'C' (getPeptide inserts the 'C' residues itself).
excluded_codes = ['Z','C']
AAs = set(aa_df.drop(index=excluded_codes).index.values)
AAs

{'A',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [154]:
def getPeptide(AAs,m=None,n=None,o=None,n_seq=1) -> list:
    '''
    getPeptide: return cyclic peptide(s) like Kong2020, laid out as
        X - C - (m residues) - C - (n residues) - C - (o residues) - C - X
    where every X / loop residue is drawn from AAs.
    Inputs:
        AAs: collection of amino acids to be used
        m: int, number of residues in the first loop
        n: int, number of residues in the second loop
        o: int, number of residues in the third loop
           Leave m, n and o all as None to randomize the loop lengths for
           every sequence (each loop >= 1 residue, 3 <= m+n+o <= 8).
        n_seq: int, # of sequences to generate
    Outputs:
        list of n_seq randomized peptide sequences (str)
    Raises:
        ValueError: if only some of m, n, o are given, or (from
            random.sample) if a loop is longer than the amino-acid pool.
    Note:
        Residues inside one loop are drawn with random.sample, i.e. without
        replacement, so a single loop never repeats an amino acid.
    '''
    given = [length is not None for length in (m, n, o)]
    # Always defined, so explicit loop lengths no longer hit an unbound local.
    length_random = not any(given)
    if not length_random and not all(given):
        raise ValueError('m, n and o must be given together, or all left as None')

    # random.sample() rejects sets on Python >= 3.11, so draw from a list.
    # Sorting also makes the draws repeatable under random.seed(), which set
    # iteration order is not.
    pool = sorted(AAs, key=str)

    peptides = []
    for _ in range(n_seq):
        if length_random:
            total_length = random.randint(3,8)
            # Each bound leaves at least one residue for the remaining loops.
            first = random.randint(1,total_length-2)
            second = random.randint(1,total_length-1-first)
            third = random.randint(1,total_length-first-second)
            loop_lengths = [first, second, third]
            # Shuffle so no loop position is systematically the longest.
            random.shuffle(loop_lengths)
            m,n,o = loop_lengths
        residues = (
            random.sample(pool,1) + ['C']
            + random.sample(pool,m) + ['C']
            + random.sample(pool,n) + ['C']
            + random.sample(pool,o) + ['C']
            + random.sample(pool,1)
        )
        peptides.append("".join(str(residue) for residue in residues))
    return peptides

In [184]:
num_peptides = 1000000
generated_peptides = pd.DataFrame({'Sequence': getPeptide(AAs,n_seq=num_peptides)})

# Run the sequence -> SMILES -> cross-linked pipeline exactly once per
# sequence; the original repeated it four times (once per column).
# NOTE(review): assumes getCrosslinkedSeq is deterministic for a given input,
# so indexing one result equals indexing a fresh call - confirm.
smiles = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7) for seq in generated_peptides['Sequence']]
generated_peptides['smiles'] = smiles

# Split the first three entries of each result into their own columns.
for position, column in enumerate(['smiles1','smiles2','smiles3']):
    generated_peptides[column] = [crosslinked[position] for crosslinked in smiles]

generated_peptides

Unnamed: 0,Sequence,smiles,smiles1,smiles2,smiles3
0,MCTYMKPCTFCSCW,(N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@...,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@...,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@...
1,DCRCMCILWFCR,(N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...
2,NCKCYCWCF,(N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...
3,TCECFCHCE,(N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O...,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C4)=O)...,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...
4,DCECDNCGNHLCI,(N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N...,N[C@@H](CC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...
...,...,...,...,...,...
999995,SCHCHENFTMCSCP,(N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@...,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](CO)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H...,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...
999996,ICNSCTGRCHTCN,(N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C3)=...,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C3)=O...,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C4)=O...,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C3)=O...
999997,PCVNCACWNLICY,(N1[C@@H](CCC1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[...,N1[C@@H](CCC1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...,N1[C@@H](CCC1)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C...,N1[C@@H](CCC1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...
999998,NCPCMCICE,(N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N...,N[C@@H](CC(=O)N)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N...


In [181]:
# Table whose 'sequences_translated' column is used further down to filter
# the simulated peptides.
merged_cleaned = pd.read_csv(filepath_or_buffer='merged_cleaned.csv')
merged_cleaned

Unnamed: 0,sequences_translated,R2,R3,R5_0%,R5_10%,R5_1%,R6_0%,R6_10%,R6_1%
0,TCVPEHCSCYRCP,34162.0,3512.0,177.0,419.0,157.0,124.0,26.0,
1,KCFDCCYRCP,32186.0,36542.0,3268.0,30341.0,10822.0,556.0,12397.0,3494.0
2,YCSLCCYRCP,20320.0,13454.0,307.0,1106.0,307.0,2.0,320.0,1.0
3,FCLSCCYRCP,18614.0,14008.0,208.0,732.0,307.0,3.0,16.0,1.0
4,SCDCCYRCP,17209.0,9899.0,64.0,322.0,73.0,2.0,36.0,
...,...,...,...,...,...,...,...,...,...
76152,FCGGFMCDMCYRCP,,,,,,,,1.0
76153,SCYRCYCAVPCF,,,,,,,,1.0
76154,SCYRCYCPIWRCE,,,,,,,,1.0
76155,SCYRCYCSIRHCE,,,,,,,,1.0


In [186]:
# Keep only simulated sequences that do not occur in merged_cleaned.
already_listed = generated_peptides['Sequence'].isin(merged_cleaned['sequences_translated'])
generated_peptides = generated_peptides.loc[~already_listed]

In [225]:
# Display only: the sorted frame is not assigned back to generated_peptides.
generated_peptides.sort_values(by='Sequence')

Unnamed: 0,Sequence,smiles,smiles1,smiles2,smiles3
548489,ACACACDKANCD,(N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
492931,ACACACDMPCM,(N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
23611,ACACACEAMRCD,(N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
583689,ACACACECK,(N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
704011,ACACACELCY,(N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
...,...,...,...,...,...
578694,YCYWRDCNCIECD,(N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...
591361,YCYWTACDHPCGCV,(N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...
876345,YCYWTCPWCPCF,(N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...
647052,YCYWVCFHACDCI,(N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...


In [226]:
# Flatten the three SMILES columns into one 1-D array in row-major order
# (smiles1, smiles2, smiles3 of the first row, then the next row, ...).
# ravel() yields the same ordering as np.hstack over the rows, but does not
# unpack every row as a separate array and also works on an empty frame.
# NOTE: this rebinds generated_peptides from a DataFrame to an ndarray, so the
# cell cannot be re-run without re-running the cells that build the frame.
generated_peptides = generated_peptides[['smiles1','smiles2','smiles3']].to_numpy().ravel()
generated_peptides

array(['N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CS3)C(=O)N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CC(=O)O)C(=O)O',
       'N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CS3)C(=O)N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CS4)C(=O)N[C@@H](CC(=O)O)C(=O)O',
       'N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CS4)C(=O)N[C@@H](CC(=O)O)C(=O)O',
       ...,
       'N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CS3)C(=O)N[C

In [228]:
# Wrap the flat SMILES array in a single-column frame so pandas can write it.
generated_peptides = pd.DataFrame(generated_peptides)

# One row per line, without header or index.
# NOTE(review): mode='a' appends, so re-running this cell adds the rows to
# the file again - confirm that accumulation is intended.
generated_peptides.to_csv(
    'simulated_peptides.txt',
    sep=' ',
    mode='a',
    header=None,
    index=None,
)

In [229]:
# Read the headerless file and keep the first field of every row as a list.
# NOTE(review): absolute, user-specific path - this cell only runs on a
# machine with this exact directory layout.
real_peptides = [
    row[0]
    for row in pd.read_csv(
        '/home/marcase/hgraph2graph/data/cyclic_peptides/cyclic_peptide_smiles.txt',
        header=None,
    ).to_numpy()
]

In [233]:
# Concatenate output_data with the real peptides and wrap the result in a
# single-column frame for display.
# NOTE(review): output_data is not defined in any cell visible here -
# presumably the simulated SMILES array; confirm it exists on a fresh kernel.
output = pd.DataFrame(np.hstack((output_data, real_peptides)))
output

Unnamed: 0,0
0,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
1,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...
2,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
3,N[C@@H](C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]...
4,N[C@@H](C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]...
...,...
3228457,N[C@@H](CO)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H...
3228458,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...
3228459,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...
3228460,N[C@@H](CO)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H...
