In [56]:
#create SMILES strings for positive labels, negative labels, and unlabeled dataset

import pandas as pd
import numpy as np
import os
from pandas import DataFrame
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sequence_to_smiles import *
from sklearn.model_selection import train_test_split

In [4]:
linker_7 = 'CC(Cn)=O' #Kong et al. Nature biomedical engineering 2020
aa_df = pd.read_excel('AA_smiles_dict.xlsx')
aa_df = aa_df.set_index('1 letter')
aa_dict = dict(zip(aa_df.index,aa_df['full smiles']))

In [5]:
aa_dict

{'A': 'N[C@@H](C)C(=O)',
 'C': 'N[C@@H](CS)C(=O)',
 'D': 'N[C@@H](CC(=O)O)C(=O)',
 'E': 'N[C@@H](CCC(=O)O)C(=O)',
 'F': 'N[C@@H](Cc1ccccc1)C(=O)',
 'G': 'N[C@@H]([H])C(=O)',
 'H': 'N[C@@H](CC1=CN=C-N1)C(=O)',
 'I': 'N[C@@H]([C@]([H])(CC)C)C(=O)',
 'K': 'N[C@@H](CCCCN)C(=O)',
 'L': 'N[C@@H](CC(C)C)C(=O)',
 'M': 'N[C@@H](CCSC)C(=O)',
 'N': 'N[C@@H](CC(=O)N)C(=O)',
 'P': 'N1[C@@H](CCC1)C(=O)',
 'Q': 'N[C@@H](CCC(=O)N)C(=O)',
 'R': 'N[C@@H](CCCNC(=N)N)C(=O)',
 'S': 'N[C@@H](CO)C(=O)',
 'T': 'N[C@@H]([C@]([H])(O)C)C(=O)',
 'V': 'N[C@@H](C(C)C)C(=O)',
 'W': 'N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)',
 'Y': 'N[C@@H](Cc1ccc(O)cc1)C(=O)',
 'Z': 'N[C@@H](CCN=[N+]=[N-])C(=O)'}

In [6]:
merged_cleaned = pd.read_csv('merged_cleaned.csv')
merged_cleaned = merged_cleaned.loc[merged_cleaned['sequences_translated'].str.count('C')==4]
merged_cleaned['sequences_translated']

0         TCVPEHCSCYRCP
1            KCFDCCYRCP
2            YCSLCCYRCP
3            FCLSCCYRCP
4             SCDCCYRCP
              ...      
76152    FCGGFMCDMCYRCP
76153      SCYRCYCAVPCF
76154     SCYRCYCPIWRCE
76155     SCYRCYCSIRHCE
76156     SCYRCYCSVWHCE
Name: sequences_translated, Length: 76157, dtype: object

In [7]:
smiles_seq = getSmilesFromAA('FCGGFMCDMCYRCP')
#smiles = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7) for seq in merged_cleaned['sequences_translated']]

In [8]:
c1,c2,c3,c4=[m.start() for m in re.finditer('S\)',smiles_seq)]
staple=linker_7
c1,c2,c3,c4

(32, 123, 178, 244)

In [9]:
if smiles_seq.count('S)') != 4:
    print('incorrect number of Cys residues, needs to have four')

In [10]:
def getCrosslinkedSeq(smiles_seq,staple):
    #get 3 reacted products from cyclic peptide reaction from Wong et al. 2020
    if smiles_seq.count('S)') != 4:
        print('incorrect number of Cys residues, needs to have four')
        return None
    elif len(staple.split('n')) != 2:
        print('incorrect attachment form of the linker. Linker needs to like: CC(Cn)=O where \'n\' is the attachment point for the second atom')
        return None
    else:
        c1,c2,c3,c4=[m.start() for m in re.finditer('S\)',smiles_seq)]
        smiles_seq_updated_1 = smiles_seq[:c1+1]+staple.split('n')[0] + str(3) + staple.split('n')[1]+smiles_seq[c1+1:c2+1]+'3'+\
            smiles_seq[c2+1:c3+1] + staple.split('n')[0] + str(3) + staple.split('n')[1] + smiles_seq[c3+1:c4+1] + '3' +\
            smiles_seq[c4+1:]
        smiles_seq_updated_2 = smiles_seq[:c1+1]+staple.split('n')[0] + str(4) + staple.split('n')[1]+smiles_seq[c1+1:c2+1]+'3'+\
            smiles_seq[c2+1:c3+1] + staple.split('n')[0] + str(3) + staple.split('n')[1] + smiles_seq[c3+1:c4+1] + '4' +\
            smiles_seq[c4+1:]
        smiles_seq_updated_3 = smiles_seq[:c1+1]+staple.split('n')[0] + str(3) + staple.split('n')[1] + smiles_seq[c1+1:c2+1] +\
            staple.split('n')[0] + '4'+ staple.split('n')[1]+ smiles_seq[c2+1:c3+1]  + str(3)  + smiles_seq[c3+1:c4+1] + '4' +\
            smiles_seq[c4+1:]
        return smiles_seq_updated_1,smiles_seq_updated_2,smiles_seq_updated_3

In [11]:
getCrosslinkedSeq(smiles_seq,linker_7)

('N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]([H])C(=O)N[C@@H]([H])C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
 'N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H]([H])C(=O)N[C@@H]([H])C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O',
 'N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H]([H])C(=O)N[C@@H]([H])C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CS3)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O')

In [17]:
smiles = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7) for seq in merged_cleaned['sequences_translated']]
truncated_length = int(np.around(len(merged_cleaned['sequences_translated'])/10))
print('truncated length: ' + str(truncated_length))
smiles_truncated_data = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7) for seq in merged_cleaned['sequences_translated'][:truncated_length]]

truncated length: 7616


In [18]:
smiles_truncated_data[:5]

[('N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
  'N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O',
  'N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CS3)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O'),
 ('N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc

In [19]:
smiles = np.hstack(smiles)
smiles_truncated_data = np.hstack(smiles_truncated_data)

In [20]:
smiles_truncated_data

array(['N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
       'N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O',
       'N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](C(C)C)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CS3)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS4)C(=O)N1[C@@H](CCC1)C(=O)O',
       ...,
       'N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CS3)C(=O)N[C@

In [21]:
smiles = pd.DataFrame(smiles)
smiles_truncated_data = pd.DataFrame(smiles_truncated_data)

In [22]:
smiles_truncated_data

Unnamed: 0,0
0,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...
1,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C4)=O)...
2,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...
3,N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...
4,N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C...
...,...
22843,N[C@@H](CO)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H...
22844,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...
22845,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...
22846,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...


In [23]:
# #output pretraining data
# smiles.to_csv('cyclic_peptide_smiles.txt', header=None, index=None, sep=' ', mode='a')
# smiles_truncated_data.to_csv('cyclic_peptide_smiles_truncated.txt',header=None,index=None,mode='a',sep=' ')

In [24]:
#output training and test data from positive and negative labels
positive = pd.read_csv('positive_labels.csv')
negative = pd.read_csv('negative_labels.csv')

In [26]:
positive.head()

Unnamed: 0,sequences_translated,Stable
0,KCFDCCYRCP,1
1,YCSLCCYRCP,1
2,FCLSCCYRCP,1
3,SCDCCYRCP,1
4,ICTNCRSYCDRCP,1


In [27]:
negative.head()

Unnamed: 0,sequences_translated,Stable
0,TCVPEHCSCYRCP,0
1,YCFLCCYRCP,0
2,FCYRCCSWLGCD,0
3,FCFRCLPCEYCP,0
4,GCGFCDRCLPCY,0


In [48]:
all_data = positive.append(negative, ignore_index=True)
all_data

Unnamed: 0,sequences_translated,Stable
0,KCFDCCYRCP,1
1,YCSLCCYRCP,1
2,FCLSCCYRCP,1
3,SCDCCYRCP,1
4,ICTNCRSYCDRCP,1
...,...,...
1289,ECSNFFCCRCP,0
1290,TCYRCTCYFCN,0
1291,LCFRCCVFPWCD,0
1292,MCFRCDCARPHCA,0


In [49]:
all_data['smiles1'] = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7)[0] for seq in all_data['sequences_translated']]
all_data['smiles2'] = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7)[1] for seq in all_data['sequences_translated']]
all_data['smiles3'] = [getCrosslinkedSeq(getSmilesFromAA(seq),linker_7)[2] for seq in all_data['sequences_translated']]

In [50]:
all_data

Unnamed: 0,sequences_translated,Stable,smiles1,smiles2,smiles3
0,KCFDCCYRCP,1,N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...,N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C...,N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C...
1,YCSLCCYRCP,1,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C4)=O)C...,N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C...
2,FCLSCCYRCP,1,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O...,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C4)=O)C(=O...,N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O...
3,SCDCCYRCP,1,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...,N[C@@H](CO)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H...,N[C@@H](CO)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H...
4,ICTNCRSYCDRCP,1,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C3)=O...,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C4)=O...,N[C@@H]([C@]([H])(CC)C)C(=O)N[C@@H](CSCC(C3)=O...
...,...,...,...,...,...
1289,ECSNFFCCRCP,0,N[C@@H](CCC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...,N[C@@H](CCC(=O)O)C(=O)N[C@@H](CSCC(C4)=O)C(=O)...,N[C@@H](CCC(=O)O)C(=O)N[C@@H](CSCC(C3)=O)C(=O)...
1290,TCYRCTCYFCN,0,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C4)=O)...,N[C@@H]([C@]([H])(O)C)C(=O)N[C@@H](CSCC(C3)=O)...
1291,LCFRCCVFPWCD,0,N[C@@H](CC(C)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[...,N[C@@H](CC(C)C)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[...,N[C@@H](CC(C)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[...
1292,MCFRCDCARPHCA,0,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@...,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@...,N[C@@H](CCSC)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@...


In [52]:
output_x_data = np.hstack((all_data['smiles1'],all_data['smiles2'],all_data['smiles3']))

In [53]:
output_x_data

array(['N[C@@H](CCCCN)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
       'N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
       'N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](CS3)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CS3)C(=O)N1[C@@H](CCC1)C(=O)O',
       ...,
       'N[C@@H](CC(C)C)C(=O)N[C@@H](CSCC(C3)=O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CSCC(C4)=O)C(=O)N[C@@H](CS3)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CS4)C(=O)N[C@@H](CC(=O)O)C(=O)O',
       'N[C@@H](CCSC)C(=O)N[C@@H](CSCC

In [54]:
output_y_data = np.hstack((all_data['Stable'],all_data['Stable'],all_data['Stable']))

In [55]:
output_y_data

array([1, 1, 1, ..., 0, 0, 0])

In [57]:
#split train and test data
X_train, X_test, y_train, y_test = train_test_split(output_x_data, output_y_data, test_size=0.3, random_state=42)

In [59]:
y_train

array([0, 0, 1, ..., 0, 0, 1])

In [66]:
pd.DataFrame(X_train).to_csv('X_train.txt', header=None, index=None, sep=' ', mode='a')
pd.DataFrame(X_test).to_csv('X_test.txt', header=None, index=None, sep=' ', mode='a')
pd.DataFrame(y_train).to_csv('y_train.txt', header=None, index=None, sep=' ', mode='a')
pd.DataFrame(y_test).to_csv('y_test.txt', header=None, index=None, sep=' ', mode='a')