# 通过相似性识别生物不可到达的分子

In [13]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
reac = pd.read_csv('reac_prop.tsv', sep='\t', header=351) # skip 351 lines of documentation
reac

Unnamed: 0,#ID,mnx_equation,reference,classifs,is_balanced,is_transport
0,EMPTY,=,mnx:EMPTY,,B,
1,MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,mnx:MNXR01,,B,
2,MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,mnx:MNXR02,,B,T
3,MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,mnx:MNXR03,,B,T
4,MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,biggR:GALNACT5g,,,
...,...,...,...,...,...,...
74608,MNXR99995,1 MNXM1100890@MNXD1 + 1 MNXM147451@MNXD1 = 1 M...,biggR:GALNACT1g,,,
74609,MNXR99996,1 MNXM1100890@MNXD1 + 1 MNXM163780@MNXD1 = 1 M...,biggR:GALNACT1g_cho,,,
74610,MNXR99997,1 MNXM1102128@MNXD1 + 1 MNXM147449@MNXD1 = 1 M...,biggR:GALNACT2g,,,
74611,MNXR99998,1 MNXM10945@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,biggR:GALNACT3g,,,


In [6]:
list(reac["mnx_equation"])

[' = ',
 '1 MNXM01@MNXD1 = 1 MNXM1@MNXD1',
 '1 MNXM1@MNXD1 = 1 MNXM1@MNXD2',
 '1 MNXM01@MNXD1 = 1 MNXM01@MNXD2',
 '1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MNXM1102128@MNXD1 + 1 MNXM8415@MNXD1',
 '1 MNXM1100890@MNXD1 + 1 MNXM147296@MNXD1 = 1 MNXM1102128@MNXD1 + 1 MNXM7559@MNXD1',
 '1 MNXM1102128@MNXD1 + 1 MNXM148499@MNXD1 = 1 MNXM1104754@MNXD1 + 1 MNXM149175@MNXD1',
 '1 MNXM1102128@MNXD1 + 1 MNXM47647@MNXD1 = 1 MNXM1104754@MNXD1 + 1 MNXM148499@MNXD1',
 '1 MNXM1102128@MNXD1 + 1 MNXM148500@MNXD1 = 1 MNXM1104754@MNXD1 + 1 MNXM149176@MNXD1',
 '1 MNXM1102128@MNXD1 + 1 MNXM148996@MNXD1 = 1 MNXM1104754@MNXD1 + 1 MNXM148500@MNXD1',
 '1 MNXM1108175@MNXD1 + 1 MNXM1@MNXD1 + 2 MNXM27@MNXD1 + 1 MNXM40333@MNXD1 + 1 MNXM9@MNXD1 = 1 MNXM1108175@MNXD2 + 2 MNXM27@MNXD2 + 1 MNXM3@MNXD1 + 1 WATER@MNXD1',
 '1 MNXM162408@MNXD1 = 1 MNXM162408@MNXD2',
 '1 MNXM1102128@MNXD1 + 1 MNXM11618@MNXD1 = 1 MNXM1104619@MNXD1 + 1 MNXM1104754@MNXD1',
 '1 MNXM1233@MNXD1 + 1 MNXM73@MNXD2 = 1 MNXM23@MNXD2 + 1 MNXM731248@M

In [9]:
exemple = list(reac["mnx_equation"])[4]
exemple

'1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MNXM1102128@MNXD1 + 1 MNXM8415@MNXD1'

In [10]:
exemple.split(" ")

['1',
 'MNXM10958@MNXD1',
 '+',
 '1',
 'MNXM1104529@MNXD1',
 '=',
 '1',
 'MNXM1102128@MNXD1',
 '+',
 '1',
 'MNXM8415@MNXD1']

In [11]:
def preprocess_rex(rex):
    """
    rex: one reaction, such as '1 MNXM01@MNXD1 = 1 MNXM1@MNXD1'

    output: 
        - reaction formula without compartment (MNXM01 instead of MNXM01@MNXD1)
        - list of metabolites involved in the reaction
        - list of substrates
        - list of products
    """
    # compartment can only be @MNXD1 or @MNXD2
    rex_clean = rex.replace('@MNXD1', '').replace('@MNXD2', '')

    metabolites = take_MNXM(rex_clean.split(' '))
    # drop duplicates
    metabolites = list(set(metabolites))

    substrates, products = rex_clean.split('=')
    substrates = take_MNXM(substrates.split(' '))
    products = take_MNXM(products.split(' '))

    return rex_clean, metabolites, substrates, products


def take_MNXM(str_list):
    """
    Helper function for preprocessing.
    """
    return [mol for mol in str_list if mol.startswith('MNXM')]

In [14]:
reac_prep = []
for rex in reac['mnx_equation']:
    reac_prep.append(preprocess_rex(rex))
reac_prep = np.array(reac_prep, dtype=object)

reac = reac[['#ID', 'mnx_equation']]
reac['equation'] = reac_prep[:, 0]
reac['metabolites'] = reac_prep[:, 1]
reac['substrates'] = reac_prep[:, 2]
reac['products'] = reac_prep[:, 3]

reac

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,#ID,mnx_equation,equation,metabolites,substrates,products
0,EMPTY,=,=,[],[],[]
1,MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,1 MNXM01 = 1 MNXM1,"[MNXM01, MNXM1]",[MNXM01],[MNXM1]
2,MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,1 MNXM1 = 1 MNXM1,[MNXM1],[MNXM1],[MNXM1]
3,MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,1 MNXM01 = 1 MNXM01,[MNXM01],[MNXM01],[MNXM01]
4,MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10958 + 1 MNXM1104529 = 1 MNXM1102128 + ...,"[MNXM10958, MNXM1104529, MNXM1102128, MNXM8415]","[MNXM10958, MNXM1104529]","[MNXM1102128, MNXM8415]"
...,...,...,...,...,...,...
74608,MNXR99995,1 MNXM1100890@MNXD1 + 1 MNXM147451@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM147451 = 1 MNXM1102128 +...,"[MNXM147451, MNXM1102128, MNXM8416, MNXM1100890]","[MNXM1100890, MNXM147451]","[MNXM1102128, MNXM8416]"
74609,MNXR99996,1 MNXM1100890@MNXD1 + 1 MNXM163780@MNXD1 = 1 M...,1 MNXM1100890 + 1 MNXM163780 = 1 MNXM1102128 +...,"[MNXM163780, MNXM1102128, MNXM8416, MNXM1100890]","[MNXM1100890, MNXM163780]","[MNXM1102128, MNXM8416]"
74610,MNXR99997,1 MNXM1102128@MNXD1 + 1 MNXM147449@MNXD1 = 1 M...,1 MNXM1102128 + 1 MNXM147449 = 1 MNXM1104529 +...,"[MNXM147449, MNXM1104529, MNXM148157, MNXM1102...","[MNXM1102128, MNXM147449]","[MNXM1104529, MNXM148157]"
74611,MNXR99998,1 MNXM10945@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,1 MNXM10945 + 1 MNXM1104529 = 1 MNXM10946 + 1 ...,"[MNXM10945, MNXM10946, MNXM1102128, MNXM1104529]","[MNXM10945, MNXM1104529]","[MNXM10946, MNXM1102128]"


In [15]:
metabolites_list = []
for i in trange(len(reac)):
    metabolites_list += reac.loc[i, 'metabolites']
metabolites_list = list(set(metabolites_list))
bioreachable = chem[chem['#ID'].isin(metabolites_list)]
bioreachable.index = range(len(bioreachable))
bioreachable

100%|██████████| 74613/74613 [00:00<00:00, 161685.44it/s]


Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
0,MNXM01,PMF,mnx:PMF,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
1,MNXM1,H(+),mnx:PROTON,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
2,MNXM10,NADH,chebi:57945,C21H27N7O14P2,-2.0,663.11022,InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...,InChIKey=BOPGDPNILDQYTO-NNYOXOHSSA-L,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...
3,MNXM100,(2E)-geranyl diphosphate,chebi:58057,C10H17O7P2,-3.0,311.04660,InChI=1S/C10H20O7P2/c1-9(2)5-4-6-10(3)7-8-16-1...,InChIKey=GVVPGTZRZFNKDS-JXMROGBWSA-K,CC(C)=CCC/C(C)=C/COP(=O)([O-])OP(=O)([O-])[O-]
4,MNXM10002,3-deoxycapsidiol,chebi:72642,C15H24O,0.0,220.18272,InChI=1S/C15H24O/c1-10(2)12-6-7-13-14(16)8-5-1...,InChIKey=NJWPLFBOSCSZFA-QHSBEEBCSA-N,C=C(C)[C@@H]1CC=C2[C@H](O)CC[C@@H](C)[C@@]2(C)C1
...,...,...,...,...,...,...,...,...,...
42547,MNXM9994,"5-chlorobenzoate-cis-3,4-diol",metacycM:CPD-11220,C7H6ClO4,-1.0,188.99601,InChI=1S/C7H7ClO4/c8-4-1-3(7(11)12)2-5(9)6(4)1...,InChIKey=GNYUNLRRAAQENB-NTSWFWBYSA-M,O=C([O-])C1=C[C@H](O)[C@H](O)C(Cl)=C1
42548,MNXM9995,3-chlorotoluene,metacycM:CPD-10654,C7H7Cl,0.0,126.02363,"InChI=1S/C7H7Cl/c1-6-3-2-4-7(8)5-6/h2-5H,1H3",InChIKey=OSOUNOBYRMOXQQ-UHFFFAOYSA-N,Cc1cccc(Cl)c1
42549,MNXM99969,compound 0043171,envipathM:650babc9-9d68-4b73-9332-11972ca26f7b...,C35H72,0.0,492.56340,InChI=1S/C35H72/c1-3-5-7-9-11-13-15-17-19-21-2...,InChIKey=VHQQPFLOGSTQPC-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
42550,MNXM9999,3-demethylubiquinol-7,chebi:84431,C43H66O4,0.0,646.49611,InChI=1S/C43H66O4/c1-31(2)17-11-18-32(3)19-12-...,InChIKey=OHBHBMXNJCUMCR-DKCCAHEHSA-N,COc1c(O)c(O)c(C)c(C/C=C(\C)CC/C=C(\C)CC/C=C(\C...


In [17]:
bioinreachable = chem[~chem['#ID'].isin(metabolites_list)]
bioinreachable.index = range(len(bioinreachable))
bioinreachable

Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
0,BIOMASS,BIOMASS,mnx:BIOMASS,,,,,,
1,MNXM02,OH(-),mnx:HYDROXYDE,OH,-1.0,17.00734,InChI=1S/H2O/h1H2/p-1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-M,[O-][H]
2,MNXM03,H3O(+),mnx:OXONIUM,H3O,1.0,19.02322,InChI=1S/H2O/h1H2/p+1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-O,[OH3+]
3,MNXM100000,"6Z,9Z-Heneicosadien-11-one",chebi:165749,C21H38O,0.0,306.29227,InChI=1S/C21H38O/c1-3-5-7-9-11-13-15-17-19-21(...,InChIKey=ZNAIFUOOHZIIGO-OHNCOSGTSA-N,CCCCC/C=C\C/C=C\C(=O)CCCCCCCCCC
4,MNXM1000000,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692388,C69H123NO9P,-1.0,1140.89409,InChI=1S/C69H124NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=MHYQSGUKJRWFDJ-MVSNXQKDSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
...,...,...,...,...,...,...,...,...,...
1249597,MNXM999996,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692384,C73H121NO9P,-1.0,1186.87844,InChI=1S/C73H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=VNZHXXLXDVSBLA-IZNAGHOASA-M,CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)NC...
1249598,MNXM999997,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692385,C69H121NO9P,-1.0,1138.87844,InChI=1S/C69H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=FTBDAPNXHPOOLH-RUXWUTLCSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1249599,MNXM999998,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692386,C69H119NO9P,-1.0,1136.86279,InChI=1S/C69H120NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=UTTKGJJHRRHZRR-BNJOEXAFSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1249600,MNXM999999,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692387,C71H121NO9P,-1.0,1162.87844,InChI=1S/C71H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=BSOMIPWIPDKFRW-FMCFKDERSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...


In [64]:
# bioinreachable = bioinreachable.applymap(lambda x: x if str(x) != 'nan' else None)
# bioinreachable

In [39]:
bioreachable = bioreachable.dropna(subset=['SMILES'])
bioinreachable = bioinreachable.dropna(subset=['SMILES'])

In [40]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect 

In [41]:
# for i in bioreachable["SMILES"]:
#     print(i)

[H+]
[H+]
NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1
CC(C)=CCC/C(C)=C/COP(=O)([O-])OP(=O)([O-])[O-]
C=C(C)[C@@H]1CC=C2[C@H](O)CC[C@@H](C)[C@@]2(C)C1
C[Se+](C)CCC=O
CCCCC(O)C(C)=O
C[C@H]([C@H](O)CCC(C)(C)O)[C@H]1CC[C@@]2(O)C3=CC(=O)[C@@H]4C[C@H](O)[C@@H](O)C[C@]4(C)[C@H]3CC[C@]12C
O=C([O-])C[C@H](NC(=O)CC1(O)C(=O)Nc2ccccc21)C(=O)[O-]
C=C1CC1C(O)(CC(=O)O)C(=O)O
CC1(O)C(=O)Nc2ccccc21
O=C(O)CC(O)c1ccccc1
CCCCCCCCCC(=O)OC
CCCCCC/C=C\CCC[C@@H](O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CCCCCCOC(=O)CCC
CCCCCC/C=C\CCCCC[C@@H](O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
C[C@H](O)C(=O)C1=Nc2c(nc(N)[nH]c2=O)NC1
CCCCCCCCCCCCCCOC(C)=O
CCCCCCC(=O)CCC
CCCCCCCC(O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-]
CCCC(=O)OCC(C)CC
[*]C(=O)N[C@]1([*])C(=O)N2C(C(=O)O)=C(CO)CSC21
CCCCCOC(

CC[C@H](C)/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CC(C)/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CC(C)CCC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CCC[C@H]1C[NH2+][C@H](C(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O)C1
[*]OP(=O)([O-])OC[C@H]1O[C@@H](n2cc(O)c(N)nc2=O)[C@H](O)[C@@H]1OP(=O)([O-])O[*]
CC(C)C[C@@H](O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CC(C)CC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CC(C)CCCC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
CC(C)C/C=C/C(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OC[C@H](N[*])C([*])=O
[*][C@@H]1O[C@H](COP(=O)([O-])[O-])[C@@H](OP(=O)([O-])O[*])[C@H]1[*]
[*][C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])[C@@H](OP([*])(=O)[O-])[C@H]1O
[*][C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])[C@@H](OP([*])(=O)[O-])[C@H]1O
CC(

C[C@@H](CCCC[C@@H](O)C[C@H](O)CCCCCCCCCCCCCCC[*])C(=O)C[*]
OC1Cc2cccc3cccc1c23
NCC(CCC(=O)O)C(=O)O
O=P(O)(O)OC[C@H]1O[C@@](O)(CO)[C@@H](O)[C@H](O)[C@@H]1O
[*]N[C@@H](Cc1c[nH]cn1)C([*])=O
[*]N[C@@H](CCCC[NH3+])C([*])=O
[*][C@@H]1O[C@H](COP(=O)([O-])O[*])[C@@H](OP(=O)([O-])OP(=O)([O-])OC[C@H]2O[C@@H](n3cnc4c(N)ncnc43)[C@H](O)[C@@H]2O)[C@H]1O
[*]C(=O)[C@@H]([NH3+])CS(=O)O
CCCCCCCC(=O)SC[C@H](N[*])C([*])=O
CS(=O)[*]CC/C(=N\OS(=O)(=O)[O-])S[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@H]1O
C[C@]12CC[C@H]3[C@@H](CCC4=C[C@H](O)CC[C@@]43C)[C@@H]1CC[C@@H]2O
CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H](O[C@H]3O[C@H]([C@H](C)N)[C@@H](O)[C@H](O)[C@H]3N)[C@@H](N)C[C@H]2N)OC[C@]1(C)O
O=C/C=C/c1cc(C(=O)[O-])oc(=O)c1
CC(=O)CCc1ccccc1
COC(=O)NC1=C2/C(=C\CSSSC)[C@](O)(C#C/C=C\C#C[C@@H]2O[C@@H]2O[C@H](C)[C@@H](NO[C@H]3C[C@H](O)[C@H](SC(=O)c4c(C)c(I)c(O)c(OC)c4OC)[C@@H](C)O3)[C@H](O)[C@H]2O)CC1=O
[*]OP(=O)([O-])OC[C@H]1O[C@@H](n2ccc(N)nc2=O)[C@H](O)[C@@H]1OP(=O)([O-])OC[C@H]1O[C@@H](n2ccc(N)nc2=O)[C@H](O)[C@@H]

C[N+](C)(C)C[C@H](O)CC(N)=O
NC(=O)[C@H](O)c1ccccc1
O[C@H](c1ccccc1)[C@H](O)c1ccccc1
C[C@]12CC[C@H]3[C@@H](CC[C@H]4C[C@H](OS(=O)(=O)[O-])CC[C@@]43C)[C@@H]1CCC2=O
CC1=CC[C@@H](C2=CCCC(C)(C)C2)CC1
CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@@H](NC(=O)[C@@H](N)CCCN=C(N)N)C(C)C)C(=O)N[C@@H](Cc1cnc[nH]1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1)C(=O)O
Nc1ccc(C(=O)[O-])cc1
O=C([C@@H]1CSC(c2cccc(O)c2O)=N1)N(O)CCc1c[nH]cn1
C/C(=C/N)C(=O)OO
C=C/C(C)=C/C=C/C(C)=C/C=C1/C(C)=CCCC1(C)C
NC(=O)N/C=C\C(=O)[O-]
Clc1ccc(Cl)c(Cl)c1
Oc1ccc2cc3ccccc3cc2c1O
COc1cc(/C=C/C(=O)O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2OC(=O)/C=C/c2cc(OC)c(O)c(OC)c2)cc(OC)c1O
COc1c(-c2cc(O)c(O)c(O)c2)oc2cc(O)cc(O)c2c1=O
CCCCCCCCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCCCCCCCC
Oc1ccc2sc3ccccc3c2c1O
Clc1ccc(Cl)cc1
[NH3+]CCCC=O
NC(=[NH2+])NCCCCNC(N)=[NH2+]
O=C1c2ccc(O)cc2C(=O)c2ccc(O)cc21
O=C1CCCCCNC(=O)CCCCCN1
Cc1ncc(C[n+]2cccc(CCO)c2C)c(N)n1.[Br-]
CN[C@@H]1[C@@H](O)[C@@H](O[C@@H]2[C@@H](O)[C@H](O[C@H]3O[C@H](CN)[C@@H](O)[C@H](O)[C@

CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-])[C@@H](O)C(=O)NCCC(=O)NCCSC([*])=O
CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-])[C@@H](O)C(=O)NCCC(=O)NCCSC([*])=O
C[C@]12CC[C@H]3[C@@H](CC[C@H]4C[C@H](O)CC[C@@]43C)[C@@H]1CC[C@@H]2O
[*][C@@H]1O[C@H](CO)[C@@H](OP(=O)([O-])O[*])[C@H]1O
[*]OP(=O)([O-])O[C@@H]1[C@@H](CO)O[C@@H]([*])[C@@H]1O
[*]OP(=O)([O-])O[C@@H]1[C@@H](CO)O[C@@H]([*])[C@@H]1O
[NH3+]CCCCCC(=O)NCCCCCC(=O)[O-]
[*]CCCCCC(=O)NCCCCCC(=O)NCCCCCC(=O)NCCCCCC(=O)[O-]
C[C@@]12CC3=C(CCC(=O)[O-])[C@](C)(CC(=O)[O-])C(=N3)/C=C3\[NH2+][C@@](C)([C@@H]4N=C(/C=C(\[NH2+]1)C(CCC(=O)[O-])=C2CC(=O)[O-])[C@](C)(CCC(=O)[O-])[C@H]4CC(=O)[O-])[C@@](C)(CC(=O)[O-])[C@@H]3CCC(=O)[O-]
C[C@@]12CC3=C(CCC(=O)[O-])[C@](C)(CC(=O)[O-])C(=N3)/C=C3\N[C@@](C)([C@@H]4N=C(CC(=N1)C(CCC(=O)[O-])=C2CC(=O)[O-])[C@](C)(CCC(=O)[O-])[C@H]4CC(=O)[O-])[C@@](C)(CC(=O)[O-])[C@@H]3CCC(=O)[O-]
COc1cc(O)c(C)c(C/C=C(\C)CCC=C(C)C)c1O

[*][C@@H]1[C@@H](CO)O[C@@H](O[C@@H]2[C@@H](COP(=O)([O-])OCC[NH3+])O[C@@H](O[C@@H]3[C@@H](CO)O[C@@H](O[C@@H]4[C@@H](COP(=O)([O-])OCC[NH3+])O[C@@H]([*])[C@H](O)[C@H]4O)[C@H](O)[C@H]3O)[C@H](O)[C@H]2O)[C@H](O)[C@H]1O
[*]OP(=O)([O-])OC[C@H]1O[C@@H]([*])C[C@@H]1OP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O
[*][NH2+][C@@H](CCCCNC(=O)CC[C@H]([NH2+][*])C(=O)[O-])C([*])=O
O=Cc1c[nH]c2ccccc12
C=CCC1/C=C(\C)CC(C)CC(OC)C(O)C(OC)C2OC(O)(C(=O)C2C)C(=O)N2CCCCC2C(=O)OC(C(C)=CC2CCC(O)C(OC)C2)C(C)C(O)CC1=O
C=CCC1C=C(C)CC(C)CC(OC)C(O)C2CC(C)C(O)(O2)C(=O)C(=O)N2CCCCC2C(=O)OC(C(C)=CC2CCC(O)C(O)C2)C(C)C(O)CC1=O
C=CCC1C=C(C)CC(C)CC(OC)C(O)C2CC(C)C(O)(O2)C(=O)C(=O)N2CCCCC2C(=O)OC(/C(C)=C/C2CCC(O)C(OC)C2)C(C)C(O)CC1=O
C=CCC1/C=C(\C)CC(C)CC(O)C(O)C2CC(C)C(O)(O2)C(=O)C(=O)N2CCCCC2C(=O)OC(C(C)=CC2CCC(O)C(OC)C2)C(C)C(O)CC1=O
C=CCC1/C=C(\C)CC(C)CC(O)C2OC(O)(C(=O)C(=O)N3CCCCC3C(=O)OC(/C(C)=C/C3CCC(O)C(O)C3)C(C)C(O)CC1=O)C(C)CC2OC
C=CCC1/C=C(\C)CC(C)CC(O)C2OC(O)(C(=O)C(=O)N3CCCCC3C(=O)OC(/

In [35]:
# for i in bioinreachable["SMILES"]:
#     print(i)

[O-][H]
[OH3+]
CCCCC/C=C\C/C=C\C(=O)CCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCC/C=C\CCCCCC)OC(=O)CCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\CCCCCCCC)OC(=O)CCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C

CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\C/C=C\C/C=C\CC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC)COP(=O)([O-])OCCNC(=O)CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\C/C=C\CCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC)COP(=O)([O-])OCCNC(=O)CCCC/C=C\C/C=C\C/C=C\CCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC)COP(=O)([O-])OCCNC(=O)CCC/C=C\C/C=C\C/C=C\C/C=C\CCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCCCCCCCC/C=C\C/C=C\C/C=C\C/C=C\C/

CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CCCCC/C=C\C/C=C\CCCCCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
NCCC[C@H](N)CC(=O)NCCC[C@H](N)CC(=O)N[C@@H]1[C@H](O)[C@@H](O)[C@@H](COC(N)=O)O[C@H]1/N=C1/N[C@@H]2C(=O)NC[C@@H](O)[C@H]2N1
CCCCC/C=C\C/C=C\C/C=C\CCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CCCCCC/C=C\CCCCCCCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CCCCCCCC/C=C\CCCCCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCCCCCCCC)OC(=O)CCC
CCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCC)OC(=O)CCCCC
CCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCC)OC(=O)CCCCC
CCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCC)OC(=O)CCCCC
CCCCCCCCCCCCCC/C=C\OC[C@H](COP(=O)([O-])OCCNC(=O)CCCCCCCCC

CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCC/C=C\C/C=C\C/C=C\CCCCC
O=C(/C=C/CCCCCC/C=C/c1ccc2c(c1)OCO2)N1CCCC1
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCC/C=C\C/C=C\C/C=C\C/C=C\CCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCC/C=C\CCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\CCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCCCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCC
CCC

CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCCCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\C/C=C\C/C=C\CC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC/C=C\C/C=C\CCCCC
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCC(=O)O[C@H](CO/C=C\CCCCCCCCCCCCCCCCCCCCCCCCCC

CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCC
CC(=O)OC[C@@]12[C@H](OC(C)=O)[C@@H](OC(C)=O)[C@@H]3[C@@H](OC(C)=O)[C@@]14O[C@@]3(C)COC(=O)c1cccnc1[C@@H](C)[C@H](C)C(=O)O[C@@H]([C@H](OC(C)=O)[C@@H]2OC(=O)c1ccccc1)[C@]4(C)O
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCCCC
CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O[C@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)COP(=O)([O-])OCCNC(=O)CCCCCCCCCCCCCCCCC
CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)NCCOP(=O)([O-])OC[C@@H](CO/C=C\CCCCCCCC/C=C\CCCCCCCC)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




CCCCCC/C=C\CCCCCCCC(=O)N[C@@H](CO[C@@H]1O[C@H](CO)[C@@H](O[C@@H]2O[C@H](CO)[C@H](O[C@@H]3O[C@H](CO)[C@H](O)[C@H](O[C@@H]4O[C@H](CO)[C@H](O)[C@H](O[C@@H]5O[C@H](CO)[C@@H](O[C@@H]6O[C@H](CO)[C@H](O)[C@H](O[C@]7(C(=O)[O-])C[C@H](O)[C@@H](NC(=O)CO)[C@H]([C@H](O)[C@H](O)CO)O7)[C@H]6O)[C@H](O)[C@H]5NC(C)=O)[C@H]4O)[C@H]3NC(C)=O)[C@H](O[C@]3(C(=O)[O-])C[C@H](O)[C@@H](NC(=O)CO)[C@H]([C@H](O)[C@H](O)CO)O3)[C@H]2O)[C@H](O)[C@H]1O)[C@H](O)CCCCCCCCCCCCCCC
CCCCCC/C=C\CCCCCCCCCC(=O)N[C@@H](CO[C@@H]1O[C@H](CO)[C@@H](O[C@@H]2O[C@H](CO)[C@H](O[C@@H]3O[C@H](CO)[C@H](O)[C@H](O[C@@H]4O[C@H](CO)[C@H](O)[C@H](O[C@@H]5O[C@H](CO)[C@@H](O[C@@H]6O[C@H](CO)[C@H](O)[C@H](O[C@]7(C(=O)[O-])C[C@H](O)[C@@H](NC(=O)CO)[C@H]([C@H](O)[C@H](O)CO)O7)[C@H]6O)[C@H](O)[C@H]5NC(C)=O)[C@H]4O)[C@H]3NC(C)=O)[C@H](O[C@]3(C(=O)[O-])C[C@H](O)[C@@H](NC(=O)CO)[C@H]([C@H](O)[C@H](O)CO)O3)[C@H]2O)[C@H](O)[C@H]1O)[C@H](O)CCCCCCCCCCCCCCC
CCCCCC/C=C/CCCCCCCCCC(=O)N[C@@H](CO[C@@H]1O[C@H](CO)[C@@H](O[C@@H]2O[C@H](CO)[C@H](O[C@@H]3O[C@H](CO)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CCCCCCCCCCCCC/C=C/[C@@H](O)[C@H](CO[C@@H]1OC(CO)[C@@H](O[C@@H]2OC(CO)[C@H](O[C@@H]3OC(CO)[C@H](O)[C@H](O[C@@H]4OC(CO)[C@H](O)[C@H](O[C@@H]5OC(CO)[C@H](O)[C@H](O)C5O)C4NC(C)=O)C3O)[C@H](O)C2O)[C@H](O)C1O)NC(=O)CCCCCCCCCCCCCCCCC
CCCCCCCCCCCCC/C=C/[C@@H](O)[C@H](CO[C@@H]1O[C@H](CO)[C@@H](O[C@@H]2O[C@H](CO)[C@H](O[C@H]3O[C@H](CO)[C@H](O)[C@H](O[C@@H]4O[C@H](CO)[C@H](O)[C@H](O[C@@H]5O[C@H](CO)[C@H](O)[C@H](O)[C@H]5O)[C@H]4NC(C)=O)[C@H]3O)[C@H](O)[C@H]2O)[C@H](O)[C@H]1O)NC(=O)CCCCCCCCCCCCCCCCC
CCCCCCCCCCCCC/C=C/[C@@H](O)[C@H](CO[C@@H]1OC(CO)[C@@H](O[C@@H]2OC(CO)[C@H](O[C@@H]3OC(CO)[C@H](O)[C@H](O[C@@H]4OC(CO)[C@H](O)[C@H](O[C@@H]5OC(CO)[C@H](O)[C@H](O[C@]6(C(=O)O)CC(O)[C@@H](NC(=O)CO)C([C@H](O)[C@@H](CO)O[C@]7(C(=O)O)CC(O)[C@@H](NC(=O)CO)C([C@H](O)[C@H](O)CO)O7)O6)C5O)C4NC(C)=O)C3O)[C@H](O)C2O)[C@H](O)C1O)NC(=O)CCCCCCCCCCCCCCCCCCCCC
CCCCCCCCCCCCC/C=C/[C@@H](O)[C@H](CO[C@@H]1O[C@H](CO)[C@@H](O[C@@H]2O[C@H](CO)[C@H](O[C@H]3O[C@H](CO)[C@H](O)[C@H](O[C@@H]4O[C@H](CO)[C@H](O)[C@H](O[C@@H]5O[C@H](

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




CC(=O)N[C@H]1[C@H](O[C@@H]2[C@@H](CO[C@H]3O[C@H](CO[C@@H]4O[C@H](CO)[C@@H](O[C@@H]5O[C@H](CO)[C@H](O)[C@H](O)[C@H]5O)[C@H](O)[C@H]4NC(C)=O)[C@@H](O)[C@H](O)[C@@H]3O[C@@H]3O[C@H](CO)[C@@H](O[C@@H]4O[C@H](CO)[C@H](O)[C@H](O)[C@H]4O)[C@H](O)[C@H]3NC(C)=O)O[C@@H](O[C@@H]3[C@@H](CO)O[C@@H](O[C@@H]4[C@@H](CO[C@@H]5O[C@@H](C)[C@@H](O)[C@@H](O)[C@@H]5O)O[C@@H](O)[C@H](NC(C)=O)[C@H]4O)[C@H](NC(C)=O)[C@H]3O)[C@@H](O)[C@H]2O[C@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@@H]2O[C@@H]2O[C@H](CO)[C@@H](O[C@@H]3O[C@H](CO)[C@H](O)[C@H](O)[C@H]3O)[C@H](O)[C@H]2NC(C)=O)O[C@H](CO)[C@@H](O)[C@@H]1O
CC(=O)N[C@H]1[C@H](O[C@H]2[C@@H](O)[C@@H](CO)O[C@H](OC[C@H]3O[C@@H](O[C@@H]4[C@@H](CO)O[C@@H](O)[C@H](O)[C@H]4O)[C@H](O)[C@@H](O)[C@H]3O)[C@@H]2O)O[C@H](CO)[C@H](O)[C@@H]1O
CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)OC[C@@H](COC(=O)CCCCCCC/C=C\CCCCC)OC(=O)CCCCCCC/C=C\C/C=C\CCCCC
CC(=O)N[C@@H](CO)[C@@H](O[C@@H]1O[C@H](CO)[C@H](O)[C@H](O[C@H]2O[C@H](CO)[C@H](O)[C@H](O)[C@H]2NC(C)=O)[C@H]1O[C@@H]1O[C@@H](C)[C@@H](O)[C@@H](O)[C@@

KeyboardInterrupt: 

In [44]:
#make fp for bioreachable
bioreachable_mols = [Chem.MolFromSmiles(s) for s in bioreachable["SMILES"] if type(s) is not None]
fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius=4, nBits=2048) for m in bioreachable_mols]



In [55]:
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit import DataStructs

def highest_tanimoto_precalc_fps(mol, fps):

    if fps is None or len(fps) == 0:
        return 0

    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol, 4, 2048)
    sims = np.array(DataStructs.BulkTanimotoSimilarity(fp1, fps))

    return sims.max() 

In [59]:
#find inbioreachable molecules
inreachable_index = []
for idx, s in enumerate(bioinreachable["SMILES"]):
    score = highest_tanimoto_precalc_fps(mol=Chem.MolFromSmiles(s),fps=fps)
#     print(score)
    if score<0.7:
        inreachable_index.append(idx)
#     break







In [63]:
bioinreachable.iloc[inreachable_index]

Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
1,MNXM02,OH(-),mnx:HYDROXYDE,OH,-1.0,17.00734,InChI=1S/H2O/h1H2/p-1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-M,[O-][H]
2,MNXM03,H3O(+),mnx:OXONIUM,H3O,1.0,19.02322,InChI=1S/H2O/h1H2/p+1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-O,[OH3+]
3,MNXM100000,"6Z,9Z-Heneicosadien-11-one",chebi:165749,C21H38O,0.0,306.29227,InChI=1S/C21H38O/c1-3-5-7-9-11-13-15-17-19-21(...,InChIKey=ZNAIFUOOHZIIGO-OHNCOSGTSA-N,CCCCC/C=C\C/C=C\C(=O)CCCCCCCCCC
14,MNXM100001,"4,10-Dimethyltriacontane",lipidmapsM:LMFA11000413,C32H66,0.0,450.51645,InChI=1S/C32H66/c1-5-7-8-9-10-11-12-13-14-15-1...,InChIKey=NIPDQYCBQMWKIJ-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCCCC(C)CCCCCC(C)CCC
25,MNXM100002,"9,13-Dimethylheptacosane",lipidmapsM:LMFA11000539,C29H60,0.0,408.46950,InChI=1S/C29H60/c1-5-7-9-11-13-14-15-16-17-18-...,InChIKey=MTSZDHFVGBEJLQ-UHFFFAOYSA-N,CCCCCCCCCCCCCCC(C)CCCC(C)CCCCCCCC
...,...,...,...,...,...,...,...,...,...
1249546,MNXM99995,"11Z,19-Eicosadienyl acetate",lipidmapsM:LMFA07010399,C22H40O2,0.0,336.30283,InChI=1S/C22H40O2/c1-3-4-5-6-7-8-9-10-11-12-13...,InChIKey=KSEXUZDYADHVFH-KHPPLWFESA-N,C=CCCCCCC/C=C\CCCCCCCCCCOC(C)=O
1249557,MNXM99996,"(3R,6S)-3-Hydroxy-1,7-dioxaspiro[5.5]undecane",lipidmapsM:LMPK09000005,C9H16O3,0.0,172.10994,InChI=1S/C9H16O3/c10-8-3-5-9(12-7-8)4-1-2-6-11...,InChIKey=PSXHJXYOZUWYQY-BDAKNGLRSA-N,O[C@@H]1CC[C@]2(CCCCO2)OC1
1249579,MNXM99998,"12-methyl-tridecanyl 2,6,10,14-tetramethyl-pen...",lipidmapsM:LMFA07010086,C33H66O2,0.0,494.50628,InChI=1S/C33H66O2/c1-28(2)20-15-13-11-9-8-10-1...,InChIKey=BGLXHVGOGVGFEL-UHFFFAOYSA-N,CC(C)CCCCCCCCCCCOC(=O)C(C)CCCC(C)CCCC(C)CCCC(C)C
1249590,MNXM99999,"(E,E)-3,7,11-Trimethyl-2,6,10-dodecatrienyl pr...",lipidmapsM:LMFA07010541,C18H30O2,0.0,278.22458,InChI=1S/C18H30O2/c1-6-18(19)20-14-13-17(5)12-...,InChIKey=XFACLYNWBJYMCK-IUBLYSDUSA-N,CCC(=O)OC/C=C(\C)CC/C=C(\C)CCC=C(C)C


In [66]:
bioinreachable2 = bioinreachable.iloc[inreachable_index]

In [70]:
bioreachable.to_csv("bioreachable_molesules_fromql.csv")

In [69]:
bioinreachable2.to_csv("inbioreachable_molesules_fromql.csv")

In [3]:
chem = pd.read_csv('chem_prop.tsv', sep='\t', header=351) # skip 351 lines of documentation
chem

Unnamed: 0,#ID,name,reference,formula,charge,mass,InChI,InChIKey,SMILES
0,BIOMASS,BIOMASS,mnx:BIOMASS,,,,,,
1,MNXM01,PMF,mnx:PMF,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
2,MNXM02,OH(-),mnx:HYDROXYDE,OH,-1.0,17.00734,InChI=1S/H2O/h1H2/p-1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-M,[O-][H]
3,MNXM03,H3O(+),mnx:OXONIUM,H3O,1.0,19.02322,InChI=1S/H2O/h1H2/p+1,InChIKey=XLYOFNOQVPJJNP-UHFFFAOYSA-O,[OH3+]
4,MNXM1,H(+),mnx:PROTON,H,1.0,1.00794,InChI=1S/p+1,InChIKey=GPRLSGONYQIRFK-UHFFFAOYSA-N,[H+]
...,...,...,...,...,...,...,...,...,...
1292149,MNXM999996,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692384,C73H121NO9P,-1.0,1186.87844,InChI=1S/C73H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=VNZHXXLXDVSBLA-IZNAGHOASA-M,CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)NC...
1292150,MNXM999997,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692385,C69H121NO9P,-1.0,1138.87844,InChI=1S/C69H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=FTBDAPNXHPOOLH-RUXWUTLCSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1292151,MNXM999998,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692386,C69H119NO9P,-1.0,1136.86279,InChI=1S/C69H120NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=UTTKGJJHRRHZRR-BNJOEXAFSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...
1292152,MNXM999999,"1-(14Z,17Z,20Z,23Z,26Z-dotriacontapentaenoyl)-...",slm:000692387,C71H121NO9P,-1.0,1162.87844,InChI=1S/C71H122NO9P/c1-4-7-10-13-16-19-22-24-...,InChIKey=BSOMIPWIPDKFRW-FMCFKDERSA-M,CCCCC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCCCCC...


In [4]:
list(chem["#ID"])

['BIOMASS',
 'MNXM01',
 'MNXM02',
 'MNXM03',
 'MNXM1',
 'MNXM10',
 'MNXM100',
 'MNXM100000',
 'MNXM1000000',
 'MNXM1000001',
 'MNXM1000002',
 'MNXM1000003',
 'MNXM1000004',
 'MNXM1000005',
 'MNXM1000006',
 'MNXM1000007',
 'MNXM1000008',
 'MNXM1000009',
 'MNXM100001',
 'MNXM1000010',
 'MNXM1000011',
 'MNXM1000012',
 'MNXM1000013',
 'MNXM1000014',
 'MNXM1000015',
 'MNXM1000016',
 'MNXM1000017',
 'MNXM1000018',
 'MNXM1000019',
 'MNXM100002',
 'MNXM1000020',
 'MNXM1000021',
 'MNXM1000022',
 'MNXM1000023',
 'MNXM1000024',
 'MNXM1000025',
 'MNXM1000026',
 'MNXM1000027',
 'MNXM1000028',
 'MNXM1000029',
 'MNXM100003',
 'MNXM1000030',
 'MNXM1000031',
 'MNXM1000032',
 'MNXM1000033',
 'MNXM1000034',
 'MNXM1000035',
 'MNXM1000036',
 'MNXM1000037',
 'MNXM1000038',
 'MNXM1000039',
 'MNXM100004',
 'MNXM1000040',
 'MNXM1000041',
 'MNXM1000042',
 'MNXM1000043',
 'MNXM1000044',
 'MNXM1000045',
 'MNXM1000046',
 'MNXM1000047',
 'MNXM1000048',
 'MNXM1000049',
 'MNXM100005',
 'MNXM1000050',
 'MNXM1000051',
