In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

In [2]:
def ChangeStartIdx(atomList):
    """Convert 0-based atom indices to 1-based atom numbers.

    Parameters
    ----------
    atomList : iterable of iterables of int
        One inner iterable per substructure match, holding 0-based atom
        indices (e.g. the matches returned by ``GetSubstructMatches``).

    Returns
    -------
    list of list of int
        One list per match, in the input order, with every index increased
        by 1. An empty match yields an empty list.
    """
    # Each match is shifted exactly once; an empty match maps to [] instead of
    # reusing (or failing on) the result of a previous iteration.
    return [[idx + 1 for idx in match] for match in atomList]

In [3]:
def FindEsterFG(smiles):
    """Find carboxylic acid ester (CAE) and carboxylic anhydride sites in a molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to search.

    Returns
    -------
    tuple
        ``(anh, an_site, cae, cae_site)`` where ``anh`` and ``cae`` are lists
        holding one list of 1-based atom numbers per anhydride / CAE match
        (always a list of lists, empty when nothing matches), and ``an_site``
        and ``cae_site`` are the corresponding match counts.
        Example for a molecule with 2 anhydride sites and 1 CAE site:
        ``([[anh_site1_atoms], [anh_site2_atoms]], 2, [[cae_site_atoms]], 1)``.

    Raises
    ------
    ValueError
        If ``smiles`` cannot be parsed into a molecule.

    Notes
    -----
    The anhydride pattern carries no ring constraint, so cyclic and open-chain
    anhydrides are both reported as anhydride sites.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; stop here with a clear
        # message rather than failing obscurely inside the substructure search.
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))

    # Functional-group SMARTS strings, converted to RDKit query molecules.
    CAE_smarts = '[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])]'
    CAE = Chem.MolFromSmarts(CAE_smarts)

    CarbAnhydride_smarts = '[CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1])'
    CarbAnhydride = Chem.MolFromSmarts(CarbAnhydride_smarts)

    # GetSubstructMatches returns a tuple of atom-index tuples; convert to a list.
    CAE_atom = list(mol.GetSubstructMatches(CAE, uniquify=True))
    anhydride_atom = list(mol.GetSubstructMatches(CarbAnhydride, uniquify=True))

    # RDKit indices start at 0; add 1 so the atom numbers match HYDROWIN
    # (per the original author's note).
    anh = ChangeStartIdx(anhydride_atom)
    cae = ChangeStartIdx(CAE_atom)
    an_site = len(anh)
    cae_site = len(cae)

    return (anh, an_site, cae, cae_site)

In [5]:
# Run the CAE examples from the hydrolysis library document through FindEsterFG()
# to see whether each is identified as a CAE and not as an anhydride.
test_c = [
    'CCOC(=O)CC',
    'CCCCC(CC)COC(=O)C1=C(C=CC=C1)C(=O)OCC(CC)CCCC',
    'CC1(C)C(C(=O)OC(C#N)C2=CC=CC(OC3=CC=CC=C3)=C2)C1(C)C',
    'CCOC(=O)C(O)(C1=CC=C(Cl)C=C1)C1=CC=C(Cl)C=C1',
    'COC(=O)C1=C(Cl)C=CC(Cl)=C1',
]

for m in test_c:
    # Unpacks to: anhydride atoms, anhydride count, CAE atoms, CAE count.
    a, s, c, cs = FindEsterFG(m)
    print('anhydride site count:', s, '\n', a, '\n', 'CAE site count: ', cs, '\n', c)

anhydride site count: 0 
 [] 
 CAE site count:  1 
 [[4, 5, 3, 2]]
anhydride site count: 0 
 [] 
 CAE site count:  2 
 [[10, 11, 9, 8], [18, 19, 20, 21]]
anhydride site count: 0 
 [] 
 CAE site count:  1 
 [[5, 6, 7, 8]]
anhydride site count: 0 
 [] 
 CAE site count:  1 
 [[4, 5, 3, 2]]
anhydride site count: 0 
 [] 
 CAE site count:  1 
 [[3, 4, 2, 1]]


In [7]:
# Test whether FindEsterFG() distinguishes between cyclic and chain anhydride
# groups; the examples are taken from the hydrolysis library document.
test_a = [
    'O=C1CCCC(=O)O1',
    'CC(=O)OC(C)=O',
]

for m in test_a:
    # Unpacks to: anhydride atoms, anhydride count, CAE atoms, CAE count.
    a, s, c, cs = FindEsterFG(m)
    print('anhydride site count:', s, '\n', a, '\n', 'CAE site count: ', cs, '\n', c)

anhydride site count: 1 
 [[2, 1, 8, 6, 7]] 
 CAE site count:  0 
 []
anhydride site count: 1 
 [[2, 3, 4, 5, 7]] 
 CAE site count:  0 
 []


The current anhydride SMARTS string does not distinguish between cyclic and chain (acyclic) anhydride groups; this still needs to be resolved.

In [8]:
# Run FindEsterFG() on a ring ester (lactone) SMILES and report both site counts.
a, s, c, cs = FindEsterFG('OCC1OC(=O)C(O)C(O)C1O')
print('anhydride site count:', s, '\n', a, '\n', 'CAE site count: ', cs, '\n', c)

anhydride site count: 0 
 [] 
 CAE site count:  0 
 []


In [12]:
# Try a lactone pattern on a single ring-ester example.
smi = 'OCC1OC(=O)C(O)C(O)C1O'
mol = Chem.MolFromSmiles(smi)

# Ring carbonyl carbon ([#6X3R]) attached to a carbon, a =O, and a two-connected
# oxygen whose other carbon is not itself double-bonded to O, N or S.
lactone_smarts = '[#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]'
lactone = Chem.MolFromSmarts(lactone_smarts)

# Collect the matches and shift the 0-based indices to 1-based atom numbers.
lactone_atom = list(mol.GetSubstructMatches(lactone, uniquify=True))
lac = ChangeStartIdx(lactone_atom)
lac_site = len(lac)

print('Lactone site count:', lac_site, '\n', lac)

Lactone site count: 1 
 [[7, 5, 6, 4, 3]]


In [14]:
# Confirm that the lactone pattern does not match the anhydride examples.

# The pattern is identical for every molecule, so compile it once before the loop.
lactone_smarts = '[#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]'
lactone = Chem.MolFromSmarts(lactone_smarts)

for m in test_a:
    smi = m
    mol = Chem.MolFromSmiles(smi)
    lactone_atom = list(mol.GetSubstructMatches(lactone, uniquify=True))
    # Shift the 0-based RDKit indices to 1-based atom numbers.
    lac = ChangeStartIdx(lactone_atom)
    lac_site = len(lac)

    print('Lactone site count:', lac_site, '\n', lac)

Lactone site count: 0 
 []
Lactone site count: 0 
 []


In [15]:
# Confirm that the lactone pattern does not match the CAE examples.

# The pattern is identical for every molecule, so compile it once before the loop.
lactone_smarts = '[#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]'
lactone = Chem.MolFromSmarts(lactone_smarts)

for m in test_c:
    smi = m
    mol = Chem.MolFromSmiles(smi)
    lactone_atom = list(mol.GetSubstructMatches(lactone, uniquify=True))
    # Shift the 0-based RDKit indices to 1-based atom numbers.
    lac = ChangeStartIdx(lactone_atom)
    lac_site = len(lac)

    print('Lactone site count:', lac_site, '\n', lac)

Lactone site count: 0 
 []
Lactone site count: 0 
 []
Lactone site count: 0 
 []
Lactone site count: 0 
 []
Lactone site count: 0 
 []


In [24]:
# Confirm that the lactam pattern distinguishes lactams (last two examples)
# from amides (first two examples).
test_la = [
    'CCNC(=O)[C@@H](C)OC(=O)NC1=CC=CC=C1',
    'CCC(=O)NC1=CC=C(Cl)C(Cl)=C1',
    'O[N+](=O)C1=CC(=CC=C1)N1CCCC1=O',
    '[H][C@]12C[C@@]3([H])C4=CCO[C@@]5([H])CC(=O)N6C7=C(C=CC=C7)[C@@]1(CCN2C4)[C@]6([H])[C@@]35[H]',
]

# The pattern is identical for every molecule, so compile it once before the loop.
lactam_smarts = '[#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]'
lactam = Chem.MolFromSmarts(lactam_smarts)

for m in test_la:
    smi = m
    mol = Chem.MolFromSmiles(smi)
    lactam_atom = list(mol.GetSubstructMatches(lactam, uniquify=True))
    # Shift the 0-based RDKit indices to 1-based atom numbers.
    lam = ChangeStartIdx(lactam_atom)
    lam_site = len(lam)
    print('Lactam site count:', lam_site, '\n', lam)

Lactam site count: 0 
 []
Lactam site count: 0 
 []
Lactam site count: 1 
 [[13, 14, 15, 10]]
Lactam site count: 1 
 [[9, 10, 11, 12]]


In [27]:
# Confirm that the amide pattern distinguishes amides (first two examples)
# from lactams (last two examples).

# The pattern is identical for every molecule, so compile it once before the loop.
amide = Chem.MolFromSmarts('[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]')

for m in test_la:
    smi = m
    mol = Chem.MolFromSmiles(smi)
    amide_atom = list(mol.GetSubstructMatches(amide, uniquify=True))
    # Shift the 0-based RDKit indices to 1-based atom numbers.
    ami = ChangeStartIdx(amide_atom)
    ami_site = len(ami)
    print('amide site count:', ami_site, '\n', ami)

amide site count: 1 
 [[4, 5, 3]]
amide site count: 1 
 [[3, 4, 5]]
amide site count: 0 
 []
amide site count: 0 
 []


In [29]:
# Check the carbonate pattern. It has no ring constraint, so cyclic carbonates
# would be included as well; only one carbonate example is tested here.
# NOTE(review): the original comment was ambiguous - presumably no cyclic
# carbonate example was available to test; confirm.
smi = 'CCOC(=O)OC1=C(C(=O)N[C@@]11CC[C@@H](CC1)OC)C1=C(C)C=CC(C)=C1'
mol = Chem.MolFromSmiles(smi)

carbonic = Chem.MolFromSmarts('[#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])]')

# Collect the matches and shift the 0-based indices to 1-based atom numbers.
carbonic_atom = list(mol.GetSubstructMatches(carbonic, uniquify=True))
carb = ChangeStartIdx(carbonic_atom)
carb_site = len(carb)

print('carbonate site count:', carb_site, '\n', carb)

carbonate site count: 1 
 [[2, 3, 4, 5, 6, 7]]


In [31]:
# Check the urea pattern. It has no ring constraint, so cyclic ureas are included:
# the first example is cyclic, the second is non-cyclic.
test_u = [
    'CC(C)C1=CC=C(C=C1)N1CCC(C)N(C)C1=O',
    'CC(C)C1=CC=C(NC(=O)N(C)C)C=C1',
]

# The pattern is identical for every molecule, so compile it once before the loop.
urea = Chem.MolFromSmarts('[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])]')

for m in test_u:
    smi = m
    mol = Chem.MolFromSmiles(smi)
    urea_atom = list(mol.GetSubstructMatches(urea, uniquify=True))
    # Shift the 0-based RDKit indices to 1-based atom numbers.
    ur = ChangeStartIdx(urea_atom)
    ur_site = len(ur)
    print('urea site count:', ur_site, '\n', ur)

urea site count: 1 
 [[10, 17, 18, 15]]
urea site count: 1 
 [[8, 9, 10, 11]]
