In [11]:
%%time
# visualizing patterns in FreeSolv...




pattern_smiles = "c1cc(O)c(O)cc1CCN"
target_smiles = "c1c(O)c(O)c(Cl)cc1CCCBr"

from openeye import oechem
def smiles_to_mol(smiles):
    mol = oechem.OEGraphMol()
    oechem.OESmilesToMol(mol, smiles)
    return mol
    
def get_substructure_matches(pattern_smiles, target_smiles):
    pattern = smiles_to_mol(pattern_smiles)
    target = smiles_to_mol(target_smiles)

    atomexpr = oechem.OEExprOpts_DefaultAtoms
    bondexpr = oechem.OEExprOpts_DefaultBonds
    # create maximum common substructure object
    mcss = oechem.OEMCSSearch(pattern, atomexpr, bondexpr, oechem.OEMCSType_Approximate)
    # set scoring function
    mcss.SetMCSFunc(oechem.OEMCSMaxAtoms())
    mcss.SetMinAtoms(max(2, min(len(list(pattern.GetAtoms())), len(list(target.GetAtoms())))))
    unique = True
    # loop over matches
    
    subset_mols = []
    for count, match in enumerate(mcss.Match(target, unique)):

        # create match subgraph
        m = oechem.OEGraphMol()
        oechem.OESubsetMol(m, match, True)
        
        subset_mols.append(m)
    return subset_mols

get_substructure_matches(pattern_smiles, target_smiles)

CPU times: user 905 µs, sys: 605 µs, total: 1.51 ms
Wall time: 1.3 ms


In [None]:
oechem

In [2]:
pattern = oechem.OEGraphMol()
oechem.OESmilesToMol(pattern, pattern_smiles)
len(list(pattern.GetAtoms()))

11

In [1]:
path = "../../../MSKCC/Chodera Lab/feedstock/FreeSolv-0.51/database.txt"
from pickle import load
with open(path, 'r') as f:
    freesolv = f.read()
db = []
for entry in freesolv.split('\n')[3:-1]:
    db.append(entry.split('; '))

In [2]:
db[0]

['mobley_1017962',
 'CCCCCC(=O)OC',
 'methyl hexanoate',
 '-2.49',
 '0.60',
 '-3.30',
 '0.03',
 '10.1021/ct050097l',
 '10.1021/ct800409d',
 'Experimental uncertainty not presently available, so assigned a default value.  ']

In [6]:
for i in range(len(db)):
    if 'ketoprofen' in db[i][2]:
        print(db[i])

['mobley_2099370', 'C[C@@H](c1cccc(c1)C(=O)c2ccccc2)C(=O)O', 'ketoprofen', '-10.78', '0.18', '-17.24', '0.06', '10.1007/s10822-010-9350-8', '10.1007/s10822-010-9343-7', 'Renamed mobley_2099370 from ketoprofen to (2S)-2-(3-benzoylphenyl)propanoic acid (the name from the 3D structure) since the former did not completely specify stereochemistry.  ']


In [4]:
smiles_list = [entry[1] for entry in db]

In [5]:
from tqdm import tqdm
subsets = {}

for i in tqdm(range(len(smiles_list))):
    for j in range(len(smiles_list)):
        subsets[(i,j)] = get_substructure_matches(smiles_list[i], smiles_list[j])

100%|██████████| 643/643 [02:03<00:00,  5.22it/s]


In [6]:
len(subsets) / (len(smiles_list)**2)

1.0

In [15]:
mol = subsets[(0,0)][0]
len(list(mol.GetAtoms()))

9

In [14]:
mol = smiles_to_mol(smiles_list[0])
len(list(mol.GetAtoms()))

9

In [17]:
mol_sizes = [len(list(smiles_to_mol(s).GetAtoms())) for s in smiles_list]

In [19]:
len(list(smiles_to_mol(smiles_list[0]).GetAtoms()))

0

In [21]:
mol = smiles_to_mol(smiles_list[0])
len(list(mol.GetAtoms()))

9

In [18]:
mol_sizes

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
