### Get isomers for all compounds in evaluation sets

In [1]:
import pubchempy as pcp
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem.inchi import *
from tqdm import tqdm 
tqdm.pandas()
def get_formula_from_inchi(inchi):
    """Return the molecular-formula field of an InChI string.

    The InChI is split on '/' and the second field (index 1) is returned,
    e.g. ``'InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3'`` -> ``'C2H6O'``.

    Raises:
        IndexError: if ``inchi`` contains no '/' separator.
    """
    layers = inchi.split('/')
    # layers[0] is the 'InChI=...' version prefix; the next field is the formula.
    return layers[1]

def find_bad_str(string):
    """Return True when ``string`` contains no '.' character, else False.

    Note that, despite the name, a True result means the string is free of
    '.', i.e. nothing "bad" was found.
    """
    dot_position = string.find('.')
    # str.find reports a missing substring with a negative index (-1).
    return dot_position < 0
     
def get_isomers_for_formula(formula,inchi,output_path):
    """Write the PubChem isomer candidates for ``formula`` to ``output_path``.

    Steps:
      1. Look up all PubChem CIDs for ``formula`` and download their InChI
         strings as CSV.
      2. Convert every InChI to an RDKit molecule, drop rows for which no
         molecule was obtained, strip hydrogens and generate isomeric SMILES.
      3. Keep only rows whose SMILES contains no '.' (see ``find_bad_str``)
         and drop duplicate InChI strings.
      4. Make sure the query compound itself (``inchi``) is in the list,
         appending it under its PubChem CID when it is missing.
      5. Write the ``CID`` and ``InChI`` columns to ``output_path`` as a
         ';'-separated CSV and print the number of candidates.

    Args:
        formula: molecular formula used for the PubChem formula search.
        inchi: InChI of the query compound; always included in the output.
        output_path: destination of the final ';'-separated CSV file.

    Raises:
        IndexError: if the CID lookup for ``inchi`` yields an empty result
            (assumes ``pcp.get_cids`` returns a list).
    """
    import os
    import tempfile

    cmpds = pcp.get_cids(formula, 'formula')

    # Download the raw (comma-separated) PubChem table into a scratch
    # directory instead of ``output_path``: if any later step fails, no
    # half-processed file with the wrong schema is left at the destination,
    # where an "already exists" check would mistake it for a finished result.
    with tempfile.TemporaryDirectory() as scratch_dir:
        raw_path = os.path.join(scratch_dir, 'pubchem_inchi.csv')
        pcp.download('CSV', raw_path, cmpds, operation='property/InChi', overwrite=True)
        data = pd.read_csv(raw_path, sep=',')

    data['mol'] = data['InChI'].progress_apply(MolFromInchi)
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice (avoids SettingWithCopy warnings).
    data = data.dropna(subset=['mol']).copy()
    data['mol'] = data['mol'].progress_apply(Chem.RemoveHs)
    data['smiles'] = data['mol'].progress_apply(Chem.MolToSmiles, isomericSmiles=True)

    # find_bad_str returns True for SMILES without '.', so True means "keep".
    keep_mask = data['smiles'].progress_apply(find_bad_str).astype(bool)
    data = data[keep_mask].drop_duplicates(subset='InChI')

    # The query compound must always be among the candidates.
    cid = pcp.get_cids(inchi, 'inchi')[0]
    if not (data['CID'] == cid).any():
        query_row = pd.DataFrame({'CID': [cid], 'InChI': [inchi]})
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        data = pd.concat([data, query_row])

    # Renumber rows 0..n-1; the fresh index is written as the first CSV column.
    data = data.reset_index(drop=True)
    data.to_csv(output_path, sep=';', columns=['CID', 'InChI'])
    print(len(data))


  from pandas import Panel


In [5]:
    %%capture
    #specify path to eval sets
    import os
    eval_dir  = '../data/eval_sets/'
    #for file in os.listdir(eval_dir):
    file='unique_Eawag_XBridgeC18.csv'
    name = file.split('.')[0].replace('unique_','')
    os.makedirs( '../data/isomers/'+name,exist_ok=True)
    data = pd.read_csv(eval_dir+file, sep=';')
    data['formula'] = data.InChI.progress_apply(get_formula_from_inchi)
    for i in range (len(data)):
        if (data.Name[i]+'.csv' in os.listdir('../data/isomers/'+name+'/'))==False:
            get_isomers_for_formula(data.formula[i], data.InChI[i], '../data/isomers/'+name+'/'+data.Name[i]+'.csv')