### Generation of predictions using CTS
- Created by: Louis Groff
- PIs: Imran Shah and Grace Patlewicz
- Last modified: 4 March 2024
- Changes made: Test sample SMILES notebook to verify functionality

CTS can be run using its REST API at https://qed.epa.gov/cts/rest. The API is queried via a POST call with Python’s requests package, where the url parameter is: 
https://qed.epa.gov/cts/rest/metabolizer/run and the POST body with the model parameters (the json keyword argument of requests.post) is given as a dictionary with three parameters, “structure” (SMILES string), “generationLimit” (integer value between 1-4 to indicate desired transformation depth), and “transformationLibraries” (indicates model choice). The ChemAxon Human Phase I metabolizer is chosen by setting “transformationLibraries” to [“mammalian_metabolism”]. Thus the json parameter with the POST dictionary structure is:
{"structure": <qsar_ready_smiles>, "generationLimit": <1-4>, "transformationLibraries": ["mammalian_metabolism"]}
The generationLimit of 3 was chosen because the current CTS REST API (as of November 2023) is prone to running out of memory for large metabolism trees that approach the memory allocation limit of the API at 3 generations. Workarounds and solutions to this issue are in progress, but it is recommended to limit the transformation depth between 1-3 generations for the greatest number of successfully returned metabolism queries.

Import relevant libraries

In [3]:
import os
import pandas as pd
import numpy as np
import requests
import time
from rdkit import Chem
import datetime
import pprint
import json

In [8]:
def cts_chemaxon_phaseI(smiles = None, gens = 0):
    """
    Query the CTS ChemAxon Human Phase I metabolizer through its REST API.

    Inputs:
    smiles: SMILES string sent as the "structure" value of the POST JSON dict
    gens: integer number of metabolism cycles sent as "generationLimit"; must be between 1 and 4

    Returns:
    The decoded JSON body of the API response, or None when no SMILES is given,
    when gens is outside 1-4, or when the request or the JSON decoding fails.
    """
    if smiles is None:
        return None
    if not 0 < gens < 5:
        print('Generations must be between 1 and 4.')
        return None

    # Imported only after the inputs are validated, so rejecting bad input never depends on the HTTP stack.
    import requests
    import time

    url = 'https://qed.epa.gov/cts/rest/metabolizer/run'
    post = {"structure": smiles, "generationLimit": gens, "transformationLibraries": ["mammalian_metabolism"]}
    time.sleep(0.5)  # short pause before every request

    try:
        # NOTE(review): verify=False disables TLS certificate verification. It is kept to
        # preserve the existing behaviour; confirm whether it is still needed.
        output = requests.post(url = url, json = post, verify=False).json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection/HTTP failures; ValueError covers a body that is not
        # valid JSON. KeyboardInterrupt and programming errors are deliberately not swallowed.
        print('CTS API failed to generate metabolites.')
        return None

    print('CTS API successfully returned metabolism predictions for '+str(gens)+' cycles of Phase I metabolism for input SMILES: '+str(smiles))
    return output
    
def _metsim_chem_record(smiles = None, likelihood = None):
    """Build a MetSim chemical record; every identifier other than SMILES starts as None."""
    return {'smiles': smiles,
            'inchikey': None,
            'casrn': None,
            'hcd_smiles': None,
            'dtxsid': None,
            'chem_name': None,
            'likelihood': likelihood
           }


def _metsim_placeholder_entry(precursor_smiles):
    """Build an output entry for a precursor without metabolites (one all-None successor)."""
    return {'precursor': _metsim_chem_record(smiles = precursor_smiles),
            'successors': [{'enzyme': None,
                            'mechanism': None,
                            'generation': None,
                            'metabolite': _metsim_chem_record()
                           }]
           }


def _metsim_rdkit_inchikey(smiles):
    """Return the RDKit InChIKey for a SMILES string, or None (after logging) if it cannot be generated."""
    try:
        inchikey = Chem.inchi.MolToInchiKey(Chem.MolFromSmiles(smiles))
    except Exception:
        # Best-effort metadata only: an unparsable or missing SMILES must not abort the run.
        print('RDKit InChIKey generation failed.')
        return None
    print('RDKit InChIKey generation for input SMILES: '+str(smiles)+' succeeded.')
    return inchikey


def metsim_cts_phaseI_out(in_smiles = None, cts_data = None, depth = 0, out_dict = None):
    """
    Run CTS Human Phase I metabolism simulator, then recursively process the CTS output into metsim hierarchy structure.

    inputs:
    in_smiles: input SMILES string
    cts_data: CTS output JSON dictionary; when None, the CTS API is queried with in_smiles and depth
    depth: Transformation depth in terms of integer number of generations (1-4)
    out_dict: MetSim output dictionary; when None, a new one is created (recursive calls pass it along)

    returns:
    out_dict with an 'api_success' flag and one 'output' entry per precursor that has metabolites.
    If the API call fails, or succeeds without metabolites, a single placeholder entry with an
    all-None successor is appended instead.
    """

    #generate initial output dictionary hierarchy
    if out_dict is None:
        out_dict = {'datetime': str(datetime.datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss')),
                    'software': 'EPA Chemical Transformation Simulator',
                    'version': '1.3.2.2',
                    'params': {'depth': depth,
                               'organism': 'human',
                               'site_of_metabolism': False,
                               'model': 'ChemAxon Human Phase I'
                              },
                    'input': {'smiles': in_smiles,
                              'inchikey': None,
                              'casrn': None,
                              'hcd_smiles': None,
                              'dtxsid': None,
                              'chem_name': None
                             },
                    'output': []
                   }

    #if no CTS output data exists, query the API
    if cts_data is None:
        cts_data = cts_chemaxon_phaseI(smiles = in_smiles, gens = depth)

        #if the API call fails and yields no output
        if cts_data is None or 'error' in cts_data:
            print('CTS API call yielded no output data.')
            out_dict['api_success'] = False
            input_smiles = out_dict['input']['smiles']
            #At least supplement metadata with inchikey from RDKit if possible
            inchikey = _metsim_rdkit_inchikey(input_smiles)
            if inchikey is not None:
                out_dict['input']['inchikey'] = inchikey
            out_dict['output'].append(_metsim_placeholder_entry(input_smiles))
            return out_dict

    #a dictionary without a "children" key is unwrapped through its "data" key and processed again
    if 'children' not in cts_data:
        return metsim_cts_phaseI_out(in_smiles = None, cts_data = cts_data['data'], depth = depth, out_dict = out_dict)

    #successful API call will have a key in the output dictionary for "children"
    print('children key found in dictionary')
    node = cts_data['data']
    children = cts_data['children']
    out_dict['api_success'] = True

    if not children:
        #no children returned, but successful API call:
        print('CTS API call successful. No metabolites produced for SMILES: '+str(out_dict['input']['smiles']))
        # Append first: the InChIKey is written to output[0], which does not exist
        # until this entry is added when the input chemical has no metabolites.
        out_dict['output'].append(_metsim_placeholder_entry(node['smiles']))
        inchikey = _metsim_rdkit_inchikey(out_dict['input']['smiles'])
        if inchikey is not None:
            out_dict['output'][0]['precursor']['inchikey'] = inchikey
        return out_dict

    #metabolites are present in this level, store their information
    print(str(len(children))+' children found for precursor generation level '+str(node['generation']))
    successors = [{'enzyme': None,
                   'mechanism': child['data']['routes'],
                   'generation': child['data']['generation'],
                   'metabolite': _metsim_chem_record(smiles = child['data']['smiles'],
                                                     likelihood = child['data']['likelihood'])
                  } for child in children]
    out_dict['output'].append({'precursor': _metsim_chem_record(smiles = node['smiles'],
                                                                likelihood = node['likelihood']),
                               'successors': successors
                              })
    print('precursor-successor relationships appended for current generational level.')

    #each child that has its own children becomes a precursor in the next generational level
    for child in children:
        if len(child['children']) > 0:
            print('children found in next generational level, recursing...')
            out_dict = metsim_cts_phaseI_out(in_smiles = None, cts_data = child, depth = depth, out_dict = out_dict)
    return out_dict

### Testing the function with a QSAR-Ready SMILES from the 112 dataset

In [12]:
#cts_chemaxon_phaseI('O=C1CCC2=CC=C(C=C2N1)OCCCCN1CCN(CC1)C1C=CC=C(Cl)C=1Cl', 3)

Functions to query the Standardizer and CCD

In [18]:
import urllib.request

In [19]:
#Function 1: HCD Queries
def _metsim_get_json(url, attempt_msg, retry_msg, delay = 0.0, max_tries = 3):
    """GET a URL and decode its JSON body, retrying up to max_tries times; returns [] if every attempt fails."""
    for _ in range(max_tries):
        print(attempt_msg)
        try:
            if delay:
                time.sleep(delay)
            # The context manager closes the HTTP response once it has been read.
            with urllib.request.urlopen(url) as response:
                payload = json.loads(response.read().decode())
        except Exception:
            #Given that this occasionally fails randomly due to timeout errors,
            #but then works again later, try again after a 0.5 second pause.
            print(retry_msg)
            time.sleep(0.5)
            continue
        # Reported only after a request has actually succeeded.
        print('Query succeeded.')
        return payload
    return []


def metsim_hcd_out(smiles = None, 
                   casrn = None,
                   dtxsid = None,
                   chem_name = None,
                   likely = None):
    """
    Query function for the Cheminformatics Modules Standardizer API, formerly wrapped within the Hazard Comparison Dashboard (HCD) API. 
    Used to convert an input SMILES string into QSAR-Ready SMILES. Returns InChIKey structural identifier as well,
    along with any other chemical identifer metadata if available, and not already given as inputs (e.g., CASRN, DTXSID, Chemical Name).
    
    If SMILES is not known, but DTXSID is known, can instead query on DTXSID to obtain Daylight SMILES from the Comptox Chemicals Dashboard API (CCD API),
    and subsequently query the Standardizer API using the SMILES obtained from the CCD API.
    
    Each web query is attempted up to three times. If the Standardizer returns nothing, RDKit is used to
    generate the InChIKey from the SMILES where possible.
    
    Required Inputs:
    smiles: Daylight SMILES string
    or
    dtxsid: DSSTox Substance Identifier
    
    Optional Inputs:
    chem_name: Chemical name, whether trade name or IUPAC
    casrn: Chemical Abstracts Services Registry Number
    likely: If MetSim predictions are obtained from the Chemical Transformation Simulator, can optionally keep the transformation "likelihood" parameter
    
    Returns:
    out_dict: Output dictionary with the keys "smiles", "casrn", "hcd_smiles", "inchikey", "dtxsid", "chem_name" and "likelihood".
    "hcd_smiles" contains the QSAR-Ready version of the input SMILES. Values that could not be obtained are None.
    
    Examples:
    
    SMILES given as sole input:
    input:    
    test_dict = metsim_hcd_out(smiles = "OCCOCCO")
    
    output:
    Attempting query of Cheminformatics Modules Standardizer with SMILES: OCCOCCO...
    Query succeeded.
    test_dict
    {'smiles': 'OCCOCCO',
     'casrn': '111-46-6',
     'hcd_smiles': 'OCCOCCO',
     'inchikey': 'MTHSVFCYNBDYFN-UHFFFAOYNA-N',
     'dtxsid': 'DTXSID8020462',
     'chem_name': 'Diethylene glycol',
     'likelihood': None}
    
    DTXSID given as sole input:
    
    input:    
    test_dict = metsim_hcd_out(dtxsid = "DTXSID4020402")
    
    output:
    Attempting query of Comptox Chemicals Dashboard with DTXSID: DTXSID4020402...
    Query succeeded.
    No SMILES given. Using CCD output SMILES.
    Attempting query of Cheminformatics Modules Standardizer with SMILES: CC1=C(N)C=C(N)C=C1...
    Query succeeded.
    test_dict
    {'smiles': 'CC1=C(N)C=C(N)C=C1',
     'casrn': '95-80-7',
     'hcd_smiles': 'CC1C=CC(N)=CC=1N',
     'inchikey': 'VOZKAJLKRJDJLL-UHFFFAOYNA-N',
     'dtxsid': 'DTXSID4020402',
     'chem_name': '2,4-Diaminotoluene',
     'likelihood': None}
     
    Empty inputs:
    input:
    test_dict = metsim_hcd_out(smiles = None, dtxsid = None)
     
    output:
    test_dict
    {'smiles': None,
     'casrn': None,
     'hcd_smiles': None,
     'inchikey': None,
     'dtxsid': None,
     'chem_name': None,
     'likelihood': None}   
    """
    # Imported explicitly: quote_plus lives in urllib.parse, which the file never imports by name.
    import urllib.parse
    
    ccd_out = []
    if dtxsid is not None and smiles is None:
        #get metadata from Comptox Chemicals Dashboard for a given DTXSID (No structure searching atm).
        ccd_url = 'https://comptox.epa.gov/dashboard-api/ccdapp2/chemical-detail/search/by-dsstoxsid?id='+dtxsid
        ccd_out = _metsim_get_json(ccd_url,
                                   'Attempting query of Comptox Chemicals Dashboard with DTXSID: '+dtxsid+'...',
                                   'URL Error Occurred, reattempting CCD query in 0.5 seconds.')
    
    #fall back on the CCD structure when the caller supplied no SMILES
    if smiles is None and len(ccd_out) > 0:
        if ccd_out.get('smiles') is not None:
            print('No SMILES given. Using CCD output SMILES.')
            smiles = ccd_out['smiles']
        else:
            print('No SMILES given, and no SMILES available from CCD output.')
    
    #default record, returned as-is whenever no structure is available to standardize
    out_dict = {'smiles': smiles,
                'casrn': casrn,
                'hcd_smiles': None,
                'inchikey': None,
                'dtxsid': dtxsid,
                'chem_name': chem_name,
                'likelihood': likely
               }
    if smiles is None:
        return out_dict
    
    base_url = "https://hcd.rtpnc.epa.gov/api/stdizer?workflow=qsar-ready&smiles=" #Production environment (current, no VPN needed)
    hcd_url = base_url+urllib.parse.quote_plus(smiles) #URL encode SMILES string.
    hcd_out = _metsim_get_json(hcd_url,
                               'Attempting query of Cheminformatics Modules Standardizer with SMILES: '+smiles+'...',
                               'URL Error Occurred, reattempting Cheminformatics Modules query in 0.5 seconds.',
                               delay = 0.5)
    
    if len(hcd_out) > 0:
        hit = hcd_out[0]
        out_dict['hcd_smiles'] = hit['smiles']
        out_dict['inchikey'] = hit['inchiKey']
        #fill in any identifier the caller did not supply
        if out_dict['dtxsid'] is None:
            hcd_id = hit.get('id')
            if isinstance(hcd_id, str) and 'DTXSID' in hcd_id:
                out_dict['dtxsid'] = hcd_id
            elif len(ccd_out) > 0 and ccd_out.get('dsstoxSubstanceId') is not None:
                out_dict['dtxsid'] = ccd_out['dsstoxSubstanceId']
        if out_dict['casrn'] is None:
            if 'casrn' in hit:
                out_dict['casrn'] = hit['casrn']
            elif len(ccd_out) > 0 and ccd_out.get('casrn') is not None:
                out_dict['casrn'] = ccd_out['casrn']
        if out_dict['chem_name'] is None:
            if len(ccd_out) > 0 and ccd_out.get('preferredName') is not None:
                out_dict['chem_name'] = ccd_out['preferredName']
            elif 'name' in hit:
                out_dict['chem_name'] = hit['name']
        if out_dict['inchikey'] is None and len(ccd_out) > 0:
            if ccd_out.get('inchiKey') is not None:
                out_dict['inchikey'] = ccd_out['inchiKey']
    else:
        #HCD Returns empty list. Try to supplement with metadata from RDKit.
        try:
            smiles_mol = Chem.MolFromSmiles(smiles)
            out_dict['inchikey'] = Chem.inchi.MolToInchiKey(smiles_mol)
            print('RDKit generated InChIKey for SMILES: ',smiles)
        except Exception:
            #Rarely, BioTransformer makes a bad SMILES string for a metabolite, and RDKit can't convert it to an InChIKey. Store None
            print('RDKit failed to generate an inchikey for SMILES: '+smiles)
            out_dict['inchikey'] = None
    return out_dict

In [20]:
 # Example call with the SMILES "OCCOCCO"; the recorded cell output follows below.
 metsim_hcd_out(smiles = "OCCOCCO")

Attempting query of Cheminformatics Modules Standardizer with SMILES: OCCOCCO...
Query succeeded.


{'smiles': 'OCCOCCO',
 'casrn': '111-46-6',
 'hcd_smiles': 'OCCOCCO',
 'inchikey': 'MTHSVFCYNBDYFN-UHFFFAOYNA-N',
 'dtxsid': 'DTXSID8020462',
 'chem_name': 'Diethylene glycol',
 'likelihood': None}

In [21]:
def metsim_metadata_full(metsim_out = None, 
                         fnam = None):
    """
    Supplement metadata via serial HCD query through individual input chemicals, precursors,
    successors/metabolites for a full metsim dataset.

    NOTE: a later definition of metsim_metadata_full in this file (with an added metsim_cache
    parameter) replaces this one once it has been executed.

    Inputs:
    metsim_out: metsim dataset (non-empty list of dictionaries with 'input' and 'output' keys);
                the records are replaced in place with the metsim_hcd_out results
    fnam: optional file name; when given, the whole dataset is written to it as JSON after each processed input chemical

    Returns:
    metsim_out: the same list with updated records

    Raises:
    ValueError: if metsim_out is empty or not supplied
    """
    if not metsim_out:
        raise ValueError('Please supply a metsim dataset (list of dictionaries)')

    def _standardize(record):
        # A record's "likelihood" is carried over when present; None is the metsim_hcd_out default otherwise.
        return metsim_hcd_out(smiles = record['smiles'],
                              casrn = record['casrn'],
                              dtxsid = record['dtxsid'],
                              chem_name = record['chem_name'],
                              likely = record.get('likelihood'))

    n_inputs = len(metsim_out)
    for i, chemical in enumerate(metsim_out): # i = number of input chemicals
        #inputs that already carry an InChIKey are skipped entirely, along with their precursors and metabolites
        if chemical['input']['inchikey'] is not None:
            continue
        chemical['input'] = metsim_hcd_out(smiles = chemical['input']['smiles'],
                                           casrn = chemical['input']['casrn'],
                                           dtxsid = chemical['input']['dtxsid'],
                                           chem_name = chemical['input']['chem_name'])
        n_precursors = len(chemical['output'])
        for j, entry in enumerate(chemical['output']): # j = number of unique precursors
            entry['precursor'] = _standardize(entry['precursor'])
            n_metabolites = len(entry['successors'])
            for k, successor in enumerate(entry['successors']): # k = number of metabolites per precursor
                successor['metabolite'] = _standardize(successor['metabolite'])
                print('input: '+str(i+1)+'/'+str(n_inputs)+' precursor: '+str(j+1)+'/'+str(n_precursors)+' metabolite: '+str(k+1)+'/'+str(n_metabolites))
        if fnam is not None:
            #checkpoint after every input chemical; the context manager closes the file handle
            with open(fnam, 'w') as handle:
                json.dump(metsim_out, handle)
    return metsim_out

In [22]:
def metsim_metadata_full(metsim_out = None, fnam = None, metsim_cache = None):
    """
    Supplement metadata via serial HCD query through individual input chemicals, precursors,
    successors/metabolites for a full metsim dataset, reusing results for SMILES already seen.

    Inputs:
    metsim_out: metsim dataset (non-empty list of dictionaries with 'input' and 'output' keys);
                the records are replaced in place
    fnam: optional file name; when given, the whole dataset is written to it as JSON after each processed input chemical
    metsim_cache: optional list of previously returned metsim_hcd_out records (each with a 'smiles' key);
                  new query results are appended to it. A fresh cache is used when None.

    Returns:
    metsim_out: the same list with updated records

    Raises:
    ValueError: if metsim_out is empty or not supplied
    """
    if not metsim_out:
        raise ValueError('Please supply a metsim dataset (list of dictionaries)')
    if metsim_cache is None:
        metsim_cache = []

    #SMILES -> first cached record with that SMILES, so lookups do not rescan the cache list every time
    cache_index = {}
    for cached in metsim_cache:
        cache_index.setdefault(cached['smiles'], cached)

    def _resolve(record, label, likely):
        smiles = record['smiles']
        if smiles not in cache_index:
            result = metsim_hcd_out(smiles = smiles,
                                    casrn = record['casrn'],
                                    dtxsid = record['dtxsid'],
                                    chem_name = record['chem_name'],
                                    likely = likely)
            metsim_cache.append(result)
            cache_index.setdefault(result['smiles'], result)
            print(label+' query added to metadata cache...')
            return result
        print(label+' SMILES found in cached results. Inserting into dictionary...')
        #Copy the cached identifiers but keep this record's own likelihood: the likelihood belongs to
        #the transformation, not to the chemical, and a shared dict would alias several records.
        hit = dict(cache_index[smiles])
        hit['likelihood'] = likely
        return hit

    n_inputs = len(metsim_out)
    for i, chemical in enumerate(metsim_out): # i = number of input chemicals
        #inputs that already carry an InChIKey are skipped entirely, along with their precursors and metabolites
        if chemical['input']['inchikey'] is not None:
            continue
        chemical['input'] = _resolve(chemical['input'], 'Input', None)
        n_precursors = len(chemical['output'])
        for j, entry in enumerate(chemical['output']): # j = number of unique precursors
            entry['precursor'] = _resolve(entry['precursor'], 'Precursor', entry['precursor'].get('likelihood'))
            n_metabolites = len(entry['successors'])
            for k, successor in enumerate(entry['successors']): # k = number of metabolites per precursor
                successor['metabolite'] = _resolve(successor['metabolite'], 'Successor metabolite', successor['metabolite'].get('likelihood'))
                print('input: '+str(i+1)+'/'+str(n_inputs)+' precursor: '+str(j+1)+'/'+str(n_precursors)+' metabolite: '+str(k+1)+'/'+str(n_metabolites))
        if fnam is not None:
            #checkpoint after every input chemical; the context manager closes the file handle
            with open(fnam, 'w') as handle:
                json.dump(metsim_out, handle)
    return metsim_out