# Retrieving CAS registry numbers

In [1]:
import re
import pubchempy as pcp

Enable debug logging to make it easier to see what is going on:

In [2]:
import logging

# Install a default handler on the root logger. Without any handler, Python's
# last-resort handler only emits WARNING and above, so the DEBUG records
# enabled below would never be displayed. basicConfig() does nothing if the
# root logger already has handlers, so re-running this cell is safe.
logging.basicConfig()

# Only raise the verbosity of the 'pubchempy' logger; other loggers keep their levels.
logging.getLogger('pubchempy').setLevel(logging.DEBUG)

A function to get the CAS registry numbers for compounds with a particular SMILES substructure:

In [3]:
# Shape of a CAS registry number: 2-7 digits, 2 digits and 1 check digit,
# separated by hyphens. The trailing (?!\d) rejects longer digit runs such as
# '123-45-6789', which would otherwise be truncated into a bogus '123-45-6'.
# A raw string is used so that '\d' is not treated as an (invalid) string escape.
_CAS_RN_PATTERN = re.compile(r'(\d{2,7}-\d\d-\d)(?!\d)')


def get_substructure_cas(smiles):
    """Return CAS registry numbers for compounds containing a SMILES substructure.

    Performs a substructure search via ``pcp.get_synonyms`` and collects every
    synonym that begins with a CAS-RN-shaped token.

    Args:
        smiles: SMILES string describing the substructure to search for.

    Returns:
        list of str: The matched CAS registry numbers, in the order in which
        the synonyms were returned. Duplicates are kept, and the check digit
        is not validated.
    """
    results = pcp.get_synonyms(smiles, 'smiles', searchtype='substructure')
    cas_rns = []
    for result in results:
        # Treat a missing or empty 'Synonym' entry as "no synonyms".
        for syn in result.get('Synonym') or []:
            match = _CAS_RN_PATTERN.match(syn)
            if match:
                cas_rns.append(match.group(1))
    return cas_rns

Test some inputs:

In [4]:
# Lead: report how many CAS RNs were found, then preview the first ten.
cas_rns = get_substructure_cas('[Pb]')
print(len(cas_rns), cas_rns[:10], sep='\n')

856
['7439-92-1', '54076-28-7', '14452-81-4', '1317-36-8', '79120-33-5', '78-00-2', '10099-74-8', '301-04-2', '1314-87-0', '12179-39-4']


In [5]:
# Selenium: report how many CAS RNs were found, then preview the first ten.
cas_rns = get_substructure_cas('[Se]')
print(len(cas_rns), cas_rns[:10], sep='\n')

3672
['7783-00-8', '10102-18-8', '10102-18-8', '14013-56-0', '1464-42-2', '2578-28-1', '7782-49-2', '630-10-4', '60940-34-3', '7446-08-4']


In [6]:
# Titanium: report how many CAS RNs were found, then preview the first ten.
cas_rns = get_substructure_cas('[Ti]')
print(len(cas_rns), cas_rns[:10], sep='\n')

875
['13463-67-7', '1317-70-0', '1317-80-2', '98084-96-9', '7440-32-6', '14067-04-0', '546-68-9', '68585-67-1', '1271-19-8', '12035-95-9']


In [7]:
# Palladium: report how many CAS RNs were found, then preview the first ten.
cas_rns = get_substructure_cas('[Pd]')
print(len(cas_rns), cas_rns[:10], sep='\n')

846
['7647-10-1', '7440-05-3', '19168-23-1', '10025-98-6', '13782-33-7', '14323-43-4', '16919-73-6', '12125-22-3', '3375-31-3', '19807-27-3']


A substructure search that returns too many results may raise a `TimeoutError`. In that case, it can be better to perform the substructure search first and then retrieve the synonyms in a separate step:

In [8]:
# Substructure search only: collect the CIDs of the matching compounds.
cids = pcp.get_cids('[Pd]', namespace='smiles', searchtype='substructure')

You can then pass the resulting list of CIDs to `pcp.get_synonyms(cids)` to retrieve the synonyms.