# Retrieving CAS registry numbers

In [1]:
import re
import pubchempy as pcp

Enable debug logging to make it easier to see what is going on:

In [2]:
import logging

# Setting the logger level alone does not display anything: when no handler is
# configured, Python falls back to a last-resort handler that only emits
# WARNING and above, so DEBUG records are dropped. basicConfig() attaches a
# stderr handler to the root logger and does nothing if one already exists.
logging.basicConfig()
logging.getLogger('pubchempy').setLevel(logging.DEBUG)

A function to get the CAS registry numbers for compounds with a particular SMILES substructure:

In [3]:
def get_substructure_cas(smiles):
    """Return CAS registry numbers of compounds containing a SMILES substructure.

    Runs a substructure search through ``pcp.get_synonyms`` and keeps every
    synonym that is written in the CAS registry number format: 2-7 digits,
    2 digits and 1 digit, separated by hyphens.

    Args:
        smiles: SMILES string describing the substructure to search for.

    Returns:
        A list of CAS registry number strings in the order they occur in the
        results. A number is appended once per occurrence, so the list may
        contain duplicates.
    """
    # Compiled once, outside the loops. The final check digit is only matched
    # by format here; it is not validated.
    cas_pattern = re.compile(r'\d{2,7}-\d\d-\d')
    cas_rns = []
    results = pcp.get_synonyms(smiles, 'smiles', searchtype='substructure')
    for result in results:
        # Entries without a 'Synonym' key contribute nothing.
        for syn in result.get('Synonym', []):
            # fullmatch rather than match: the whole synonym must be a CAS
            # number. A prefix match would truncate longer strings such as
            # '2019-01-15' into the bogus number '2019-01-1'.
            if cas_pattern.fullmatch(syn):
                cas_rns.append(syn)
    return cas_rns

Test some inputs:

In [4]:
# Lead: show how many CAS numbers were found, then a preview of the first ten.
cas_rns = get_substructure_cas('[Pb]')
print(len(cas_rns), cas_rns[:10], sep='\n')

895
['7439-92-1', '54076-28-7', '301-04-2', '15347-57-6', '78-00-2', '1314-87-0', '12179-39-4', '1317-36-8', '79120-33-5', '7446-14-2']


In [5]:
# Selenium: show how many CAS numbers were found, then a preview of the first ten.
cas_rns = get_substructure_cas('[Se]')
print(len(cas_rns), cas_rns[:10], sep='\n')

3719
['2514685-52-8', '61216-56-6', '2350285-21-9', '239127-62-9', '289702-60-9', '85721-35-3', '178698-98-1', '26699-19-4', '167960-76-1', '24687-35-2']


In [6]:
# Titanium: show how many CAS numbers were found, then a preview of the first ten.
cas_rns = get_substructure_cas('[Ti]')
print(len(cas_rns), cas_rns[:10], sep='\n')

968
['13463-67-7', '1317-70-0', '1317-80-2', '98084-96-9', '1309-63-3', '7440-32-6', '14067-04-0', '1271-19-8', '1271-19-8', '546-68-9']


In [7]:
# Palladium: show how many CAS numbers were found, then a preview of the first ten.
cas_rns = get_substructure_cas('[Pd]')
print(len(cas_rns), cas_rns[:10], sep='\n')

963
['7647-10-1', '7440-05-3', '19168-23-1', '13820-55-8', '16919-73-6', '3375-31-3', '19807-27-3', '1314-08-5', '14221-01-3', '12107-56-1']


A substructure search that returns too many results can fail with a TimeoutError. If that happens, it may be better to split the work into two steps: perform the substructure search first, and then get the synonyms separately:

In [8]:
# Run only the substructure search here and keep the resulting CIDs.
cids = pcp.get_cids(
    '[Pd]',
    'smiles',
    searchtype='substructure',
)

Then you can do `pcp.get_synonyms(cids)` with the list of CIDs.