# Retrieving CAS registry numbers

In [1]:
import re
import pubchempy as pcp

Enable debug logging to make it easier to see what is going on:

In [2]:
import logging

# Raise the verbosity of the 'pubchempy' logger only, so each request URL and
# payload is logged while other loggers keep their current levels.
# NOTE(review): this sets a level but installs no handler - if no DEBUG lines
# show up on a fresh kernel, logging.basicConfig() may need to be called first.
logging.getLogger('pubchempy').setLevel(logging.DEBUG)

A function to get the CAS registry numbers for compounds with a particular SMILES substructure:

In [3]:
def get_substructure_cas(smiles):
    """Return CAS registry numbers for compounds containing a SMILES substructure.

    Runs a substructure search through ``pcp.get_synonyms`` and scans every
    returned synonym for text that starts with a CAS-number-like pattern:
    2-7 digits, 2 digits and 1 digit, separated by hyphens. Only the number
    itself is returned, so a synonym such as ``'10038-97-8 (dihydrate)'``
    contributes ``'10038-97-8'``.

    :param smiles: SMILES string of the substructure to search for.
    :returns: List of CAS registry number strings in the order they were
        found. Duplicates are kept, because the same number can be listed
        under several synonyms or compounds.
    """
    # Raw string keeps the backslash sequences intact for the regex engine
    # without relying on Python tolerating an invalid string escape. The
    # trailing negative lookahead rejects longer digit runs (for example
    # '1234-56-789') instead of truncating them into a bogus CAS number.
    cas_pattern = re.compile(r'(\d{2,7}-\d\d-\d)(?!\d)')
    cas_rns = []
    results = pcp.get_synonyms(smiles, 'smiles', searchtype='substructure')
    for result in results:
        # Fall back to an empty list when a result has no 'Synonym' key.
        for syn in result.get('Synonym', []):
            match = cas_pattern.match(syn)
            if match:
                cas_rns.append(match.group(1))
    return cas_rns

Test some inputs:

In [4]:
# '[Pb]' is the SMILES for a single lead atom.
get_substructure_cas('[Pb]')

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON
DEBUG:pubchempy:Request data: smiles=%5BPb%5D
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/2011525948783601866/synonyms/JSON
DEBUG:pubchempy:Request data: None


[u'7439-92-1',
 u'15875-18-0',
 u'54076-28-7',
 u'14701-27-0',
 u'15158-12-0',
 u'52229-97-7',
 u'724427-66-1',
 u'598-63-0',
 u'13427-42-4',
 u'17398-75-3',
 u'25510-11-6',
 u'64470-66-2',
 u'811-54-1',
 u'64-18-6',
 u'7056-83-9',
 u'6080-56-4',
 u'301-04-2',
 u'301-04-2',
 u'15347-57-6',
 u'6080-56-4',
 u'17856-14-3',
 u'37280-55-0',
 u'13302-14-2',
 u'1335-32-6',
 u'1067-14-7',
 u'15710-47-1',
 u'1153-06-6',
 u'5711-19-3',
 u'2587-81-7',
 u'10099-74-8',
 u'18256-98-9',
 u'75-74-1',
 u'1309-60-0',
 u'60525-54-4',
 u'1314-87-0',
 u'12179-39-4',
 u'12179-39-4',
 u'39377-56-5',
 u'51682-73-6',
 u'1317-36-8',
 u'12359-23-8',
 u'1309-59-7',
 u'7446-14-2',
 u'15739-80-7',
 u'52732-72-6',
 u'12673-93-7',
 u'37223-83-9',
 u'37224-42-3',
 u'37251-28-8',
 u'61869-44-1',
 u'90583-07-6',
 u'7758-95-4',
 u'12612-47-4',
 u'145763-27-5',
 u'7758-97-6',
 u'11119-70-3',
 u'15804-54-3',
 u'7758-97-6',
 u'15804-54-3',
 u'181768-98-9',
 u'8049-64-7',
 u'10031-22-8',
 u'12646-11-6',
 u'85941-57-7',
 u'13

In [5]:
# '[Se]' is the SMILES for a single selenium atom.
get_substructure_cas('[Se]')

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON
DEBUG:pubchempy:Request data: smiles=%5BSe%5D
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/868321497759353393/synonyms/JSON
DEBUG:pubchempy:Request data: None


[u'10102-18-8',
 u'26970-82-1',
 u'15498-87-0',
 u'7782-82-3',
 u'14013-56-0',
 u'14013-56-0',
 u'29528-97-0',
 u'50647-14-8',
 u'7782-49-2',
 u'11125-23-8',
 u'11133-88-3',
 u'12640-29-8',
 u'12640-30-1',
 u'12641-96-2',
 u'12733-65-2',
 u'37256-19-2',
 u'37258-85-8',
 u'37276-15-6',
 u'37368-02-8',
 u'50954-17-1',
 u'51882-60-1',
 u'95788-45-7',
 u'7446-34-6',
 u'56093-45-9',
 u'66732-34-1',
 u'26970-82-1',
 u'7488-56-4',
 u'12299-46-6',
 u'7783-00-8',
 u'11140-60-6',
 u'1408288-83-4',
 u'60940-34-3',
 u'83705-13-9',
 u'26046-90-2',
 u'630-10-4',
 u'7446-08-4',
 u'12397-00-1',
 u'13814-59-0',
 u'15702-34-8',
 u'13410-01-0',
 u'10112-94-4',
 u'10102-23-5',
 u'1482-82-2',
 u'3211-76-5',
 u'2697-61-2',
 u'3542-13-0',
 u'85539-83-9',
 u'2578-28-1',
 u'1464-42-2',
 u'7246-06-2',
 u'13091-98-0',
 u'1464-43-3',
 u'2897-21-4',
 u'13900-89-5',
 u'26932-45-6',
 u'6996-92-5',
 u'4671-93-6',
 u'2574-71-2',
 u'28274-57-9',
 u'89780-24-5',
 u'153871-75-1',
 u'81743-90-0',
 u'1345257-36-4',
 u'1666

In [6]:
# '[Ti]' is the SMILES for a single titanium atom.
get_substructure_cas('[Ti]')

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON
DEBUG:pubchempy:Request data: smiles=%5BTi%5D
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/625765605323688825/synonyms/JSON
DEBUG:pubchempy:Request data: None


[u'13463-67-7',
 u'1317-80-2',
 u'1317-70-0',
 u'98084-96-9',
 u'100292-32-8',
 u'101239-53-6',
 u'116788-85-3',
 u'12000-59-8',
 u'12036-20-3',
 u'12701-76-7',
 u'12767-65-6',
 u'12789-63-8',
 u'1309-63-3',
 u'1344-29-2',
 u'158518-86-6',
 u'185323-71-1',
 u'185828-91-5',
 u'188357-76-8',
 u'188357-79-1',
 u'195740-11-5',
 u'221548-98-7',
 u'224963-00-2',
 u'246178-32-5',
 u'252962-41-7',
 u'37230-92-5',
 u'37230-94-7',
 u'37230-95-8',
 u'37230-96-9',
 u'39320-58-6',
 u'39360-64-0',
 u'39379-02-7',
 u'416845-43-7',
 u'494848-07-6',
 u'494848-23-6',
 u'494851-77-3',
 u'494851-98-8',
 u'55068-84-3',
 u'55068-85-4',
 u'552316-51-5',
 u'62338-64-1',
 u'767341-00-4',
 u'97929-50-5',
 u'7440-32-6',
 u'14067-04-0',
 u'15749-33-4',
 u'11147-83-4',
 u'12718-64-8',
 u'12794-00-2',
 u'182260-48-6',
 u'195161-81-0',
 u'37246-34-7',
 u'37246-36-9',
 u'37246-37-0',
 u'37261-68-0',
 u'37269-05-9',
 u'37301-57-8',
 u'37333-92-9',
 u'53549-90-9',
 u'54319-51-6',
 u'57854-37-2',
 u'62650-70-8',
 u'6779

In [7]:
# '[Pd]' is the SMILES for a single palladium atom.
get_substructure_cas('[Pd]')

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON
DEBUG:pubchempy:Request data: smiles=%5BPd%5D
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/2117094829429603422/synonyms/JSON
DEBUG:pubchempy:Request data: None


[u'7440-05-3',
 u'17637-99-9',
 u'53092-86-7',
 u'7647-10-1',
 u'10038-97-8',
 u'10102-05-3',
 u'14846-30-1',
 u'884739-77-9',
 u'3375-31-3',
 u'19807-27-3',
 u'125089-63-6',
 u'138823-54-8',
 u'24175-85-7',
 u'33571-36-7',
 u'3376-66-7',
 u'4854-43-7',
 u'31277-98-2',
 u'1314-08-5',
 u'11113-77-2',
 u'51364-51-3',
 u'52409-22-0',
 u'10025-98-6',
 u'16919-73-6',
 u'1307-79-5',
 u'19662-89-6',
 u'132901-05-4',
 u'77839-67-9',
 u'87936-24-1',
 u'84180-62-1',
 u'13601-08-6',
 u'13815-17-3',
 u'61495-96-3',
 u'68413-68-3',
 u'84206-78-0',
 u'7647-10-1',
 u'10038-97-8',
 u'74091-55-7',
 u'15977-94-3',
 u'68179-49-7',
 u'15977-94-3',
 u'106747-79-9',
 u'11085-32-8',
 u'20224-80-0',
 u'122991-69-9',
 u'132958-37-3',
 u'139280-48-1',
 u'139280-50-5',
 u'97198-18-0',
 u'74574-38-2',
 u'105038-79-7',
 u'115288-32-9',
 u'58220-71-6',
 u'132901-05-4',
 u'77839-67-9',
 u'107373-32-0',
 u'20224-80-0',
 u'39043-00-0',
 u'14220-64-5',
 u'51546-95-3',
 u'72859-86-0',
 u'14099-33-3',
 u'71852-92-1',
 u'

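A single combined request can raise a TimeoutError when the substructure search matches too many compounds. In that case, it might be better to perform the substructure search on its own to get the matching CIDs, and then get the synonyms for those CIDs in a separate request: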

In [8]:
# Step 1: run the substructure search on its own and keep only the matching CIDs.
cids = pcp.get_cids('[Pd]', 'smiles', searchtype='substructure')
# Step 2: look up the synonyms for those CIDs in a separate request.
synonyms = pcp.get_synonyms(cids)

DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON
DEBUG:pubchempy:Request data: smiles=%5BPd%5D
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/4527290379155673059/cids/JSON
DEBUG:pubchempy:Request data: None
DEBUG:pubchempy:Request URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/synonyms/JSON
DEBUG:pubchempy:Request data: cid=23938%2C24290%2C24932%2C167845%2C2724231%2C4144353%2C5083724%2C9811564%2C61438%2C61852%2C73974%2C131564%2C132670%2C137370%2C150071%2C152124%2C158423%2C165797%2C169479%2C173245%2C194271%2C196610%2C3081450%2C3081705%2C3081802%2C3081846%2C3081848%2C6337493%2C6337785%2C6337987%2C6337993%2C6367224%2C6449897%2C53462193%2C53471865%2C57357318%2C57375560%2C327077%2C426991%2C296461%2C299846%2C354125%2C364986%2C387173%2C421081%2C423386%2C424685%2C424764%2C425002%2C426377%2C426467%2C426670%2C426988%2C426989%2C426990%2C427192%2C427200%2C427201%2C427202%2C427203%2C427605%

In [9]:
# Print every synonym whose text starts with a CAS-number-like pattern. The
# whole synonym is kept (not just the matched number), so qualifiers that
# follow the number, such as "(dihydrate)", remain in the output.
# The pattern is a raw string so its backslash sequences are not treated as
# (invalid) Python string escapes.
print([
    syn
    for result in synonyms
    for syn in result.get('Synonym', [])
    if re.match(r'(\d{2,7}-\d\d-\d)', syn)
])

[u'7440-05-3', u'17637-99-9', u'53092-86-7', u'7647-10-1', u'10038-97-8 (dihydrate)', u'10102-05-3', u'14846-30-1', u'884739-77-9', u'3375-31-3', u'19807-27-3', u'125089-63-6', u'138823-54-8', u'24175-85-7', u'33571-36-7', u'3376-66-7', u'4854-43-7', u'31277-98-2', u'1314-08-5', u'11113-77-2 (cpd with unspecified MF)', u'51364-51-3', u'52409-22-0', u'10025-98-6', u'16919-73-6', u'1307-79-5', u'19662-89-6', u'132901-05-4', u'77839-67-9', u'87936-24-1', u'84180-62-1', u'13601-08-6 (dinitrate)', u'13815-17-3 (dichloride)', u'61495-96-3 (diacetate)', u'68413-68-3 (dihydroxide)', u'84206-78-0', u'7647-10-1 (Parent)', u'10038-97-8', u'74091-55-7', u'15977-94-3', u'68179-49-7 (Parent)', u'15977-94-3 (di-H salt)', u'106747-79-9', u'11085-32-8', u'20224-80-0', u'122991-69-9', u'132958-37-3', u'139280-48-1', u'139280-50-5', u'97198-18-0', u'74574-38-2', u'105038-79-7', u'115288-32-9', u'58220-71-6', u'132901-05-4', u'77839-67-9', u'107373-32-0', u'20224-80-0', u'39043-00-0', u'14220-64-5', u'515