# 1. Libraries

In [1]:
import pubchempy as pcp
import csv
import re
import pandas as pd

In [2]:
# Load the raw molecule names from the 'in' sheet of the input workbook.
molecules_df = pd.read_excel('data.xlsx', sheet_name='in')
molecules_arr = molecules_df['molecule'].tolist()

# Hyphen sitting between two digits, rewritten as a comma ("1-2" -> "1,2").
# Lookarounds leave the digits unconsumed, so chained runs such as "1-2-3"
# become "1,2,3"; a capturing (\d)-(\d) form skips every other hyphen because
# re.sub never matches overlapping spans.
pattern = re.compile(r'(?<=\d)-(?=\d)')
# Underscores are replaced with spaces.
pattern2 = re.compile(r'_')
# Hyphen between a digit and a letter becomes a space ("1-HEXADECANOL" -> "1 HEXADECANOL").
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    # Order matters: digit-digit hyphens first, then underscores, then digit-letter hyphens.
    updated_molecule = pattern.sub(',', molecule)
    updated_molecule = pattern2.sub(' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,"1,2 PROPANEDIOL DIACETATE"
1,1 HEXADECANOL
2,1 OCTADECANOL
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL"
4,2 OXOGLUTARIC ACID
...,...
2900,ZIZYPHUS JUJUBA
2901,ZOLEDRONIC ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


# 2. Querying the PubChem search page

In [3]:
def pull_item(query: str, type: str):
    '''Look up a name on PubChem and return the CAS numbers found in its synonyms.

    Only the first record returned by PubChem is inspected.

    Parameters
    ----------
    query : str
        Name to search for.
    type : str
        'c' to search compounds, 's' to search substances. (The name shadows
        the builtin ``type``; it is kept so existing callers keep working.)

    Returns
    -------
    list of str
        Unique CAS-formatted numbers (e.g. '50-78-2') in order of first
        appearance; empty when no synonym contains one.

    Raises
    ------
    ValueError
        If ``type`` is neither 'c' nor 's'.
    IndexError
        If PubChem returns no record for ``query``.
    '''
    if type == 'c':
        records = pcp.get_compounds(query, 'name')
    elif type == 's':
        records = pcp.get_substances(query, 'name')
    else:
        # Without this branch the lookup result would be unbound below.
        raise ValueError(f"type must be 'c' or 's', got {type!r}")

    if not records:
        # Same exception type as indexing an empty list, but with a usable message.
        raise IndexError(f"PubChem returned no results for {query!r}")

    aliases = records[0].synonyms

    # CAS format: 2-7 digits, 2 digits, 1 check digit. The digit lookarounds stop
    # the pattern from matching a slice of a longer digit run.
    cas_pattern = re.compile(r"(?<!\d)\d{2,7}-\d{2}-\d(?!\d)")

    # dict.fromkeys drops repeats (the same number can occur in several
    # synonyms) while keeping first-seen order.
    found = (cas for alias in aliases for cas in cas_pattern.findall(alias))
    return list(dict.fromkeys(found))

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def pull_item_with_type(args):
    '''Single-argument adapter: unpack ``args`` positionally into a ``pull_item`` call and return its result.'''
    result = pull_item(*args)
    return result

def generate_dict(compounds, max_workers=10):
    '''Look up CAS numbers for many compound names concurrently.

    Each name is passed to ``pull_item_with_type`` as ``(name, 'c')`` on a
    thread pool.

    Parameters
    ----------
    compounds : iterable of str
        Compound names to look up.
    max_workers : int, optional
        Size of the thread pool (default 10, the previously hard-coded value).
        Lowering it reduces the number of simultaneous requests, which may help
        when PubChem answers with 'PUGREST.ServerBusy'.

    Returns
    -------
    dict
        Maps each name to the list returned by the lookup, or to the string
        "N/A" when the lookup raised. Keys follow the order of ``compounds``.
    '''
    compounds = list(compounds)
    if not compounds:
        return {}

    # Leaving the `with` block waits for every submitted lookup to finish.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(pull_item_with_type, (compound, 'c'))
            for compound in compounds
        ]

    compound_dict = {}
    # Collect in submission order so the resulting dict is reproducible
    # instead of depending on which thread finished first.
    for compound, future in zip(compounds, futures):
        try:
            compound_dict[compound] = future.result()
        except Exception as exc:
            print(f"{compound} generated an exception: {exc}")
            compound_dict[compound] = "N/A"

    return compound_dict

In [6]:
# Try the threaded lookup on a single compound name.
generate_dict(["1,2 PROPANEDIOL DIACETATE"])

1,2 PROPANEDIOL DIACETATE generated an exception: 'PUGREST.ServerBusy'


{'1,2 PROPANEDIOL DIACETATE': 'N/A'}

In [11]:
def generate_dict(compounds):
    '''Look up CAS numbers for each compound name, one request at a time.

    Note: this redefinition replaces the threaded ``generate_dict`` defined
    earlier in the notebook.

    Parameters
    ----------
    compounds : iterable of str
        Compound names to look up via ``pull_item(name, 'c')``.

    Returns
    -------
    dict
        Maps each name to the list returned by ``pull_item``, or to the string
        "N/A" when the lookup raised an exception.
    '''
    compound_dict = {}

    for compound in compounds:
        try:
            compound_dict[compound] = pull_item(compound, 'c')
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C / kernel
            # interrupt still stops a long run, and report why the lookup
            # failed instead of silently recording "N/A".
            print(f"{compound} generated an exception: {exc}")
            compound_dict[compound] = "N/A"

    return compound_dict

# generate_dict(["aspirin", "EGFR", "C9H8O4"])

generate_dict(["1,2-Propanediol Diacetate"])

{'1,2-Propanediol Diacetate': 'N/A'}