# 1. Libraries

In [1]:
import pubchempy as pcp
import csv
import re
import pandas as pd

In [3]:
molecules_df = pd.read_excel('../data.xlsx', sheet_name = 'in')
molecules_arr = molecules_df['molecule'].tolist()

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    updated_molecule = pattern.sub(r'\1,\2', molecule)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,"1,2 PROPANEDIOL DIACETATE"
1,1 HEXADECANOL
2,1 OCTADECANOL
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL"
4,2 OXOGLUTARIC ACID
...,...
2900,ZIZYPHUS JUJUBA
2901,ZOLEDRONIC ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


# 2. Querying the PubChem search page

In [4]:
def pull_item(query: str, type: str):
    '''given a compound query string, returns CAS numbers for the compound'''
    if type == 'c':
        compound = pcp.get_compounds(query, 'name')
    elif type == 's':
        compound = pcp.get_substances(query, 'name')

    aliases = compound[0].synonyms
    
    cas_pattern = re.compile(r"(\d{2,7})-(\d{2})-(\d)")
    cas_numbers = []

    for alias in aliases:
        matches = cas_pattern.findall(alias)

        for match in matches:
            cas_numbers.append('-'.join(match))

    return cas_numbers

In [5]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def pull_item_with_type(args):
    '''Wrapper for pull_item to accept a single argument'''
    return pull_item(*args)

def generate_dict(compounds):
    '''Takes a list of compounds, retrieves all their CAS numbers using multithreading and outputs them as a dictionary'''
    compound_dict = {}
    # Tuple list for each compound and type 'c'
    tasks = [(compound, 'c') for compound in compounds]
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks to the executor
        future_to_compound = {executor.submit(pull_item_with_type, task): task[0] for task in tasks}
        
        for future in as_completed(future_to_compound):
            compound = future_to_compound[future]
            try:
                compound_dict[compound] = future.result()
            except Exception as exc:
                # print(f"{compound} generated an exception: {exc}")
                compound_dict[compound] = "N/A"
                
    return compound_dict

In [6]:
compound_dict = generate_dict(updated_molecules_arr)

In [7]:
compound_dict

{'1,2 PROPANEDIOL DIACETATE': 'N/A',
 '1 HEXADECANOL': 'N/A',
 '7 KETO DEHYDRANDROSTERONE': 'N/A',
 '1 OCTADECANOL': 'N/A',
 '2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL': 'N/A',
 '7 OXO-DEHYDROEPIANDROSTERONE': 'N/A',
 '2 PROPANOL': 'N/A',
 'ABATACEPT': 'N/A',
 'ABCIXIMAB': 'N/A',
 '2 PHENOXYETHANOL': [],
 '4 AMINOBUTYRIC ACID': ['56-12-2', '20-79-1', '56-12-2', '001-19-2'],
 '2 OXOGLUTARIC ACID': ['328-50-7',
  '378-50-7',
  '34410-46-3',
  '17091-15-5',
  '328-50-7'],
 'ABRUS PRECATORIUS': 'N/A',
 '8 QUINOLINOL': 'N/A',
 'ABSORBABLE GELATIN/COLLAGEN SPONGE': 'N/A',
 'ABALOPARATIDE': 'N/A',
 'ABARELIX': 'N/A',
 'ACAMPROSATE': 'N/A',
 'ACACIA CATECHU': 'N/A',
 'ACACIA SENEGAL': 'N/A',
 'ACALABRUTINIB': 'N/A',
 'ABACAVIR': 'N/A',
 'ACARBOSE': 'N/A',
 'ABEMACICLIB': ['1231929-97-7', '1231929-97-7'],
 'ABIRATERONE ACETATE': ['154229-18-2', '154229-18-2', '001-02-5'],
 'ACETOHYDROXAMIC ACID': 'N/A',
 'ACETONE': 'N/A',
 'ACETONE (KETONE) TESTS': 'N/A',
 'ACESULFAME': 'N/A',
 'ACEPROMAZINE': 'N/A'

In [8]:
compound = pcp.get_compounds("Nickel", 'name')

PubChemHTTPError: 'PUGREST.ServerBusy'