# 1. Libraries

In [1]:
import pubchempy as pcp
import csv
import re
import pandas as pd
import time

In [2]:
molecules_df = pd.read_excel('data.xlsx', sheet_name = 'in')
molecules_arr = molecules_df['molecule'].tolist()

# The pattern to match: digit, dash, digit
pattern = re.compile(r'(\d)-(\d)')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    # Replace '-' with ',' when between digits
    updated_molecule = pattern.sub(r'\1,\2', molecule)
    updated_molecules_arr.append(updated_molecule)

# Optionally, update your DataFrame or other storage
# Here's an example of updating a DataFrame
updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,"1,2-PROPANEDIOL_DIACETATE"
1,1-HEXADECANOL
2,1-OCTADECANOL
3,"2,3-(2-IODOPROPYLIDENEDIOXY)PROPANOL"
4,2-OXOGLUTARIC_ACID
...,...
2900,ZIZYPHUS_JUJUBA
2901,ZOLEDRONIC_ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


# 2. Querying the PubChem search page

In [3]:
def pull_item(query: str, type: str):
    '''given a compound query string, returns CAS numbers for the compound'''
    if type == 'c':
        compound = pcp.get_compounds(query, 'name')
    elif type == 's':
        compound = pcp.get_substances(query, 'name')

    aliases = compound[0].synonyms
    
    cas_pattern = re.compile(r"(\d{2,7})-(\d{2})-(\d)")
    cas_numbers = []

    for alias in aliases:
        matches = cas_pattern.findall(alias)

        for match in matches:
            cas_numbers.append('-'.join(match))

    return cas_numbers

In [4]:
def generate_dict(compounds):
    '''Takes a list of compounds, retrieves all their CAS numbers and outputs them as a dictionary'''

    compound_dict = {}
    
    for compound in compounds:
        try:
            compound_dict[compound] = pull_item(compound, 'c')
        except:
            compound_dict[compound] = "N/A"
        
    return compound_dict

# generate_dict(["aspirin", "EGFR", "C9H8O4"])

In [5]:
compounds = pcp.get_compounds('Aspirin', 'name')

In [6]:
def pull_cid(molecules):
    cid_arr = []
    for molecule in molecules:
        compounds = pcp.get_compounds({molecule}, 'name')
        for compound in compounds:
            cid_arr.append(compound.cid)
    return cid_arr

In [7]:
start = time.time()
cid_arr = pull_cid(updated_molecules_arr[0:1000])
end = time.time()
print("The time of execution of above program is :",
      (end-start) * 10*3, "ms")

The time of execution of above program is : 10815.008878707886 ms


In [8]:
cid_arr

[2682, 8221, 31236, 3776, 193313, 1923, 441300, 145705876, 16131215, 46220502]