# Libraries

In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

# Parser
- Convert original non-queryable names to queryable.

In [2]:
molecules_df = pd.read_excel('data.xlsx', sheet_name = 'in')
molecules_arr = molecules_df['molecule'].tolist()

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    updated_molecule = pattern.sub(r'\1,\2', molecule)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

# updated_molecules_arr = ['GEMIFLOXACIN', 'VILAZODONE', 'DIPIVEFRINE', 'TESTOSTERONE', 'EDARAVONE', 'DONEPEZIL', 'ETHANOL', 'BEPOTASTINE', 'DOFETILIDE']

Unnamed: 0,updated_molecule
0,"1,2 PROPANEDIOL DIACETATE"
1,1 HEXADECANOL
2,1 OCTADECANOL
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL"
4,2 OXOGLUTARIC ACID
...,...
2900,ZIZYPHUS JUJUBA
2901,ZOLEDRONIC ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


# PubChem Query Result Scraper

In [3]:
def get_pubchem_url(chemical):
    return f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'

def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_best_match(molecule, driver):
    url = "N/A"  # Default in case of failure
    result_type = "Not Found"  # Default result type

    driver.get(get_pubchem_url(molecule))
    try:
        WebDriverWait(driver, 6).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')
            action = element.get_attribute('data-action')
            if action == 'featured-result-link':
                result_type = 'FEATURED'
            else:
                result_type = 'RELEVANT'
    except (NoSuchElementException, TimeoutException):
        pass  # URL remains "N/A" and result_type remains "Not Found" if no match is found or if there's a timeout

    return molecule, url, result_type

def get_best_matches(parsed_molecules):
    best_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once
    try:
        for parsed_molecule in parsed_molecules:
            molecule, url, result_type = get_best_match(parsed_molecule, driver)
            best_matches.append((molecule, url, result_type))
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return best_matches

In [4]:
best_matches = get_best_matches(updated_molecules_arr)

In [5]:
df = pd.DataFrame(best_matches, columns=['Molecule', 'Link', 'Result Type'])

In [6]:
df

Unnamed: 0,Molecule,Link,Result Type
0,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT
1,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT
2,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found
4,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED
...,...,...,...
2900,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT
2901,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED
2902,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED
2903,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED


In [7]:
# df_merged = pd.concat([molecules_df, df], ignore_index=True, sort=False, axis=1)
df_merged = df
df_merged.columns = ['Parsed', 'Link', 'Result Type']
df_merged

Unnamed: 0,Parsed,Link,Result Type
0,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT
1,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT
2,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found
4,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED
...,...,...,...
2900,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT
2901,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED
2902,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED
2903,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED


In [8]:
df_merged_csv = df_merged.to_csv('FINAL_MOLECULE_LINKS.csv', index = False) 

In [9]:
links_arr = df_merged["Link"].tolist()
links_arr

['https://pubchem.ncbi.nlm.nih.gov/compound/12198',
 'https://pubchem.ncbi.nlm.nih.gov/compound/2682',
 'https://pubchem.ncbi.nlm.nih.gov/compound/8221',
 'N/A',
 'https://pubchem.ncbi.nlm.nih.gov/compound/51',
 'https://pubchem.ncbi.nlm.nih.gov/compound/17972278',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3776',
 'https://pubchem.ncbi.nlm.nih.gov/compound/119',
 'N/A',
 'https://pubchem.ncbi.nlm.nih.gov/compound/193313',
 'https://pubchem.ncbi.nlm.nih.gov/compound/1923',
 'https://pubchem.ncbi.nlm.nih.gov/compound/441300',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Abaloparatide',
 'https://pubchem.ncbi.nlm.nih.gov/compound/16131215',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Abatacept',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Abciximab',
 'https://pubchem.ncbi.nlm.nih.gov/compound/46220502',
 'https://pubchem.ncbi.nlm.nih.gov/compound/9821849',
 'https://pubchem.ncbi.nlm.nih.gov/substance/482044096',
 'https://pubmed.ncbi.nlm.nih.gov/24377182',
 'https://pubchem.ncbi.nlm.nih

# PubChem CAS Scraper

In [10]:
def get_cas_numbers(link, driver):
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        driver.get(link)
        # Wait and locate the CAS number section
        try:
            cas_elements = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#CAS div.break-words')))
            cas_numbers["CAS"] = ', '.join([el.text.strip() for el in cas_elements])
        except (NoSuchElementException, TimeoutException):
            cas_numbers["CAS"] = "N/A"  # CAS number not found

        # Wait and locate the Deprecated CAS number section, if present
        try:
            deprecated_cas_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')
            # deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
            cas_numbers["Deprecated CAS"] = ', '.join([el.text.strip() for el in deprecated_cas_elements])
            
        except (NoSuchElementException, TimeoutException):
            cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
            
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for link in links:
            cas_matches.append(get_cas_numbers(link, driver))

    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return cas_matches      

In [11]:
cas_numbers = get_cas_numbers_concurrently(links_arr)

Error retrieving CAS numbers for N/A: Message: invalid argument
  (Session info: chrome-headless-shell=125.0.6422.76)
Stacktrace:
0   chromedriver                        0x00000001063dd6c8 chromedriver + 6149832
1   chromedriver                        0x00000001063d4cea chromedriver + 6114538
2   chromedriver                        0x0000000105e61b91 chromedriver + 400273
3   chromedriver                        0x0000000105e48098 chromedriver + 295064
4   chromedriver                        0x0000000105e46c4f chromedriver + 289871
5   chromedriver                        0x0000000105e46f3a chromedriver + 290618
6   chromedriver                        0x0000000105e647b7 chromedriver + 411575
7   chromedriver                        0x0000000105ef1da5 chromedriver + 990629
8   chromedriver                        0x0000000105ed1cb2 chromedriver + 859314
9   chromedriver                        0x0000000105ef10db chromedriver + 987355
10  chromedriver                        0x0000000105ed1a53

In [12]:
df_cas = pd.DataFrame(cas_numbers, columns=['CAS', 'Deprecated CAS'])
df_cas

Unnamed: 0,CAS,Deprecated CAS
0,623-84-7,"134236-23-0, 1432741-27-9"
1,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-..."
2,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ..."
3,,
4,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1
...,...,...
2900,,
2901,118072-93-8,
2902,139264-17-8,
2903,82626-48-0,


# Combining Molecule Links Dataframe with CAS Dataframe

In [13]:
final_df = pd.concat([df_merged, df_cas], ignore_index=True, sort=False, axis=1)

In [14]:
final_df

Unnamed: 0,0,1,2,3,4
0,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9"
1,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-..."
2,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ..."
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,
4,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1
...,...,...,...,...,...
2900,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,
2901,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,
2902,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,
2903,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,


In [16]:
final_df = pd.concat([molecules_df, final_df], ignore_index=True, sort=False, axis=1)
final_df

Unnamed: 0,0,1,2,3,4,5
0,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9"
1,1-HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-..."
2,1-OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ..."
3,2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,
4,2-OXOGLUTARIC_ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1
...,...,...,...,...,...,...
2900,ZIZYPHUS_JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,
2901,ZOLEDRONIC_ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,


In [17]:
final_df.columns=['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS']

# Flagging Compound Types

In [18]:
final_df['Compound/Substance'] = final_df['Link'].apply(lambda x: 'COMPOUND' if 'compound' in str(x) else ('SUBSTANCE' if 'substance' in str(x) else 'N/A'))

In [19]:
final_df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9",COMPOUND
1,1-HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-...",COMPOUND
2,1-OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND
3,2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,,
4,2-OXOGLUTARIC_ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND
...,...,...,...,...,...,...,...
2900,ZIZYPHUS_JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
2901,ZOLEDRONIC_ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,,COMPOUND
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,,COMPOUND
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,,COMPOUND


In [20]:
final_df['Original Molecule'] = final_df['Parsed Molecule']
final_df = final_df[['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS', 'Compound/Substance']]
final_df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,"1,2 PROPANEDIOL DIACETATE","1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9",COMPOUND
1,1 HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-...",COMPOUND
2,1 OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL","2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,,
4,2 OXOGLUTARIC ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND
...,...,...,...,...,...,...,...
2900,ZIZYPHUS JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
2901,ZOLEDRONIC ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,,COMPOUND
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,,COMPOUND
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,,COMPOUND


# Export Dataframe as CSV

In [21]:
final_df_csv = final_df.to_csv('pubchem_longer_wait.csv', index = False) 