In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
substance_df = pd.read_csv("scraped_pubchem_data_final.csv")
substance_df = substance_df[substance_df["Compound/Substance"] == "SUBSTANCE"]
substance_df.head()

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
18,ABRUS_PRECATORIUS,ABRUS PRECATORIUS,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
20,ACACIA_CATECHU,ACACIA CATECHU,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
21,ACACIA_SENEGAL,ACACIA SENEGAL,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
53,ADRENAL_CORTEX,ADRENAL CORTEX,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
54,AESCULUS_HIPPOCASTANUM,AESCULUS HIPPOCASTANUM,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,,SUBSTANCE


In [6]:
def get_cc_url(chemical):
    return f'https://commonchemistry.cas.org/results?q={chemical}'

def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_best_match(molecule, driver):
    url = "N/A"  # Default in case of failure
    result_type = "Not Found"  # Default result type

    driver.get(get_cc_url(molecule))
    try:
        WebDriverWait(driver, 2.5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="result-rn"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-action="result-rn"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')
                
    except (NoSuchElementException, TimeoutException):
        pass  # URL remains "N/A" and result_type remains "Not Found" if no match is found or if there's a timeout

    return molecule, url, result_type

def get_best_matches(parsed_molecules):
    best_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once
    i = 0
    try:
        for parsed_molecule in parsed_molecules:
            molecule, url, result_type = get_best_match(parsed_molecule, driver)
            best_matches.append((molecule, url, result_type))
            print(best_matches[i])
            i+=1
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return best_matches

In [7]:
substances = substance_df['Parsed Molecule'].tolist()

In [8]:
best_matches = get_best_matches(substances)

('ABRUS PRECATORIUS', 'N/A', 'Not Found')
('ACACIA CATECHU', 'N/A', 'Not Found')
('ACACIA SENEGAL', 'N/A', 'Not Found')
('ADRENAL CORTEX', 'N/A', 'Not Found')
('AESCULUS HIPPOCASTANUM', 'N/A', 'Not Found')
('AGARICUS CAMPESTRIS', 'N/A', 'Not Found')
('ALGLUCERASE', 'N/A', 'Not Found')
('ALLIUM CEPA', 'N/A', 'Not Found')
('ALLIUM SATIVUM', 'N/A', 'Not Found')
('ALPINIA OFFICINARUM', 'N/A', 'Not Found')
('ALTHAEA OFFICINALIS', 'N/A', 'Not Found')
('AMBROSIA ARTEMISIFOLIA', 'N/A', 'Not Found')
('AMINOACIDS', 'N/A', 'Not Found')
('ANANAS COMOSUS', 'N/A', 'Not Found')
('ANAS BARBARIAE', 'N/A', 'Not Found')
('ANETHUM GRAVEOLENS', 'N/A', 'Not Found')
('ANISTREPLASE', 'N/A', 'Not Found')
('ANTITHROMBIN ALFA', 'N/A', 'Not Found')
('ANTITHROMBIN III', 'N/A', 'Not Found')
('APIS MELLIFICA', 'N/A', 'Not Found')
('APIUM GRAVEOLENS', 'N/A', 'Not Found')
('APOAEQUORIN', 'N/A', 'Not Found')
('ARCTIUM LAPPA', 'N/A', 'Not Found')
('ARCTOSTAPHYLOS UVA-URSI', 'N/A', 'Not Found')
('ARGANIA SPINOSA', 'N/A',

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome-headless-shell=123.0.6312.59)
Stacktrace:
0   chromedriver                        0x000000010803d0f8 chromedriver + 4595960
1   chromedriver                        0x0000000108034e63 chromedriver + 4562531
2   chromedriver                        0x0000000107c3839a chromedriver + 381850
3   chromedriver                        0x0000000107c20a8e chromedriver + 285326
4   chromedriver                        0x0000000107c20993 chromedriver + 285075
5   chromedriver                        0x0000000107c3a7d2 chromedriver + 391122
6   chromedriver                        0x0000000107cc387b chromedriver + 952443
7   chromedriver                        0x0000000107ca3ee3 chromedriver + 823011
8   chromedriver                        0x0000000107c74be4 chromedriver + 629732
9   chromedriver                        0x0000000107c7579e chromedriver + 632734
10  chromedriver                        0x0000000108003012 chromedriver + 4358162
11  chromedriver                        0x0000000108007c5d chromedriver + 4377693
12  chromedriver                        0x00000001080075d3 chromedriver + 4376019
13  chromedriver                        0x0000000108007f05 chromedriver + 4378373
14  chromedriver                        0x0000000107feca35 chromedriver + 4266549
15  chromedriver                        0x000000010800828d chromedriver + 4379277
16  chromedriver                        0x0000000107fdf080 chromedriver + 4210816
17  chromedriver                        0x0000000108025ac8 chromedriver + 4500168
18  chromedriver                        0x0000000108025c41 chromedriver + 4500545
19  chromedriver                        0x0000000108034aa3 chromedriver + 4561571
20  libsystem_pthread.dylib             0x00007ff809f384e1 _pthread_start + 125
21  libsystem_pthread.dylib             0x00007ff809f33f6b thread_start + 15


In [None]:
best_matches