In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
molecules_df = pd.read_excel('data.xlsx', sheet_name = 'in')
molecules_arr = molecules_df['molecule'].tolist()

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    updated_molecule = pattern.sub(r'\1,\2', molecule)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,"1,2 PROPANEDIOL DIACETATE"
1,1 HEXADECANOL
2,1 OCTADECANOL
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL"
4,2 OXOGLUTARIC ACID
...,...
2900,ZIZYPHUS JUJUBA
2901,ZOLEDRONIC ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


In [3]:
def get_pubchem_url(chemical):
    url_start = f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'
    return url_start

from concurrent.futures import ThreadPoolExecutor, as_completed

def get_best_match(molecule):
    # This function is adjusted to work on a single molecule.
    # It's similar to your existing function but designed to be called in parallel.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    url = "N/A"  # Default in case of failure

    try:
        service = Service(ChromeDriverManager().install())
        with webdriver.Chrome(service=service, options=chrome_options) as driver:
            driver.get(get_pubchem_url(molecule))
            WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="result-link"]')))
            element = driver.find_element(By.CSS_SELECTOR, 'a[data-action="result-link"]')
            url = element.get_attribute('href')
    except (NoSuchElementException, TimeoutException):
        pass

    return molecule, url

def get_best_matches_concurrently(molecules):
    best_matches = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers based on your system
        future_to_molecule = {executor.submit(get_best_match, molecule): molecule for molecule in molecules}
        for future in as_completed(future_to_molecule):
            molecule, url = future.result()
            best_matches.append((molecule, url))
            print(best_matches)
    return best_matches

In [None]:
best_matches = get_best_matches_concurrently(updated_molecules_arr)

[('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51')]
[('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51'), ('1 HEXADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/2682')]
[('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51'), ('1 HEXADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/2682'), ('2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL', 'N/A')]
[('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51'), ('1 HEXADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/2682'), ('2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL', 'N/A'), ('1 OCTADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/8221')]
[('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51'), ('1 HEXADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/2682'), ('2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL', 'N/A'), ('1 OCTADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/8221'), ('1,2 PROPANEDIOL DIACETATE', 'https://pubchem.ncbi.nlm.nih.gov/compound/1