In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
molecules_df = pd.read_excel('data.xlsx', sheet_name = 'in')
molecules_arr = molecules_df['molecule'].tolist()

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []

for molecule in molecules_arr:
    updated_molecule = pattern.sub(r'\1,\2', molecule)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,"1,2 PROPANEDIOL DIACETATE"
1,1 HEXADECANOL
2,1 OCTADECANOL
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL"
4,2 OXOGLUTARIC ACID
...,...
2900,ZIZYPHUS JUJUBA
2901,ZOLEDRONIC ACID
2902,ZOLMITRIPTAN
2903,ZOLPIDEM


In [3]:
def get_pubchem_url(chemical):
    url_start = f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'
    return url_start

from concurrent.futures import ThreadPoolExecutor, as_completed

def get_best_match(molecule):
    # This function is adjusted to work on a single molecule.
    # It's similar to your existing function but designed to be called in parallel.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    url = "N/A"  # Default in case of failure

    try:
        service = Service(ChromeDriverManager().install())
        with webdriver.Chrome(service=service, options=chrome_options) as driver:
            driver.get(get_pubchem_url(molecule))
            WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="result-link"]')))
            element = driver.find_element(By.CSS_SELECTOR, 'a[data-action="result-link"]')
            url = element.get_attribute('href')
    except (NoSuchElementException, TimeoutException):
        pass

    return molecule, url

def get_best_matches_concurrently(molecules):
    best_matches = []
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_to_molecule = {executor.submit(get_best_match, molecule): molecule for molecule in molecules}
        for future in as_completed(future_to_molecule):
            molecule, url = future.result()
            best_matches.append((molecule, url))
    return best_matches

In [4]:
best_matches = get_best_matches_concurrently(updated_molecules_arr[0:20])

In [5]:
best_matches

[('1 HEXADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/2682'),
 ('1,2 PROPANEDIOL DIACETATE',
  'https://pubchem.ncbi.nlm.nih.gov/compound/12198'),
 ('1 OCTADECANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/8221'),
 ('2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL', 'N/A'),
 ('2 OXOGLUTARIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/51'),
 ('2 PHENOXYETHANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/31236'),
 ('2 PROPANOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/3776'),
 ('4 AMINOBUTYRIC ACID', 'https://pubchem.ncbi.nlm.nih.gov/compound/6137'),
 ('7 KETO DEHYDRANDROSTERONE', 'N/A'),
 ('7 OXO-DEHYDROEPIANDROSTERONE',
  'https://pubchem.ncbi.nlm.nih.gov/compound/193313'),
 ('8 QUINOLINOL', 'https://pubchem.ncbi.nlm.nih.gov/compound/1923'),
 ('ABACAVIR', 'https://pubchem.ncbi.nlm.nih.gov/compound/743'),
 ('ABALOPARATIDE', 'https://pubchem.ncbi.nlm.nih.gov/compound/145705876'),
 ('ABARELIX', 'https://pubchem.ncbi.nlm.nih.gov/compound/16131215'),
 ('ABATACEPT', 'https://pubche

In [6]:
df = pd.DataFrame(best_matches, columns=['Molecule', 'Link'])

In [7]:
df

Unnamed: 0,0,1
0,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682
1,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198
2,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",
4,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51
5,2 PHENOXYETHANOL,https://pubchem.ncbi.nlm.nih.gov/compound/31236
6,2 PROPANOL,https://pubchem.ncbi.nlm.nih.gov/compound/3776
7,4 AMINOBUTYRIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/6137
8,7 KETO DEHYDRANDROSTERONE,
9,7 OXO-DEHYDROEPIANDROSTERONE,https://pubchem.ncbi.nlm.nih.gov/compound/193313


In [8]:
df_merged = pd.concat([molecules_df, df], ignore_index=True, sort=False, axis=1)

In [9]:
df_merged_csv = df.to_csv('molecule_links.csv', index = False) 

In [10]:
links_arr = df_merged[2].tolist()

In [11]:
def get_cas_numbers(link):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        service = Service(ChromeDriverManager().install())
        with webdriver.Chrome(service=service, options=chrome_options) as driver:
            driver.get(link)
            # Wait and locate the CAS number section
            try:
                cas_element = WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'section#CAS div.break-words a')))
                cas_numbers["CAS"] = cas_element.text.strip()
            except (NoSuchElementException, TimeoutException):
                cas_numbers["CAS"] = "N/A"  # CAS number not found

            # Wait and locate the Deprecated CAS number section, if present
            try:
                deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
                deprecated_cas_numbers = ', '.join([el.text.strip() for el in deprecated_cas_elements])
                cas_numbers["Deprecated CAS"] = deprecated_cas_numbers if deprecated_cas_numbers else "N/A"
            except (NoSuchElementException, TimeoutException):
                cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    with ThreadPoolExecutor(max_workers=5) as executor:  
        future_to_link = {executor.submit(get_cas_numbers, link): link for link in links}
        for future in as_completed(future_to_link):
            link = future_to_link[future]
            try:
                cas_info = future.result()
                cas_matches.append(cas_info)
            except Exception as e:
                print(f"Error processing link {link}: {e}")
    return cas_matches

In [13]:
cas_numbers = get_cas_numbers_concurrently(links_arr[0:20])

Error retrieving CAS numbers for N/A: Message: invalid argument
  (Session info: chrome-headless-shell=123.0.6312.59)
Stacktrace:
0   chromedriver                        0x00000001091290f8 chromedriver + 4595960
1   chromedriver                        0x0000000109120e63 chromedriver + 4562531
2   chromedriver                        0x0000000108d24225 chromedriver + 381477
3   chromedriver                        0x0000000108d0d28a chromedriver + 287370
4   chromedriver                        0x0000000108d0c14a chromedriver + 282954
5   chromedriver                        0x0000000108d0c46a chromedriver + 283754
6   chromedriver                        0x0000000108d26dde chromedriver + 392670
7   chromedriver                        0x0000000108db0ac5 chromedriver + 957125
8   chromedriver                        0x0000000108d90142 chromedriver + 823618
9   chromedriver                        0x0000000108db014d chromedriver + 954701
10  chromedriver                        0x0000000108d8fee3

In [14]:
cas_numbers

[('N/A', {'CAS': 'N/A', 'Deprecated CAS': 'N/A'}),
 ('https://pubchem.ncbi.nlm.nih.gov/compound/51',
  {'CAS': '328-50-7', 'Deprecated CAS': '27175-99-1'}),
 ('https://pubchem.ncbi.nlm.nih.gov/compound/8221',
  {'CAS': '112-92-5',
   'Deprecated CAS': '193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, 8034-90-0, 8034-90-0, 8032-21-1, 8014-37-7, 193766-48-2'}),
 ('https://pubchem.ncbi.nlm.nih.gov/compound/2682',
  {'CAS': '36653-82-4',
   'Deprecated CAS': '168679-13-8, 124-29-8, 55069-45-9, 8014-51-5, 8023-37-8, 8032-16-4, 8032-17-5, 8032-89-1, 1173838-87-3, 1173838-87-3, 124-29-8, 55069-45-9, 8014-51-5, 8023-37-8, 8032-16-4, 8032-17-5, 8032-89-1, 8014-51-5, 1173838-87-3, 8032-16-4, 8032-89-1, 8023-37-8, 55069-45-9, 8032-17-5, 124-29-8'}),
 ('https://pubchem.ncbi.nlm.nih.gov/compound/12198',
  {'CAS': '623-84-7',
   'Deprecated CAS': '134236-23-0, 1432741-27-9, 1432741-27-9, 134236-23-0'}),
 ('N/A', {'CAS': 'N/A', 'Deprecated CAS': 'N/A'}),
 ('https://pubchem.ncbi.nlm.nih.gov/compound/3123

In [17]:
df_cas = pd.DataFrame(cas_numbers, columns=['Link', 'CAS', 'Deprecated CAS'])
df_cas

ValueError: 3 columns passed, passed data had 2 columns