In [1]:
import selenium 
import csv
import re
import pandas as pd
import ast
import numpy as np
import math

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
component_df = pd.read_csv('margaret_with_components.csv')

In [3]:
def split_components(component_string):
    component_string = component_string.strip('[]')
    component_links = [link.strip() for link in component_string.split(',')]
    return component_links

component_arr = component_df['components'].to_list()
component_arr = map(split_components, component_arr)
component_arr = list(component_arr)

In [4]:
cleaned_links = []
for sublist in component_arr:
    links = [link.strip("'") for link in sublist if link.strip("'") != '']
    cleaned_links.append(links)

cleaned_links

[[],
 [],
 [],
 [],
 [],
 [],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/311',
  'https://pubchem.ncbi.nlm.nih.gov/compound/305',
  'https://pubchem.ncbi.nlm.nih.gov/compound/23925'],
 [],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/6083',
  'https://pubchem.ncbi.nlm.nih.gov/compound/5462222'],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/67145',
  'https://pubchem.ncbi.nlm.nih.gov/compound/10909430'],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/51049968',
  'https://pubchem.ncbi.nlm.nih.gov/compound/1118'],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/444266',
  'https://pubchem.ncbi.nlm.nih.gov/compound/9822750'],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/6918837',
  'https://pubchem.ncbi.nlm.nih.gov/compound/962'],
 [],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/1118',
  'https://pubchem.ncbi.nlm.nih.gov/compound/124087'],
 [],
 ['https://pubchem.ncbi.nlm.nih.gov/compound/6395',
  'https://pubchem.ncbi.nlm.nih.gov/compound/65016'],
 ['https://pubchem.ncbi.nlm.nih.gov/compou

In [5]:
def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_cas_numbers(link, driver):
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        driver.get(link)
        # Wait and locate the CAS number section
        try:
            cas_elements = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#CAS div.break-words')))
            cas_numbers["CAS"] = ', '.join([el.text.strip() for el in cas_elements])
        except (NoSuchElementException, TimeoutException):
            cas_numbers["CAS"] = "N/A"  # CAS number not found

        # Wait and locate the Deprecated CAS number section, if present
        try:
            deprecated_cas_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')
            # deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
            cas_numbers["Deprecated CAS"] = ', '.join([el.text.strip() for el in deprecated_cas_elements])
            
        except (NoSuchElementException, TimeoutException):
            cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
            
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for link in links:
            cas_matches.append(get_cas_numbers(link, driver))

    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return cas_matches      

In [6]:
all_links = []
for sublist in cleaned_links:
    all_links.extend(sublist)

In [8]:
cas_numbers = get_cas_numbers_concurrently(all_links)

In [9]:
df_cas = pd.DataFrame(cas_numbers, columns=['CAS', 'Deprecated CAS'])
df_cas

Unnamed: 0,CAS,Deprecated CAS
0,77-92-9,"1192555-95-5, 12262-73-6, 136108-93-5, 2023788..."
1,"62-49-7, 5413-08-1","139741-81-4, 2087491-45-8, 139741-81-4, 207280..."
2,"7439-89-6, 33485-98-2, 8048-10-0, 12597-68-1, ...","129048-51-7, 161135-39-3, 1867181-06-3, 190454..."
3,"61-19-8, 136920-07-5, 82530-89-0","162756-82-3, 47286-65-7, 47287-97-8, 53624-78-..."
4,7440-09-7,31079-13-7
...,...,...
112,78266-06-5,
113,7440-23-5,"1061193-24-5, 184637-88-5, 213530-35-9, 351903..."
114,7440-09-7,31079-13-7
115,"143-07-7, 203714-07-2, 7632-48-6, 8000-62-2, 8...","203714-07-2, 7632-48-6, 8000-62-2, 8045-27-0, ..."


In [14]:
df_links = pd.DataFrame(all_links, columns=['links'])
df_links

Unnamed: 0,links
0,https://pubchem.ncbi.nlm.nih.gov/compound/311
1,https://pubchem.ncbi.nlm.nih.gov/compound/305
2,https://pubchem.ncbi.nlm.nih.gov/compound/23925
3,https://pubchem.ncbi.nlm.nih.gov/compound/6083
4,https://pubchem.ncbi.nlm.nih.gov/compound/5462222
...,...
112,https://pubchem.ncbi.nlm.nih.gov/compound/54158
113,https://pubchem.ncbi.nlm.nih.gov/compound/5360545
114,https://pubchem.ncbi.nlm.nih.gov/compound/5462222
115,https://pubchem.ncbi.nlm.nih.gov/compound/3893


In [15]:
final_df = pd.concat([df_links, df_cas], ignore_index=True, sort=False, axis=1)

In [16]:
final_df

Unnamed: 0,0,1,2
0,https://pubchem.ncbi.nlm.nih.gov/compound/311,77-92-9,"1192555-95-5, 12262-73-6, 136108-93-5, 2023788..."
1,https://pubchem.ncbi.nlm.nih.gov/compound/305,"62-49-7, 5413-08-1","139741-81-4, 2087491-45-8, 139741-81-4, 207280..."
2,https://pubchem.ncbi.nlm.nih.gov/compound/23925,"7439-89-6, 33485-98-2, 8048-10-0, 12597-68-1, ...","129048-51-7, 161135-39-3, 1867181-06-3, 190454..."
3,https://pubchem.ncbi.nlm.nih.gov/compound/6083,"61-19-8, 136920-07-5, 82530-89-0","162756-82-3, 47286-65-7, 47287-97-8, 53624-78-..."
4,https://pubchem.ncbi.nlm.nih.gov/compound/5462222,7440-09-7,31079-13-7
...,...,...,...
112,https://pubchem.ncbi.nlm.nih.gov/compound/54158,78266-06-5,
113,https://pubchem.ncbi.nlm.nih.gov/compound/5360545,7440-23-5,"1061193-24-5, 184637-88-5, 213530-35-9, 351903..."
114,https://pubchem.ncbi.nlm.nih.gov/compound/5462222,7440-09-7,31079-13-7
115,https://pubchem.ncbi.nlm.nih.gov/compound/3893,"143-07-7, 203714-07-2, 7632-48-6, 8000-62-2, 8...","203714-07-2, 7632-48-6, 8000-62-2, 8045-27-0, ..."


In [18]:
final_df.to_csv('component_cas_values.csv', index=False)