# Libraries

In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

# Parser
- Convert original non-queryable names to queryable.

In [2]:
# molecules_df = pd.read_excel('data.xlsx', sheet_name = 'in')
# molecules_arr = molecules_df['molecule'].tolist()

# pattern = re.compile(r'(\d)-(\d)')
# pattern2 = re.compile(r'_')
# pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# # List to store updated molecule names
# updated_molecules_arr = []

# for molecule in molecules_arr:
#     updated_molecule = pattern.sub(r'\1,\2', molecule)
#     updated_molecule = pattern2.sub(r' ', updated_molecule)
#     updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
#     updated_molecules_arr.append(updated_molecule)

# updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
# updated_molecules_df

updated_molecules_arr = ['SPARFLOXACIN', 'ACETYLCYSTEINE', 'CYSTEINE', 'CYSTINE', 'DASATINIB', 'DEFIBROTIDE', 'TYLOSIN', 'AMFETAMINE', 'AMPHETAMINE', 'CHROMIUM', 'COUGH AND COLD PREPARATIONS', 'FIXATIVES', 'GLUCOSE', 'UREA', 'BENZALKONIUM CHLORIDE', 'BENZYL ALCOHOL', '2 PROPANOL', 'ACETIC ACID', 'ANTACIDS', 'ASPARTIC ACID', 'AXITINIB', 'AZTREONAM', 'BABY LOTION', 'DEFEROXAMINE']

# PubChem Query Result Scraper

In [3]:
def get_pubchem_url(chemical):
    return f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'

def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_best_match(molecule, driver):
    url = "N/A"  # Default in case of failure
    result_type = "Not Found"  # Default result type

    driver.get(get_pubchem_url(molecule))
    try:
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')
            action = element.get_attribute('data-action')
            if action == 'featured-result-link':
                result_type = 'FEATURED'
            else:
                result_type = 'RELEVANT'
    except (NoSuchElementException, TimeoutException):
        pass  # URL remains "N/A" and result_type remains "Not Found" if no match is found or if there's a timeout

    return molecule, url, result_type

def get_best_matches(parsed_molecules):
    best_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once
    try:
        for parsed_molecule in parsed_molecules:
            molecule, url, result_type = get_best_match(parsed_molecule, driver)
            best_matches.append((molecule, url, result_type))
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return best_matches

In [4]:
best_matches = get_best_matches(updated_molecules_arr)

In [5]:
df = pd.DataFrame(best_matches, columns=['Molecule', 'Link', 'Result Type'])

In [6]:
df

Unnamed: 0,Molecule,Link,Result Type
0,SPARFLOXACIN,https://pubchem.ncbi.nlm.nih.gov/compound/60464,FEATURED
1,ACETYLCYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/12035,FEATURED
2,CYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/5862,FEATURED
3,CYSTINE,https://pubchem.ncbi.nlm.nih.gov/compound/67678,FEATURED
4,DASATINIB,https://pubchem.ncbi.nlm.nih.gov/compound/3062316,FEATURED
5,DEFIBROTIDE,https://pubchem.ncbi.nlm.nih.gov/compound/1355...,FEATURED
6,TYLOSIN,https://pubchem.ncbi.nlm.nih.gov/compound/5280440,FEATURED
7,AMFETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED
8,AMPHETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED
9,CHROMIUM,https://pubchem.ncbi.nlm.nih.gov/compound/23976,FEATURED


In [7]:
# df_merged = pd.concat([molecules_df, df], ignore_index=True, sort=False, axis=1)
df_merged = df
df_merged.columns = ['Parsed', 'Link', 'Result Type']
df_merged

Unnamed: 0,Parsed,Link,Result Type
0,SPARFLOXACIN,https://pubchem.ncbi.nlm.nih.gov/compound/60464,FEATURED
1,ACETYLCYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/12035,FEATURED
2,CYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/5862,FEATURED
3,CYSTINE,https://pubchem.ncbi.nlm.nih.gov/compound/67678,FEATURED
4,DASATINIB,https://pubchem.ncbi.nlm.nih.gov/compound/3062316,FEATURED
5,DEFIBROTIDE,https://pubchem.ncbi.nlm.nih.gov/compound/1355...,FEATURED
6,TYLOSIN,https://pubchem.ncbi.nlm.nih.gov/compound/5280440,FEATURED
7,AMFETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED
8,AMPHETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED
9,CHROMIUM,https://pubchem.ncbi.nlm.nih.gov/compound/23976,FEATURED


In [8]:
df_merged_csv = df_merged.to_csv('FINAL_MOLECULE_LINKS.csv', index = False) 

In [9]:
links_arr = df_merged["Link"].tolist()
links_arr

['https://pubchem.ncbi.nlm.nih.gov/compound/60464',
 'https://pubchem.ncbi.nlm.nih.gov/compound/12035',
 'https://pubchem.ncbi.nlm.nih.gov/compound/5862',
 'https://pubchem.ncbi.nlm.nih.gov/compound/67678',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3062316',
 'https://pubchem.ncbi.nlm.nih.gov/compound/135565962',
 'https://pubchem.ncbi.nlm.nih.gov/compound/5280440',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3007',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3007',
 'https://pubchem.ncbi.nlm.nih.gov/compound/23976',
 'https://pubchem.ncbi.nlm.nih.gov/compound/12035',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3485',
 'https://pubchem.ncbi.nlm.nih.gov/compound/5793',
 'https://pubchem.ncbi.nlm.nih.gov/compound/1176',
 'https://pubchem.ncbi.nlm.nih.gov/compound/244',
 'https://pubchem.ncbi.nlm.nih.gov/compound/244',
 'https://pubchem.ncbi.nlm.nih.gov/compound/3776',
 'https://pubchem.ncbi.nlm.nih.gov/compound/176',
 'https://pubchem.ncbi.nlm.nih.gov/compound/10112',
 'https://pubchem

# PubChem CAS Scraper

In [10]:
def get_cas_numbers(link, driver):
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        driver.get(link)
        # Wait and locate the CAS number section
        try:
            cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#CAS div.break-words')))
            cas_numbers["CAS"] = ', '.join([el.text.strip() for el in cas_elements])
        except (NoSuchElementException, TimeoutException):
            cas_numbers["CAS"] = "N/A"  # CAS number not found

        # Wait and locate the Deprecated CAS number section, if present
        try:
            deprecated_cas_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')
            # deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
            cas_numbers["Deprecated CAS"] = ', '.join([el.text.strip() for el in deprecated_cas_elements])
            
        except (NoSuchElementException, TimeoutException):
            cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
            
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for link in links:
            cas_matches.append(get_cas_numbers(link, driver))

    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return cas_matches      

In [11]:
cas_numbers = get_cas_numbers_concurrently(links_arr)

In [12]:
df_cas = pd.DataFrame(cas_numbers, columns=['CAS', 'Deprecated CAS'])
df_cas

Unnamed: 0,CAS,Deprecated CAS
0,"110871-86-8, 111542-93-9",
1,"616-91-1, 7218-04-4","1261105-20-7, 7696-05-1"
2,"52-90-4, 3374-22-9","1404190-35-7, 154605-72-8, 4371-52-2, 154605-7..."
3,"56-89-3, 923-32-0","154605-69-3, 24645-67-8, 24645-67-8"
4,"302962-49-8, 863127-77-9",
5,"83712-60-1, 1118915-78-8",
6,"1401-69-0, 1405-54-5","11112-11-1, 39282-33-2, 8026-48-0, 11112-11-1,..."
7,"300-62-9, 2706-50-5, 60-15-1, 103-97-9","17108-96-2, 60-15-1, 96332-84-2, 28841-71-6, 4..."
8,"300-62-9, 2706-50-5, 60-15-1, 103-97-9","17108-96-2, 60-15-1, 96332-84-2, 28841-71-6, 4..."
9,"7440-47-3, 257-07-8, 19498-56-7","188785-87-7, 195161-82-1, 195161-82-1"


# Combining Molecule Links Dataframe with CAS Dataframe

In [13]:
final_df = pd.concat([df_merged, df_cas], ignore_index=True, sort=False, axis=1)

In [14]:
final_df

Unnamed: 0,0,1,2,3,4
0,SPARFLOXACIN,https://pubchem.ncbi.nlm.nih.gov/compound/60464,FEATURED,"110871-86-8, 111542-93-9",
1,ACETYLCYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/12035,FEATURED,"616-91-1, 7218-04-4","1261105-20-7, 7696-05-1"
2,CYSTEINE,https://pubchem.ncbi.nlm.nih.gov/compound/5862,FEATURED,"52-90-4, 3374-22-9","1404190-35-7, 154605-72-8, 4371-52-2, 154605-7..."
3,CYSTINE,https://pubchem.ncbi.nlm.nih.gov/compound/67678,FEATURED,"56-89-3, 923-32-0","154605-69-3, 24645-67-8, 24645-67-8"
4,DASATINIB,https://pubchem.ncbi.nlm.nih.gov/compound/3062316,FEATURED,"302962-49-8, 863127-77-9",
5,DEFIBROTIDE,https://pubchem.ncbi.nlm.nih.gov/compound/1355...,FEATURED,"83712-60-1, 1118915-78-8",
6,TYLOSIN,https://pubchem.ncbi.nlm.nih.gov/compound/5280440,FEATURED,"1401-69-0, 1405-54-5","11112-11-1, 39282-33-2, 8026-48-0, 11112-11-1,..."
7,AMFETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED,"300-62-9, 2706-50-5, 60-15-1, 103-97-9","17108-96-2, 60-15-1, 96332-84-2, 28841-71-6, 4..."
8,AMPHETAMINE,https://pubchem.ncbi.nlm.nih.gov/compound/3007,FEATURED,"300-62-9, 2706-50-5, 60-15-1, 103-97-9","17108-96-2, 60-15-1, 96332-84-2, 28841-71-6, 4..."
9,CHROMIUM,https://pubchem.ncbi.nlm.nih.gov/compound/23976,FEATURED,"7440-47-3, 257-07-8, 19498-56-7","188785-87-7, 195161-82-1, 195161-82-1"


In [15]:
final_df.to_csv("test.csv", index=False)

In [16]:
final_df.columns=['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS']

ValueError: Length mismatch: Expected axis has 5 elements, new values have 6 elements

# Flagging Compound Types

In [None]:
final_df['Compound/Substance'] = final_df['Link'].apply(lambda x: 'COMPOUND' if 'compound' in str(x) else ('SUBSTANCE' if 'substance' in str(x) else 'N/A'))

In [None]:
final_df

In [None]:
final_df['Original Molecule'] = final_df['Parsed Molecule']
final_df = final_df[['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS', 'Compound/Substance']]
final_df

# Export Dataframe as CSV

In [None]:
final_df_csv = final_df.to_csv('pubchem_new_wait.csv', index = False) 