In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
df = pd.read_csv("pubchem_longer_wait.csv")
df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,"1,2 PROPANEDIOL DIACETATE","1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9",COMPOUND
1,1 HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 1173838-87-3, 124-29-8, 55069-45-...",COMPOUND
2,1 OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL","2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,,
4,2 OXOGLUTARIC ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND
...,...,...,...,...,...,...,...
2900,ZIZYPHUS JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE
2901,ZOLEDRONIC ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,,COMPOUND
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,,COMPOUND
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,,COMPOUND


In [3]:
def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def custom_title_case(s):
    # Find the first occurrence of an alphabetical character
    match = re.search(r'[a-zA-Z]', s)
    if match:
        # Capitalize the found character and lowercase the rest of the string
        first_letter_index = match.start()
        return s[:first_letter_index].lower() + s[first_letter_index].upper() + s[first_letter_index+1:].lower()
    else:
        # Return the lowercase string if no alphabetical character is found
        return s.lower()

In [4]:
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def search_for_synonym(synonym_elements, comparisons):
    # Iterate through elements to find a match
    for element in synonym_elements:
        element_text = element.text.strip().lower()
        if element_text in comparisons:
            return True  # Found a match
    return False  # No match found

def synonym_checker(name, parsed, link, driver):
    synonym = 2  # Default to an error state

    if not pd.isna(link) and 'pubchem' in str(link):
        try:
            driver.get(link)
            WebDriverWait(driver, 6).until(
                EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Depositor-Supplied-Synonyms a[data-action="content-link"]'))
            )
            synonym_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Depositor-Supplied-Synonyms a[data-action="content-link"]')
    
            parsed_hyphenated = re.sub(r'(\d) (\w)', r'\1-\2', parsed)
            first_capitalised_h = custom_title_case(parsed_hyphenated)
            first_capitalised_n = custom_title_case(name)
            first_capitalised_p = custom_title_case(parsed)
            
            comparisons = {name.lower(), parsed.lower(), name.upper(), parsed.upper(), name.title(), parsed.title(),
                           parsed_hyphenated.lower(), parsed_hyphenated.upper(), parsed_hyphenated.title(),
                           first_capitalised_h.lower(), first_capitalised_n.lower(), first_capitalised_p.lower()}
            
            with ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(search_for_synonym, synonym_elements, comparisons)
                try:
                    synonym_found = future.result(timeout=30)  # Set timeout to 10 seconds
                    synonym = 0 if synonym_found else 1
                except TimeoutError:
                    print("Timed out waiting for synonym match.")
                    synonym = 2  # Or another value to indicate a timeout occurred
    
        except (NoSuchElementException, TimeoutException):
            synonym = 2
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            synonym = 2
    else:
        print(synonym)
        return synonym

    print(synonym)
    return synonym


def scan(df):
    synonym_flags = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for row in df.index:
            check = synonym_checker(df['Original Molecule'][row], df['Parsed Molecule'][row], df['Link'][row], driver)
            if check == 1:
                synonym_flags.append("NO")
            elif check == 0:
                synonym_flags.append("YES")
            else:
                synonym_flags.append("MISSING")

            print(f"remaining: {2904 - len(synonym_flags)}")
                
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return synonym_flags   

In [5]:
list = scan(df)

2
remaining: 2903
2
remaining: 2902
2
remaining: 2901
2
remaining: 2900
2
remaining: 2899
2
remaining: 2898
2
remaining: 2897
2
remaining: 2896
2
remaining: 2895
2
remaining: 2894
2
remaining: 2893
2
remaining: 2892
2
remaining: 2891
2
remaining: 2890
2
remaining: 2889
2
remaining: 2888
2
remaining: 2887
2
remaining: 2886
2
remaining: 2885
2
remaining: 2884
2
remaining: 2883
2
remaining: 2882
2
remaining: 2881
2
remaining: 2880
2
remaining: 2879
2
remaining: 2878
2
remaining: 2877
2
remaining: 2876
2
remaining: 2875
2
remaining: 2874
2
remaining: 2873
2
remaining: 2872
2
remaining: 2871
2
remaining: 2870
2
remaining: 2869
2
remaining: 2868
2
remaining: 2867
2
remaining: 2866


KeyboardInterrupt: 

In [None]:
df['Synonym?'] = list

In [None]:
df.tail(10)

In [None]:
df

In [None]:
final_df_csv = df.to_csv('pubchem_longer_wait_flagged.csv', index = False)
# final_df_csv = df.to_csv('scraped_pubchem_data_with_synonyms_corrected.csv', index = False)