In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
# Load the previously scraped PubChem table from CSV into a DataFrame.
df = pd.read_csv("scraped_pubchem_data_with_synonyms.csv")
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,Synonym?
0,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9, 1432741-27-9, 13423...",COMPOUND,YES
1,1-HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 124-29-8, 55069-45-9, 8014-51-5, ...",COMPOUND,YES
2,1-OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 68911-61-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND,YES
3,2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,,,MISSING
4,2-OXOGLUTARIC_ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND,YES
...,...,...,...,...,...,...,...,...
2900,ZIZYPHUS_JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE,NO
2901,ZOLEDRONIC_ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,,COMPOUND,YES
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,,COMPOUND,YES
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,,COMPOUND,YES


In [3]:
def setup_webdriver():
    """Create and return a headless Chrome WebDriver.

    The chromedriver executable path is obtained from
    ``ChromeDriverManager().install()`` and handed to Chrome through a
    ``Service`` object. The caller owns the returned driver and is
    responsible for calling ``quit()`` on it.
    """
    headless_opts = Options()
    headless_opts.add_argument("--headless")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=headless_opts,
    )

def custom_title_case(s):
    """Return ``s`` lowercased, except its first ASCII letter is uppercased.

    Leading non-letter characters (digits, punctuation, spaces) are kept in
    place, so ``"1,2 PROPANEDIOL"`` becomes ``"1,2 Propanediol"``. A string
    with no ASCII letter at all is simply lowercased.
    """
    first_alpha = re.search(r'[a-zA-Z]', s)
    if first_alpha is None:
        # Nothing to capitalise.
        return s.lower()

    pos = first_alpha.start()
    head, letter, tail = s[:pos], s[pos], s[pos + 1:]
    return head.lower() + letter.upper() + tail.lower()

In [4]:
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def search_for_synonym(synonym_elements, comparisons, limit=30):
    """Report whether any of the first ``limit`` elements matches a candidate name.

    Matching is case-insensitive and ignores surrounding whitespace on both
    sides: each element's ``.text`` and every entry of ``comparisons`` are
    stripped and lowercased before being compared.

    Parameters
    ----------
    synonym_elements : iterable
        Objects exposing a ``.text`` attribute (e.g. the elements returned by
        ``driver.find_elements``).
    comparisons : iterable of str
        Candidate spellings of the name being looked for. Non-string entries
        are ignored.
    limit : int or None, default 30
        Maximum number of elements to inspect. ``0`` or a negative value
        inspects nothing; ``None`` inspects every element.

    Returns
    -------
    bool
        ``True`` if a match is found within the inspected elements,
        otherwise ``False``.
    """
    if limit is not None and limit <= 0:
        # A non-positive cap means "compare nothing".
        return False

    # Normalise the candidates once so they are comparable with the
    # lowercased element text regardless of how the caller cased them.
    wanted = {c.strip().lower() for c in comparisons if isinstance(c, str)}

    for count, element in enumerate(synonym_elements, start=1):
        text = element.text or ""
        if text.strip().lower() in wanted:
            return True  # Found a match
        if limit is not None and count >= limit:
            break  # Stop searching after 'limit' comparisons
    return False  # No match found within the inspected elements

def synonym_checker(name, parsed, link, syn, driver):
    """Check whether a molecule name appears in the Synonyms list of its PubChem page.

    Parameters
    ----------
    name : str
        Original molecule name.
    parsed : str
        Parsed form of the molecule name.
    link : str or NaN
        URL of the PubChem page; it must contain ``'pubchem'`` to be used.
    syn : str
        Existing synonym flag. ``"YES"`` short-circuits the lookup.
    driver : WebDriver
        Selenium driver used to load the page.

    Returns
    -------
    int
        ``0`` if the name is a synonym (or ``syn`` was already ``"YES"``),
        ``1`` if the page loaded but none of the first 30 listed synonyms
        matched, ``2`` if the link is missing/invalid or the lookup failed.
    """
    if syn == "YES":
        # Already confirmed; no page load needed.
        print(f"{name}: 0")
        return 0

    if pd.isna(link) or 'pubchem' not in str(link):
        print(f"Invalid link for {name}.")
        return 2

    synonym = 2  # Stays in the error state unless the lookup completes
    try:
        # Build the candidate spellings before touching the network, so a
        # non-string name fails fast instead of after a page load.
        # "1 HEXADECANOL" -> "1-HEXADECANOL": PubChem often hyphenates locants.
        parsed_hyphenated = re.sub(r'(\d) (\w)', r'\1-\2', parsed)
        # search_for_synonym lowercases the page text, so only lowercase
        # candidates can ever match; other casings would be dead entries.
        comparisons = {name.lower(), parsed.lower(), parsed_hyphenated.lower()}

        driver.get(link)
        WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Synonyms ul'))
        )
        synonym_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Synonyms ul li')

        synonym_found = search_for_synonym(synonym_elements, comparisons, limit=30)
        synonym = 0 if synonym_found else 1
    except TimeoutException:
        print("Timed out during web operations.")
    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole scan.
        print(f"An unexpected error occurred: {e}")

    print(f"{name}: {synonym}")
    return synonym



def scan(df):
    """Run ``synonym_checker`` over every row of ``df`` and collect text labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Original Molecule'``, ``'Parsed Molecule'``,
        ``'Link'`` and ``'Synonym?'``.

    Returns
    -------
    list of str
        One label per row, in row order: ``"YES"`` for code 0, ``"NO"`` for
        code 1, ``"MISSING"`` for anything else.
    """
    labels = {0: "YES", 1: "NO"}
    total = len(df)  # Drives the progress message for any frame size
    synonym_flags = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        # Iterate positionally over the column values so the loop does not
        # depend on the index labels being unique.
        rows = zip(df['Original Molecule'], df['Parsed Molecule'], df['Link'], df['Synonym?'])
        for name, parsed, link, syn in rows:
            check = synonym_checker(name, parsed, link, syn, driver)
            synonym_flags.append(labels.get(check, "MISSING"))

            print(f"remaining: {total - len(synonym_flags)}")
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return synonym_flags

In [5]:
# Run the scrape over every row; scan() returns one "YES"/"NO"/"MISSING" label per row.
# NOTE(review): binding the result to `list` shadows the builtin for the rest of the
# session; a later cell reads this name, so any rename must change both cells together.
list = scan(df)

1-2-PROPANEDIOL_DIACETATE: 0
remaining: 2903
1-HEXADECANOL: 0
remaining: 2902
1-OCTADECANOL: 0
remaining: 2901
Invalid link for 2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL.
remaining: 2900
2-OXOGLUTARIC_ACID: 0
remaining: 2899
Timed out during web operations.
2-PHENOXYETHANOL: 2
remaining: 2898
2-PROPANOL: 0
remaining: 2897
4-AMINOBUTYRIC_ACID: 0
remaining: 2896
Invalid link for 7-KETO_DEHYDRANDROSTERONE.
remaining: 2895
7-OXO-DEHYDROEPIANDROSTERONE: 0
remaining: 2894
8-QUINOLINOL: 0
remaining: 2893
ABACAVIR: 0
remaining: 2892
ABALOPARATIDE: 0
remaining: 2891
ABARELIX: 0
remaining: 2890
ABATACEPT: 0
remaining: 2889
ABCIXIMAB: 0
remaining: 2888
ABEMACICLIB: 0
remaining: 2887
ABIRATERONE_ACETATE: 0
remaining: 2886
Timed out during web operations.
ABRUS_PRECATORIUS: 2
remaining: 2885
Invalid link for ABSORBABLE_GELATIN/COLLAGEN_SPONGE.
remaining: 2884
Timed out during web operations.
ACACIA_CATECHU: 2
remaining: 2883
Timed out during web operations.
ACACIA_SENEGAL: 2
remaining: 2882
ACALABRUTINI

In [6]:
# Attach the labels returned by scan() as a new column; this mutates df in place.
df['Synonym2?'] = list

In [7]:
# Display the frame, which now carries the 'Synonym2?' column next to 'Synonym?'.
df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,Synonym?,Synonym2?
0,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9, 1432741-27-9, 13423...",COMPOUND,YES,YES
1,1-HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 124-29-8, 55069-45-9, 8014-51-5, ...",COMPOUND,YES,YES
2,1-OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 68911-61-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND,YES,YES
3,2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",,Not Found,,,,MISSING,MISSING
4,2-OXOGLUTARIC_ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND,YES,YES
...,...,...,...,...,...,...,...,...,...
2900,ZIZYPHUS_JUJUBA,ZIZYPHUS JUJUBA,https://pubchem.ncbi.nlm.nih.gov/substance/482...,RELEVANT,,,SUBSTANCE,NO,MISSING
2901,ZOLEDRONIC_ACID,ZOLEDRONIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/68740,FEATURED,118072-93-8,,COMPOUND,YES,YES
2902,ZOLMITRIPTAN,ZOLMITRIPTAN,https://pubchem.ncbi.nlm.nih.gov/compound/60857,FEATURED,139264-17-8,,COMPOUND,YES,YES
2903,ZOLPIDEM,ZOLPIDEM,https://pubchem.ncbi.nlm.nih.gov/compound/5732,FEATURED,82626-48-0,,COMPOUND,YES,YES


In [10]:
# Count the rows where the original flag and the re-scraped flag disagree.
mismatch_mask = df["Synonym?"].ne(df["Synonym2?"])
non_equal_count = mismatch_mask.sum()
non_equal_count

748

In [12]:
# Keep only the rows whose re-scraped flag is "YES", then summarise them.
is_synonym = df["Synonym2?"].eq("YES")
synonyms_df = df[is_synonym]
synonyms_df.describe()

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,Synonym?,Synonym2?
count,2282,2282,2282,2282,2155,958,2281,2282,2282
unique,2282,2282,2270,2,2140,943,2,3,1
top,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/7696...,FEATURED,122575-21-7,"101921-26-0, 102785-31-9, 12656-11-0, 913079-23-9",COMPOUND,YES,YES
freq,1,1,2,2219,2,3,2235,1827,2282


In [13]:
# Write the frame, including the 'Synonym2?' column, to CSV without the index column.
# NOTE(review): DataFrame.to_csv returns None when given a path, so final_df_csv is
# None here; confirm nothing downstream relies on this name holding data.
final_df_csv = df.to_csv('scraped_with_both_synonyms.csv', index = False)