In [1]:
import re
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

import time

import pandas as pd

from selenium.webdriver.common.keys import Keys

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_pine = pd.read_csv("src/smiles_pine.csv")[["name","cas","composition"]]
df_pine.head()

Unnamed: 0,name,cas,composition
0,alpha-Pinene,80-56-8,0.281
1,delta-3-Carene,13466-78-9,0.237
2,Caryophyllene oxide,1139-30-6,0.049
3,delta-Cadinene,483-76-1,0.048
4,Camphene,79-92-5,0.034


In [4]:
df_fetched = pd.read_csv("scaped_vp.csv")
df_fetched = df_fetched[["fetched_name","vapour_pressure","url","vapour_pressure2"]]
df_fetched.head()

Unnamed: 0,fetched_name,vapour_pressure,url,vapour_pressure2
0,(+)-α-Pinene,3.5±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,3.5
1,carene,1.9±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,1.9
2,(−)-β-Caryophyllene epoxide,0.0±0.6 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,0.0
3,d-Cadinene,0.0±0.3 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,0.0
4,Camphene,3.4±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,3.4


In [5]:
df = pd.concat([df_pine,df_fetched], axis=1)
df.head()

Unnamed: 0,name,cas,composition,fetched_name,vapour_pressure,url,vapour_pressure2
0,alpha-Pinene,80-56-8,0.281,(+)-α-Pinene,3.5±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,3.5
1,delta-3-Carene,13466-78-9,0.237,carene,1.9±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,1.9
2,Caryophyllene oxide,1139-30-6,0.049,(−)-β-Caryophyllene epoxide,0.0±0.6 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,0.0
3,delta-Cadinene,483-76-1,0.048,d-Cadinene,0.0±0.3 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,0.0
4,Camphene,79-92-5,0.034,Camphene,3.4±0.1 mmHg at 25°C,https://www.chemspider.com/Chemical-Structure....,3.4


In [6]:
def accept_cookies(driver,url):
    driver.get(url)
    time.sleep(3)
    cookies = driver.find_elements(By.ID, 'onetrust-accept-btn-handler')
    cookies[0].click()
    time.sleep(1)
    return

In [7]:
def detect_water_sol(text):
    # Pattern to extract water solubility values
    # From [logP, stucture, experimental]
    pattern = r"Water Solubility at 25 deg C \(mg/L\):\s+([\d.]+)|Wat Sol \(v1\.01 est\)\s*=\s*([\d.]+) mg/L|Wat Sol \(Exper\. database match\)\s*=\s*([\d.]+)"


    # Find matches
    matches = re.findall(pattern, text)
    labels = ["sol. from logP", "sol. from structure", "sol. experiental"]
    # Extract and format results
    water_sol_values = [None for i in range(3)]
    for match in matches:
        # Each match has two groups; one of them will be filled
        for i, val in enumerate(match):
            if val != '' and water_sol_values[i]==None:
                water_sol_values[i]=float(val)
            elif val != '' and water_sol_values[i]!=None:
                print(f"Double value for index {labels[i]} detected")
            
        #value = next(v for v in match if v)  # Get the non-empty group
        #water_sol_values.append(float(value))  # Convert to float for numerical processing
    return water_sol_values

In [8]:
def get_add_data(url, driver):
    driver.get(url)
    time.sleep(3)
    
    properties_tab = driver.find_element(By.ID, "tab1")
    properties_tab.click()
    time.sleep(1)
    acd_tab = driver.find_element(By.ID, "accordion-acd/labs-title")
    acd_tab.click()
    time.sleep(1)
    logP_label = driver.find_element(By.XPATH, '//tr[th[text()="ACD/LogP"]]/td')
    logP_value = logP_label.text  # Get the text from the corresponding <td>

    

    mol_name = driver.find_element(By.ID, "cmp-title-label").text
    current_url = driver.current_url

    legacy_btns = driver.find_elements(By.ID, "button-download")
    #for i in legacy_btns:
        #print(i.get_attribute('outerHTML'))
    time.sleep(1)
    # Get the parent element using XPath
    legacy_url = legacy_btns[1].find_element(By.XPATH, "..").get_attribute('href')
    #print(legacy_url)
    driver.get(legacy_url)
    time.sleep(3)
    driver.find_element(By.ID, "ctl00_ctl00_ContentSection_ContentPlaceHolder1_RecordViewTabDetailsControl_prop").click()
    time.sleep(2)
    epi_pred = driver.find_elements(By.XPATH, "//*[text()=' - EPISuite']")
    if len(epi_pred) != 1:
        print("Warning: tab error!")
    #print(epi_pred[0].text)
    epi_pred[0].click()

    
    time.sleep(1)
    predicted = driver.find_element(By.TAG_NAME, "pre")
    #print(predicted.text)
    water_sol_values = detect_water_sol(predicted.text)
    #ui-id-7
    #legacy_btns[1].click()
    time.sleep(5)
    return mol_name, logP_value, legacy_url, water_sol_values

In [81]:
def read_extra(df):
    options = webdriver.ChromeOptions()
    options.page_load_strategy = 'normal'
    driver = webdriver.Chrome(options=options)
    url="https://www.chemspider.com/"
    accept_cookies(driver,url)
    #mol_name, logP_value, legacy_url, water_sol_values
    results_df = pd.DataFrame({
        "mol_name":[],
        "logP_value":[],
        "legacy_url":[],
        "water_sol_from_logP":[],
        "water_sol_from_structure":[],
        "water_sol_experimental":[]
    })
    
    for i in df.index:
        row = list(df[["cas", "name", "url"]].loc[i])
        cas_num = row[0]
        name = row[1]
        url = row[2]
        if name=="epsilon-Muurolene":
            url ="https://www.chemspider.com/Chemical-Structure.453991.html"

        if name=="alpha-Ylangene":
            url="https://www.chemspider.com/Chemical-Structure.552232.html"

        print(f" {i} Fetching {cas_num} ({name})", end=" ")
        fetched_name, logP_value, legacy_url, water_sol_values =get_add_data(url, driver) 
        print(f"={fetched_name} logP: {logP_value} VP: {water_sol_values}")
        results_df.loc[len(results_df)] = [fetched_name, logP_value,legacy_url,water_sol_values[0], water_sol_values[1], water_sol_values[2] ]
        if i%10 == 0:
            results_df.to_csv("extra_data.csv")
    driver.quit()

    return results_df

In [9]:
def read_extra_from_save(df):
    
    options = webdriver.ChromeOptions()
    options.page_load_strategy = 'normal'
    driver = webdriver.Chrome(options=options)
    url="https://www.chemspider.com/"
    accept_cookies(driver,url)
    #mol_name, logP_value, legacy_url, water_sol_values
    results_df = pd.read_csv("extra_data.csv")[["mol_name","logP_value","legacy_url","water_sol_from_logP","water_sol_from_structure","water_sol_experimental"]]
    first_index = len(results_df)
    for i in df.index:
        if i >= first_index:
            row = list(df[["cas", "name", "url"]].loc[i])
            cas_num = row[0]
            name = row[1]
            url = row[2]
            if name=="epsilon-Muurolene":
                url ="https://www.chemspider.com/Chemical-Structure.453991.html"

            if name=="alpha-Ylangene":
                url="https://www.chemspider.com/Chemical-Structure.552232.html"

            print(f" {i} Fetching {cas_num} ({name})", end=" ")
            fetched_name, logP_value, legacy_url, water_sol_values =get_add_data(url, driver) 
            print(f"={fetched_name} logP: {logP_value} VP: {water_sol_values}")
            results_df.loc[len(results_df)] = [fetched_name, logP_value,legacy_url,water_sol_values[0], water_sol_values[1], water_sol_values[2] ]
            if i%10 == 0:
                results_df.to_csv("extra_data.csv", index=False)
    driver.quit()

    return results_df

In [77]:
soldata_df = read_extra(df)

 0 Fetching 80-56-8 (alpha-Pinene) =(+)-α-Pinene logP: 4.37 VP: [4.071, 3.4834, 2.49]
 1 Fetching 13466-78-9 (delta-3-Carene) =carene logP: 4.37 VP: [4.581, 3.4834, None]
 2 Fetching 1139-30-6 (Caryophyllene oxide) =(−)-β-Caryophyllene epoxide logP: 4.57 VP: [2.21, 4.626, None]
 3 Fetching 483-76-1 (delta-Cadinene) =d-Cadinene logP: 6.54 VP: [0.04863, 0.80763, None]
 4 Fetching 79-92-5 (Camphene) =Camphene logP: 4.37 VP: [6.275, 2.6192, 4.6]
 5 Fetching 87-44-5 (beta-Caryophyllene) =(−)-β-caryophyllene logP: 6.78 VP: [0.05011, 0.54268, None]
 6 Fetching 31983-22-9 (alpha-Muurolene) =α-Muurolene logP: 6.56 VP: [0.06276, 0.5123, None]
 7 Fetching 481-34-5 (alpha-Cadinol) =alpha-cadinol logP: 4.95 VP: [9.13, 14.739, None]
 8 Fetching 586-62-9 (Terpinolene) =TERPINOLENE logP: 4.67 VP: [3.838, 93.066, 9.5]
 9 Fetching 127-91-3 (beta-Pinene) =(±)-β-Pinene logP: 4.37 VP: [7.061, 2.6192, None]
 10 Fetching 123-35-3 (Myrcene) =β-Myrcene logP: 4.58 VP: [6.923, 17.814, 5.6]
 11 Fetching 30021-74-

IndexError: list index out of range

In [10]:
soldata_df = read_extra_from_save(df)

 51 Fetching 947-59-1 (alpha-Ylangene) =(1S,2R,6R,7R,8R)-8-Isopropyl-1,3-dimethyltricyclo[4.4.0.0~2,7~]dec-3-ene logP: 6.17 VP: [0.3163, 0.030229, None]
 52 Fetching 141-12-8 (Neryl acetate) =Neryl Acetate logP: 4.10 VP: [18.24, 57.985, None]
 53 Fetching 105-87-3 (Geranyl acetate) =Geranyl acetate logP: 4.10 VP: [18.24, 57.985, None]


In [12]:
len(soldata_df)

54

In [13]:
soldata_df.to_csv("extra_data.csv", index=False)

In [80]:
df.loc[51]["url"]

'https://www.chemspider.com/Chemical-Structure.34987048.html'

In [82]:
options = webdriver.ChromeOptions()
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
url="https://www.chemspider.com/"
accept_cookies(driver,url)
url = df.loc[51]["url"]
print(get_add_data(url, driver))
driver.quit()



IndexError: list index out of range