In [1]:
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

import time

import pandas as pd

In [2]:
def query_pubchem_cid(cas):
    """
    
    """
    #url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/name/{cas}/property/VaporPressure/JSON"
    url =f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas}/property/Title,CanonicalSMILES/JSON"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()["PropertyTable"]["Properties"][0]
        print(f"Fetching {cas} ({data['Title']})")
        return data["CID"]
    else:
        return None

In [3]:
query_pubchem_cid("99-87-6")

Fetching 99-87-6 (p-CYMENE)


7463

In [4]:
def query_pubchem_by_cas(cas_number, driver):
    # Set up WebDriver (replace 'path_to_chromedriver' with your ChromeDriver path)
    #service = Service("C:/Users/paulj/Downloads/chromedriver_win32/chromedriver.exe")

    #driver = webdriver.Chrome(service=service)
    cid = query_pubchem_cid(cas_number)
    url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}#section=Vapor-Pressure&fullscreen=true"
    try:
        driver.get(url)
        time.sleep(3)  # Allow time for the page to load

        # Locate and parse the Antoine coefficients table
        section = driver.find_elements(By.TAG_NAME, "section")
        if len(section)>1:
            print("long list detected")
        section= section[0]
        if section.get_attribute("id") != "Vapor-Pressure":
            print("Not Vapor-Pressure detected")
        soup = BeautifulSoup(section.get_attribute("outerHTML"), "html.parser")
        data=soup.find_all("div", class_="break-words space-y-1")
        #values = section.find_elements(By.CLASS_NAME, "break-words space-y-1")
        #print(values)
        #result = [value.get_attribute("outerHTML") for value in values]

        result=data
        result = [v.getText() for v in data]
        return result
    except Exception as e:
        print(f"Error retrieving data for CAS {cas_number}: {e}")
        return None


In [5]:
def read_cas_data(csv_filepath="src/smiles_pine.csv"):
    df = pd.read_csv(csv_filepath)
    df = df[["cas", "name"]]
    return df 


In [6]:
def main():
    df = read_cas_data()
    options = webdriver.ChromeOptions()
    options.page_load_strategy = 'normal'
    driver = webdriver.Chrome(options=options)

    results = []

    for i in df.index:
        row = list(df.loc[i])
        cas_num = row[0]
        name = row[1]
        print(f"Fetching {cas_num} ({name})")
        results.append(query_pubchem_by_cas(cas_num, driver))

    driver.quit()

    return results

In [None]:
scraped = main()

Fetching 80-56-8 (alpha-Pinene)
Fetching 80-56-8 (alpha-PINENE)
Fetching 13466-78-9 (delta-3-Carene)
Fetching 13466-78-9 (3-Carene)
Fetching 1139-30-6 (Caryophyllene oxide)
Fetching 1139-30-6 (beta-CARYOPHYLLENE OXIDE)
Error retrieving data for CAS 1139-30-6: list index out of range
Fetching 483-76-1 (delta-Cadinene)
Fetching 483-76-1 ((+)-delta-Cadinene)
Error retrieving data for CAS 483-76-1: list index out of range
Fetching 79-92-5 (Camphene)
Fetching 79-92-5 (Camphene)
Fetching 87-44-5 (beta-Caryophyllene)
Fetching 87-44-5 (Caryophyllene)
Error retrieving data for CAS 87-44-5: list index out of range
Fetching 31983-22-9 (alpha-Muurolene)
Fetching 31983-22-9 (alpha-Muurolene)
Error retrieving data for CAS 31983-22-9: list index out of range
Fetching 481-34-5 (alpha-Cadinol)
Fetching 481-34-5 (Cadin-4-en-10-ol)
Error retrieving data for CAS 481-34-5: list index out of range
Fetching 586-62-9 (Terpinolene)
Fetching 586-62-9 (Terpinolene)
Fetching 127-91-3 (beta-Pinene)
Fetching 127-91

[['10 mmHg at 99.1 °F (NTP, 1992) ',
  '4.75 [mmHg] ',
  '4.75 mm Hg at 25 °C ',
  '10 mmHg at 99.1 °F '],
 ['3.72 [mmHg] '],
 None,
 None,
 ['2.5 [mmHg] ', '2.5 mm Hg at 25 °C ', 'Vapor pressure, kPa at 20 °C: 0.4 '],
 None,
 None,
 None,
 ['0.74 [mmHg] ', '0.74 mm Hg at 25 °C '],
 ['2.93 [mmHg] ', '2.93 mm Hg at 25 °C ', '2.93 mmHg '],
 ['4.61 mmHg (USCG, 1999) ', '2.01 [mmHg] ', '2.09 mm Hg at 25 °C '],
 None,
 ['0.6 [mmHg] '],
 None,
 None,
 None,
 None,
 ['1.55 [mmHg] ', '1.55 mm Hg at 25 °C /extrapolated/ '],
 None,
 None,
 ['1.9 [mmHg] ', '1.90 mm Hg at 25 °C '],
 ['1.59 [mmHg] '],
 None,
 None,
 None,
 None,
 ['0.03 [mmHg] ', '5.02X10-2 mm Hg at 25 °C '],
 None,
 ['1.09 [mmHg] '],
 None,
 None,
 None,
 None,
 None,
 None,
 ['1.4 [mmHg] '],
 None,
 None,
 None,
 None,
 ['0.04 [mmHg] ',
  '0.03 [mmHg] ',
  'VP: 5 mm Hg at 80 to 81 mm Hg /L-alpha-Terpineol/',
  '0.0423 mm Hg at 24 °C '],
 ['1.46 [mmHg] ', '1.50 mm Hg at 25 °C ', 'Vapor pressure, Pa at 20 °C: 200 '],
 None,
 None,


In [8]:
"""cas_numbers =['99-87-6']

for cas in cas_numbers:
    data = query_pubchem_by_cas(cas)
    if data:
        print(f"Data for CAS {cas}: {data}")
    else:
        print(f"No data found for CAS {cas}")"""

'cas_numbers =[\'99-87-6\']\n\nfor cas in cas_numbers:\n    data = query_pubchem_by_cas(cas)\n    if data:\n        print(f"Data for CAS {cas}: {data}")\n    else:\n        print(f"No data found for CAS {cas}")'

In [10]:
results = [['10 mmHg at 99.1 °F (NTP, 1992) ',
  '4.75 [mmHg] ',
  '4.75 mm Hg at 25 °C ',
  '10 mmHg at 99.1 °F '],
 ['3.72 [mmHg] '],
 None,
 None,
 ['2.5 [mmHg] ', '2.5 mm Hg at 25 °C ', 'Vapor pressure, kPa at 20 °C: 0.4 '],
 None,
 None,
 None,
 ['0.74 [mmHg] ', '0.74 mm Hg at 25 °C '],
 ['2.93 [mmHg] ', '2.93 mm Hg at 25 °C ', '2.93 mmHg '],
 ['4.61 mmHg (USCG, 1999) ', '2.01 [mmHg] ', '2.09 mm Hg at 25 °C '],
 None,
 ['0.6 [mmHg] '],
 None,
 None,
 None,
 None,
 ['1.55 [mmHg] ', '1.55 mm Hg at 25 °C /extrapolated/ '],
 None,
 None,
 ['1.9 [mmHg] ', '1.90 mm Hg at 25 °C '],
 ['1.59 [mmHg] '],
 None,
 None,
 None,
 None,
 ['0.03 [mmHg] ', '5.02X10-2 mm Hg at 25 °C '],
 None,
 ['1.09 [mmHg] '],
 None,
 None,
 None,
 None,
 None,
 None,
 ['1.4 [mmHg] '],
 None,
 None,
 None,
 None,
 ['0.04 [mmHg] ',
  '0.03 [mmHg] ',
  'VP: 5 mm Hg at 80 to 81 mm Hg /L-alpha-Terpineol/',
  '0.0423 mm Hg at 24 °C '],
 ['1.46 [mmHg] ', '1.50 mm Hg at 25 °C ', 'Vapor pressure, Pa at 20 °C: 200 '],
 None,
 None,
 None,
 ['40 mmHg at 355.6 °F ; 1 mmHg at 214.3 °F; 760 mmHg at 527 °F (NTP, 1992) '],
 None,
 ['0.04 [mmHg] ', '0.02 mm Hg at 25 °C '],
 ['0.03 [mmHg] '],
 None,
 ['0.2 mmHg (NIOSH, 2024) ',
  '0.65 [mmHg] ',
  'Vapor pressure = 27 Pa at 20 °C (= 0.20 mm Hg) ',
  '0.65 mm Hg at 25 °C ',
  'Vapor pressure, Pa at 20 °C: 27 ',
  '0.2 mmHg '],
 None,
 None,
 ['0.02 mmHg at 68 °F ; 1 mmHg at 164.3 °F (NTP, 1992) ', '0.03 [mmHg] ']]

In [21]:
lsts = [[None for i in results] for i in range(6)]
print(lsts)
for i, result in enumerate(results):
    if result!=None:
        for j, element in enumerate(result):
            lsts[j][i] = result[j]

    else:
        for j in range(6):
            lsts[j][i] = None

lsts

[[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], [None, None, None, No

[['10 mmHg at 99.1 °F (NTP, 1992) ',
  '3.72 [mmHg] ',
  None,
  None,
  '2.5 [mmHg] ',
  None,
  None,
  None,
  '0.74 [mmHg] ',
  '2.93 [mmHg] ',
  '4.61 mmHg (USCG, 1999) ',
  None,
  '0.6 [mmHg] ',
  None,
  None,
  None,
  None,
  '1.55 [mmHg] ',
  None,
  None,
  '1.9 [mmHg] ',
  '1.59 [mmHg] ',
  None,
  None,
  None,
  None,
  '0.03 [mmHg] ',
  None,
  '1.09 [mmHg] ',
  None,
  None,
  None,
  None,
  None,
  None,
  '1.4 [mmHg] ',
  None,
  None,
  None,
  None,
  '0.04 [mmHg] ',
  '1.46 [mmHg] ',
  None,
  None,
  None,
  '40 mmHg at 355.6 °F ; 1 mmHg at 214.3 °F; 760 mmHg at 527 °F (NTP, 1992) ',
  None,
  '0.04 [mmHg] ',
  '0.03 [mmHg] ',
  None,
  '0.2 mmHg (NIOSH, 2024) ',
  None,
  None,
  '0.02 mmHg at 68 °F ; 1 mmHg at 164.3 °F (NTP, 1992) '],
 ['4.75 [mmHg] ',
  None,
  None,
  None,
  '2.5 mm Hg at 25 °C ',
  None,
  None,
  None,
  '0.74 mm Hg at 25 °C ',
  '2.93 mm Hg at 25 °C ',
  '2.01 [mmHg] ',
  None,
  None,
  None,
  None,
  None,
  None,
  '1.55 mm Hg at 25 

In [24]:
new_df=pd.DataFrame(
    {
        i: lsts[i] for i in range(6)
    }
)
new_df

Unnamed: 0,0,1,2,3,4,5
0,"10 mmHg at 99.1 °F (NTP, 1992)",4.75 [mmHg],4.75 mm Hg at 25 °C,10 mmHg at 99.1 °F,,
1,3.72 [mmHg],,,,,
2,,,,,,
3,,,,,,
4,2.5 [mmHg],2.5 mm Hg at 25 °C,"Vapor pressure, kPa at 20 °C: 0.4",,,
5,,,,,,
6,,,,,,
7,,,,,,
8,0.74 [mmHg],0.74 mm Hg at 25 °C,,,,
9,2.93 [mmHg],2.93 mm Hg at 25 °C,2.93 mmHg,,,
