In [None]:
import urllib2
import urllib
from collections import OrderedDict

from bs4 import BeautifulSoup
from cookielib import CookieJar

In [None]:
# Build a URL opener whose requests all share one cookie jar, so cookies
# set by an earlier response are sent along with later requests
cj = CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_handler)

In [None]:
# Fetch a page through the shared cookie-aware opener and parse it
def get(url, data=None):
    """Open ``url`` with the module-level ``opener`` and return the parsed page.

    Args:
        url: Address to request.
        data: Optional request body, handed to ``opener.open`` unchanged.
            ``None`` (the default) sends no body.

    Returns:
        A ``BeautifulSoup`` object built from the response using the
        ``'html.parser'`` parser.

    Any exception raised by ``opener.open`` propagates to the caller.
    """
    response = opener.open(url, data=data)
    try:
        # BeautifulSoup consumes the file-like response while building the
        # tree, so the connection can be released as soon as parsing is done.
        return BeautifulSoup(response, 'html.parser')
    finally:
        # Close explicitly instead of relying on garbage collection; the
        # scrape issues thousands of requests in one session.
        response.close()

In [None]:
# Search for experimental data on the species page
def get_expt(spc_soup):
    """Return the URL of the experimental-data page linked from a species page.

    Scans every table row for a cell whose stripped text is exactly
    'Enthalpy 298.15K'. If the cell immediately after it contains an 'x'
    and holds a link, that link's href is appended to the module-level
    ``base_url`` and returned.

    Args:
        spc_soup: Parsed species page (object exposing ``find_all``).

    Returns:
        The absolute URL as a string, or ``False`` when no matching,
        linked cell exists on the page.
    """
    for row in spc_soup.find_all('tr'):
        columns = row.find_all('td')
        # Pair each cell with its right-hand neighbour; a label sitting in the
        # last cell of a row simply has no pair instead of raising IndexError.
        for label_cell, data_cell in zip(columns, columns[1:]):
            if label_cell.get_text().strip() != 'Enthalpy 298.15K':
                continue
            # Next column is experimental data; it is only used when marked 'x'
            if 'x' not in data_cell.get_text():
                continue
            anchor = data_cell.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                return base_url + href
            # Marked cell without a usable link: keep scanning the page.
    return False

In [None]:
# Search for InChI from experimental data page
def get_inchi(expt_soup):
    """Return the text of the first table cell that contains 'InChI='.

    Cells are visited row by row in document order, and each cell's text
    is stripped of surrounding whitespace before the check.

    Args:
        expt_soup: Parsed experimental-data page (object exposing ``find_all``).

    Returns:
        The stripped cell text, or ``None`` when no cell contains 'InChI='.
    """
    # Lazy stream of stripped cell texts, so scanning stops at the first hit
    cell_texts = (
        cell.get_text().strip()
        for table_row in expt_soup.find_all('tr')
        for cell in table_row.find_all('td')
    )
    return next((candidate for candidate in cell_texts if 'InChI=' in candidate), None)

In [None]:
# Search for H298 data from experimental data page
def get_h(expt_soup):
    """Return the cells that follow the 'Hfg(298.15K)' label in a table row.

    Scans every table row for a cell whose stripped text is exactly
    'Hfg(298.15K)'. The next three cells are taken as value, uncertainty
    and units, in that order.

    Args:
        expt_soup: Parsed experimental-data page (object exposing ``find_all``).

    Returns:
        A 3-tuple ``(value, uncertainty, units)`` of stripped strings.
        Trailing cells that do not exist in the row come back as ``''``.
        ``None`` is returned when no label with at least one following
        cell is found.
    """
    for row in expt_soup.find_all('tr'):
        columns = row.find_all('td')
        for i, column in enumerate(columns):
            if column.get_text().strip() != 'Hfg(298.15K)':
                continue
            # Slicing never raises, unlike indexing columns[i + 1..i + 3]
            following = [cell.get_text().strip() for cell in columns[i + 1:i + 4]]
            if not following:
                # Label is the last cell of its row: nothing to read here,
                # so keep looking for another occurrence.
                continue
            # Pad short rows so callers can always unpack three items
            following.extend([''] * (3 - len(following)))
            return tuple(following)
    return None

In [None]:
# Site root (with trailing slash) and the page listing all species
base_url = 'https://cccbdb.nist.gov/'
list_url = '{0}listallx.asp'.format(base_url)

In [None]:
# Retrieve full species list: fetch the page at list_url through get() and
# keep the parsed result for link extraction
list_soup = get(list_url)

In [None]:
# Collect species links from the list page: every anchor whose href contains
# 'casno=' is stored as {link text: href}, in page order
links = list_soup.find_all('a')

spc_list = OrderedDict()
for link in links:
    if not (link.has_attr('href') and 'casno=' in link['href']):
        continue
    # It seems that some link to old data page? Replace with new link for consistency
    spc_list[link.get_text()] = link['href'].replace('alldata2.asp', 'alldata2x.asp')

print(len(spc_list))

In [None]:
# Perform a normal request to set up cookies properly
# For some reason, following links on the species list page doesn't work unless you do a normal search first
data = urllib.urlencode({'formula': 'H', 'SUBMIT1': 'Submit'})
# Derive the form address from base_url like every other URL in this
# notebook, rather than repeating the host name
form_url = base_url + 'getformx.asp'
test_soup = get(form_url, data)

In [None]:
# Try getting experimental data for every species in the list
#   errors:   species name -> partial URL for every species whose pages could
#             not be fetched, so they can be retried later
#   all_data: one [inchi, value, uncertainty, units] row per species with data
errors = OrderedDict()
all_data = []
for name, partial_url in spc_list.iteritems():
    # Skip ions (?)
    # NOTE(review): this is a substring test, so it also skips any name that
    # merely contains 'ion' (e.g. 'propionic') - confirm this is intended.
    if 'ion' in name.lower():
        continue
    spc_url = base_url + partial_url
    try:
        spc_soup = get(spc_url)
        expt_url = get_expt(spc_soup)
        # The experimental page is fetched inside the same try block so a
        # failure there is recorded too instead of aborting the whole run.
        expt_soup = get(expt_url) if expt_url else None
    except urllib2.URLError as e:
        # Server or connection error (HTTPError is a subclass of URLError),
        # might work if we try again later, so save the species.
        errors[name] = partial_url
        # str(e) carries the status/reason; the deprecated e.message
        # attribute is not populated by urllib2's error classes.
        print(name + ' - ' + str(e))
        continue
    if expt_soup is None:
        print(name + ' - no expt data found')
        continue
    inchi = get_inchi(expt_soup)
    h_data = get_h(expt_soup)
    if h_data is None:
        # get_h found no 'Hfg(298.15K)' entry; unpacking None would raise
        print(name + ' - expt page has no Hfg(298.15K) entry')
        continue
    value, uncertainty, units = h_data
    # Empty cells become None rather than being passed to float()
    value = float(value) if value else None
    uncertainty = float(uncertainty) if uncertainty else None
    all_data.append([inchi, value, uncertainty, units])
    print(name + ' - found expt data')