# CCCBDB Requests and parser playground

In [61]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from html.parser import HTMLParser
import pandas as pd
import io
import re
from pathlib import Path

In [62]:
# Endpoints of the NIST CCCBDB site used below, all under one site root.
CCCBDB_ROOT = "https://cccbdb.nist.gov"
FORM_URL = f"{CCCBDB_ROOT}/getformx.asp"  # the search form is POSTed here
EXP1_URL = f"{CCCBDB_ROOT}/exp1x.asp"  # sent as Referer of the form POST
EXP2_URL = f"{CCCBDB_ROOT}/exp2x.asp"  # data page fetched after the POST

def headers(**kwargs):
    """Build the HTTP request headers for a CCCBDB query.

    A fixed set of browser-like defaults is overlaid with ``kwargs``, so the
    caller can add or replace individual headers, for example
    ``headers(Referer="https://cccbdb.nist.gov/exp1x.asp")``.

    ``Content-Length`` is deliberately not among the defaults: its value
    depends on the body that is actually sent, and ``requests`` computes it
    when the request is prepared. A hard-coded value is either overridden
    (request with a body) or wrong (request without one).

    Args:
        **kwargs: Header name/value pairs that take precedence over the
            defaults. Names that are not valid identifiers can be passed via
            dict unpacking, e.g. ``headers(**{"X-Custom": "1"})``.

    Returns:
        A new ``dict`` of header names to values on every call.
    """
    defaults = {
        "Host": "cccbdb.nist.gov",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Origin": "https://cccbdb.nist.gov",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-CA,en-GB;q=0.8,en-US;q=0.6,en;q=0.4",
        # "Referer" depends on the query; pass it as a keyword argument.
    }
    defaults.update(kwargs)
    return defaults

In [63]:
def get_exp_data_by_cas(cas: int, timeout: float = 30.0) -> bytes:
    """Download the CCCBDB experimental-data page for one compound.

    The query takes two requests made through one ``requests.Session``:

    1. The CAS number is POSTed to the search form (``FORM_URL``) with
       redirects disabled; the server is expected to answer with HTTP 302.
    2. The data page is fetched with a GET on ``EXP2_URL``.

    Both requests share the session, presumably because the server ties the
    result page to the session cookie set by the form POST -- TODO confirm.

    Args:
        cas: CAS registry number, sent as the ``formula`` form field.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The raw body of the data page.

    Raises:
        requests.RequestException: If a request fails or times out, if the
            form POST does not answer with 302, or if the data page does not
            answer with 200.
    """
    payload = {
        "formula": cas,
        "submit1": "Submit",
    }
    query_headers = headers(Referer=EXP1_URL)

    # The context manager releases the session's connections on every exit path.
    with requests.Session() as session:
        print(f"{cas:>12} :: Submitting Query ...")
        query_res = session.post(
            FORM_URL,
            data=payload,
            headers=query_headers,
            allow_redirects=False,
            timeout=timeout,
        )
        if query_res.status_code != 302:
            raise requests.RequestException(
                f"Wrong Query response {query_res.status_code}: must be 302"
            )

        print(f"{cas:>12} :: Collecting Data ...")
        data_res = session.get(EXP2_URL, timeout=timeout)
        if data_res.status_code != 200:
            raise requests.RequestException(
                f"Wrong Data response {data_res.status_code}: must be 200"
            )

        return data_res.content

In [64]:
cas_ids = [
    71432,  # Benzene
    78820,  # 2-methylpropanenitrile
]
MAX_TRIES = 5  # download attempts per compound
RETRY_DELAY = 5  # seconds to wait between two attempts

# Download the data page of every compound and dump it to "<cas>.html".
for cas in cas_ids:
    print(f"{cas:>12} :: Processing CCCBDB ...")
    for try_ in range(MAX_TRIES):
        try:
            data = get_exp_data_by_cas(cas)
            break
        except requests.RequestException as e:
            print(f"{cas:>12} :: try {try_ + 1}")
            print(f"{cas:>12} :: {e}")
            # No point in waiting once the last attempt has failed.
            if try_ + 1 < MAX_TRIES:
                sleep(RETRY_DELAY)
    else:
        # Every attempt failed: `data` is either undefined or still holds the
        # previous compound's page, so nothing must be written for this one.
        print(f"{cas:>12} :: Failed after {MAX_TRIES} tries, skipping\n")
        continue
    with open(f"{cas}.html", "bw") as dump:
        dump.write(data)
    print(f"{cas:>12} :: Done\n")

       71432 :: Processing CCCBDB ...
       71432 :: Submitting Query ...
       71432 :: Collecting Data ...
       71432 :: Done

       78820 :: Processing CCCBDB ...
       78820 :: Submitting Query ...
       78820 :: Collecting Data ...
       78820 :: Done



## Parse Tables

In [65]:
# Load the saved benzene page as raw bytes and parse it with the stdlib-backed parser.
content = Path("71432.html").read_bytes()
soup = BeautifulSoup(content, "html.parser")

In [66]:
def decorate_html(tag):
    """Wrap an HTML fragment into a minimal standalone document.

    ``tag`` is formatted into the body of a fixed
    ``<!DOCTYPE HTML><html><body>...</body></html>`` skeleton and the result
    is returned as an in-memory text buffer (``io.StringIO``).
    """
    document = f"<!DOCTYPE HTML><html><body>{tag}</body></html>"
    return io.StringIO(document)

In [67]:
# Locate the "box" <div> that holds the vibrational data and wrap its first
# <table> into a standalone HTML document.
vibrations_div = soup.find(
    "div",
    {"class": "box", "title": "Vibrational symmetries, frequencies, and intensities"},
)
vibrations_table = decorate_html(vibrations_div.find("table"))

# Same for the references box, where the table is selected by its id.
refs_div = soup.find("div", {"class": "box", "title": "References"})
refs_table = decorate_html(refs_div.find("table", {"id": "reftable"}))

In [68]:
# Parse the wrapped vibrations table and display the first frame read_html found.
# NOTE(review): vibrations_table is a StringIO buffer, so re-running this cell
# without rebuilding the buffer presumably reads from an exhausted stream -- confirm.
pd.read_html(vibrations_table)[0]

Unnamed: 0_level_0,Mode Number,Symmetry,Frequency,Frequency,Frequency,Intensity,Intensity,Intensity,Comment,Description
Unnamed: 0_level_1,Mode Number,Symmetry,Fundamental(cm-1),Harmonic(cm-1),Reference,(km mol-1),unc.,Reference,Comment,Description
0,1,A1g,3062,,Shim,,,,,
1,2,A1g,992,,Shim,,,,,
2,3,A2g,1326,,Shim,,,,,
3,4,A2u,673,,Shim,,,,,
4,5,B1u,3068,,Shim,,,,,
5,6,B1u,1010,,Shim,,,,,
6,7,B2g,995,,Shim,,,,,
7,8,B2g,703,,Shim,,,,,
8,9,B2u,1310,,Shim,,,,,
9,10,B2u,1150,,Shim,,,,,


In [69]:
# Parse the wrapped references table and display the first frame read_html found.
# NOTE(review): refs_table is a StringIO buffer, so re-running this cell
# without rebuilding the buffer presumably reads from an exhausted stream -- confirm.
pd.read_html(refs_table)[0]

Unnamed: 0,squib,reference,DOI
0,1966Herzberg,"Herzberg, G., Electronic spectra and electroni...",
1,1981Bat/Buc:421,"MR Battaglia, AD Buckingham, JH Williams ""The ...",10.1016/0009-2614(81)85228-1
2,1998Gus/Rui:163,"M Gussoni, R Rui, G Zerbi ""Electronic and rela...",10.1016/S0022-2860(97)00292-5
3,NSRDS-NBS10,"R. D. Nelson Jr., D. R. Lide, A. A. Maryott ""S...",10.6028/NBS.NSRDS.10
4,Shim,"Shimanouchi, T. , Tables of Molecular Vibratio...",10.6028/NBS.NSRDS.39
5,TRC,"Frenkel, M; Marsh, K.N.; Wilhoit, R.C.; Kabo, ...",
6,webbook,NIST Chemistry Webbook (http://webbook.nist.go...,10.18434/T4D303


### Wrapping the parsing in a function

In [77]:
def _find_box_table(soup, title, **table_attrs):
    """Return the <table> inside the ``div.box`` titled ``title``, or None if absent."""
    box = soup.find("div", attrs={"class": "box", "title": title})
    if box is None:
        return None
    return box.find("table", attrs=table_attrs)


def extract_info(cas, html_content, data_dir=Path("./")):
    """Extract the vibrations and references tables of a CCCBDB page to CSV.

    The page is parsed with BeautifulSoup. For each of the two sections the
    table is located inside its ``<div class="box" title=...>``, wrapped into
    a standalone document with ``decorate_html``, read with
    ``pandas.read_html`` and written to disk:

    * ``<data_dir>/<cas>_vibs.csv`` -- first table of the section titled
      "Vibrational symmetries, frequencies, and intensities";
    * ``<data_dir>/<cas>_refs.csv`` -- table with ``id="reftable"`` of the
      section titled "References".

    Args:
        cas: Identifier used as the prefix of the output file names.
        html_content: HTML of the data page (``bytes`` or ``str``).
        data_dir: Directory that receives the CSV files; a ``str`` or a
            ``Path``. It is not created here.

    Raises:
        ValueError: If a section or its table is missing from the page.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Accept plain strings too; the "/" operator below needs a Path.
    data_dir = Path(data_dir)

    # (file suffix, section title, attributes selecting the table in the section)
    sections = (
        ("vibs", "Vibrational symmetries, frequencies, and intensities", {}),
        ("refs", "References", {"id": "reftable"}),
    )
    for suffix, title, table_attrs in sections:
        table = _find_box_table(soup, title, **table_attrs)
        if table is None:
            # Fail with the section name instead of an AttributeError on None.
            raise ValueError(f"{cas}: no table found in section {title!r}")
        frame = pd.read_html(decorate_html(table))[0]
        frame.to_csv(data_dir / f"{cas}_{suffix}.csv")

In [78]:
# Run the extraction on the pages dumped earlier as "<cas>.html".
for cas in (78820, 71432):
    html_content = Path(f"{cas}.html").read_bytes()
    extract_info(cas, html_content)

## Full List of Compounds

In [45]:
# Read the compound list from a local HTML file (presumably a saved copy of the
# CCCBDB species list -- confirm). With extract_links="body" every body cell
# becomes a (text, href) tuple; the list is the table at index 3 on that page.
raw_table = pd.read_html("cccbdb_list.html", extract_links="body")[3]

In [46]:
# Inspect the raw table: cells are (text, href) tuples, and section rows such as
# "36 atoms" repeat the same text in every column with no link.
raw_table

Unnamed: 0_level_0,link,species,name,atoms,sketch
Unnamed: 0_level_1,1 atom,1 atom,1 atom,1 atom,1 atom
0,"(data, alldata2.asp?casno=12385136&charge=-1)","(H-, None)","(Hydrogen atom anion, None)","(1, None)","(H-, None)"
1,"(data, alldata2.asp?casno=12385136&charge=0)","(H, None)","(Hydrogen atom, None)","(1, None)","(H, None)"
2,"(data, alldata2.asp?casno=12385136&charge=1)","(H+, None)","(Hydrogen atom cation, None)","(1, None)","(H+, None)"
3,"(data, alldata2.asp?casno=16873179&charge=-1)","(D-, None)","(Deuterium atom anion, None)","(1, None)","(D-, None)"
4,"(data, alldata2.asp?casno=16873179&charge=0)","(D, None)","(Deuterium atom, None)","(1, None)","(D, None)"
...,...,...,...,...,...
2213,"(data, alldata2.asp?casno=519733&charge=0)","(C19H16, None)","(Triphenylmethane, None)","(35, None)","(, None)"
2214,"(36 atoms, None)","(36 atoms, None)","(36 atoms, None)","(36 atoms, None)","(36 atoms, None)"
2215,"(data, alldata2.asp?casno=191071&charge=0)","(C24H12, None)","(Coronene, None)","(36, None)","(, None)"
2216,"(60 atoms, None)","(60 atoms, None)","(60 atoms, None)","(60 atoms, None)","(60 atoms, None)"


In [49]:
# Empty frame to collect the cleaned-up columns derived from raw_table.
processed_table = pd.DataFrame()

In [51]:
# Matches data links such as "alldata2.asp?casno=7440553&charge=-1".
# Group 1 is the CAS number, group 2 the (optionally signed) charge.
cas_ch_re = re.compile(
    r"^alldata2\.asp\?casno=(\d+)&charge=([-+]?[0-9]+)$",
    re.I,
)


def _match_link(line):
    """Return the ``cas_ch_re`` match for ``line``, or None for non-strings and non-matches."""
    if not isinstance(line, str):
        return None
    return cas_ch_re.match(line)


def extract_cas(line):
    """Return the CAS number of a CCCBDB data link as a string.

    Returns None when ``line`` is not a string (e.g. None or NaN) or does not
    look like ``alldata2.asp?casno=<digits>&charge=<int>``.
    """
    m = _match_link(line)
    return m.group(1) if m else None


def extract_ch(line):
    """Return the charge of a CCCBDB data link as an int.

    Returns None when ``line`` is not a string (e.g. None or NaN) or does not
    look like ``alldata2.asp?casno=<digits>&charge=<int>``.
    """
    m = _match_link(line)
    return int(m.group(2)) if m else None

In [52]:
# Quick check on a sample link: the CAS number comes back as a string.
extract_cas("alldata2.asp?casno=7440553&charge=-1")

'7440553'

In [53]:
# Quick check on the same sample link: the signed charge comes back as an int.
extract_ch("alldata2.asp?casno=7440553&charge=-1")

-1

In [54]:
# raw_table has a two-level header whose second level is "1 atom", and each of
# its cells is a (text, href) pair: the link column keeps the href, the other
# columns keep the text.
processed_table["link"] = raw_table[("link", "1 atom")].map(lambda cell: cell[1])

for column in ("species", "name"):
    processed_table[column] = raw_table[(column, "1 atom")].map(lambda cell: cell[0])

# Section rows ("2 atoms", ...) are not plain digits and are stored as 0.
processed_table["atoms"] = raw_table[("atoms", "1 atom")].map(
    lambda cell: int(cell[0]) if cell[0].isdigit() else 0
)

# CAS number and charge are both parsed out of the link.
for column, parser in (("cas_no", extract_cas), ("charge", extract_ch)):
    processed_table[column] = processed_table["link"].map(parser)

In [55]:
# Sanity check: sort by atom count, then CAS number, and show the first 40 rows.
# Section-header rows have no link and atoms == 0, so they come first; cas_no
# holds strings, so ties within one atom count are ordered lexicographically.
processed_table.sort_values(["atoms", "cas_no"], axis=0).head(40)

Unnamed: 0,link,species,name,atoms,cas_no,charge
107,,2 atoms,2 atoms,0,,
558,,3 atoms,3 atoms,0,,
826,,4 atoms,4 atoms,0,,
1050,,5 atoms,5 atoms,0,,
1227,,6 atoms,6 atoms,0,,
1371,,7 atoms,7 atoms,0,,
1454,,8 atoms,8 atoms,0,,
1579,,9 atoms,9 atoms,0,,
1646,,10 atoms,10 atoms,0,,
1723,,11 atoms,11 atoms,0,,


In [56]:
# Keep only neutral species and drop the helper "link" column. Rows without a
# parsed charge (the section headers) fail the comparison and fall out as well.
is_neutral = processed_table["charge"].eq(0)
data_to_store = processed_table.loc[is_neutral].drop(columns="link")
data_to_store

Unnamed: 0,species,name,atoms,cas_no,charge
1,H,Hydrogen atom,1,12385136,0.0
4,D,Deuterium atom,1,16873179,0.0
7,He,Helium atom,1,7440597,0.0
10,Li,Lithium atom,1,7439932,0.0
13,Be,Beryllium atom,1,7440417,0.0
...,...,...,...,...,...
2209,C18H12,Tetracene,30,92240,0.0
2211,C10H22,Decane,32,124185,0.0
2213,C19H16,Triphenylmethane,35,519733,0.0
2215,C24H12,Coronene,36,191071,0.0


In [57]:
# Write the neutral species to CSV; index=False leaves out the row labels
# inherited from raw_table.
data_to_store.to_csv("all_neutrals_cccbdb.csv", index=False)