# Exploration - Scraping NIST Special Publications (SPs)

In [7]:
import time
from urllib.parse import urljoin

import requests
from lxml import html

In [5]:
# Listing page for NIST Special Publications (SPs).
SP_LISTING_URL = "https://csrc.nist.gov/publications/sp"

# Without an explicit timeout, requests will wait indefinitely on a stalled
# connection and the cell never finishes.
response = requests.get(SP_LISTING_URL, timeout=30)
# Raises requests.HTTPError (including URL and status code) on a 4xx/5xx
# answer; unlike a bare assert this is not stripped when Python runs with -O.
response.raise_for_status()

In [37]:
# Build an lxml element tree from the raw bytes of the listing response.
page = html.fromstring(response.content)

# Every <tr> in the body of the publications results table.
results_rows_xpath = "//table[@id='publications-results-table']/tbody/tr"
results_table_items = page.xpath(results_rows_xpath)

print("Number of SPs:", len(results_table_items))

Number of SPs: 293


In [32]:
# Site root that the title links in the results table are resolved against.
BASE_URL = "https://csrc.nist.gov"
# Anchor holding both the publication title (text) and its detail-page href.
TITLE_LINK_XPATH = ".//a[starts-with(@id, 'pub-title-link-')]"


def _first_text(row, query: str) -> str:
    """Return the first XPath match for ``query`` under ``row``, stripped.

    Args:
        row: An lxml element (here: one ``<tr>`` of the results table).
        query: XPath expression selecting text nodes or attribute values.

    Returns:
        The first matched string with surrounding whitespace removed.

    Raises:
        IndexError: If the query matches nothing. This is the same exception
            type that indexing the empty result list would raise, but the
            message names the query that failed.
    """
    matches = row.xpath(query)
    if not matches:
        raise IndexError(f"no match for XPath {query!r} in results row")
    return matches[0].strip()


# Maps "<series> <number>" (e.g. "SP 800-53B") to that publication's metadata.
sps = {}

for result in results_table_items:
    series = _first_text(result, "td[starts-with(@id, 'pub-series-')]/text()")
    number = _first_text(result, "td[starts-with(@id, 'pub-number-')]/text()")
    status = _first_text(result, "td[starts-with(@id, 'pub-status-')]/text()")
    release_date = _first_text(result, "td[starts-with(@id, 'pub-release-date-')]/text()")

    title = _first_text(result, f"{TITLE_LINK_XPATH}/text()")
    href = _first_text(result, f"{TITLE_LINK_XPATH}/@href")

    sps[f"{series} {number}"] = {
        'status': status,
        'release_date': release_date,
        'title': title,
        # urljoin resolves a site-relative href against BASE_URL and leaves an
        # already-absolute href untouched (plain concatenation would mangle it).
        'href': urljoin(BASE_URL, href),
    }

sps["SP 800-53B"]

{'status': 'Final',
 'release_date': '12/10/2020',
 'title': 'Control Baselines for Information Systems and Organizations',
 'href': 'https://csrc.nist.gov/pubs/sp/800/53/b/upd1/final'}

In [35]:
def get_sp_pub_links(href: str, timeout: float = 30.0) -> list[str]:
    """Fetch a publication's detail page and return its local download links.

    Args:
        href: Absolute URL of the publication's detail page.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The ``href`` attribute of every ``<a id="pub-local-download-link">``
        element found on the page; an empty list if there are none.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(href, timeout=timeout)
    # Fail loudly instead of parsing an error page and reporting "no links".
    response.raise_for_status()
    page = html.fromstring(response.content)

    # Note that multiple links will share the same ID :/ (hence a list result).
    return page.xpath("//a[@id='pub-local-download-link']/@href")

# Spot-check the helper on a single known publication before running it over every entry.
get_sp_pub_links(sps["SP 800-53B"]["href"])

['https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53B.pdf']

In [36]:
# Pause between detail-page requests so that one request per entry in `sps`
# is not fired back to back at a government website.
REQUEST_DELAY_SECONDS = 0.5

for entry in sps.values():
    # Entries that already carry links are skipped, so an interrupted or
    # repeated run of this cell only fetches what is still missing.
    if "pub_links" in entry:
        continue
    entry["pub_links"] = get_sp_pub_links(entry["href"])
    time.sleep(REQUEST_DELAY_SECONDS)

sps["SP 800-53B"]

{'status': 'Final',
 'release_date': '12/10/2020',
 'title': 'Control Baselines for Information Systems and Organizations',
 'href': 'https://csrc.nist.gov/pubs/sp/800/53/b/upd1/final',
 'pub_links': ['https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53B.pdf']}