In [None]:
import re
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd

# import display
from IPython.display import display, HTML

# Site root, and the paginated index of the "calendar-state-papers-venice"
# series. The page number is appended directly to `main_page`.
root_url = "https://www.british-history.ac.uk"
main_page = root_url + "/series/calendar-state-papers-venice?page="

# Number of index pages to crawl (requested as page=0 .. page=total_pages - 1).
total_pages = 4


def get_volumes_on_page(url):
    """
    Get the link targets of every volume listed on one series index page.

    Parameters
    ----------
    url : str
        Full URL of the index page to download.

    Returns
    -------
    list of str
        The ``href`` value of each ``<a>`` element whose text contains
        "Volume", in document order and exactly as written in the page.
        Anchors that have no ``href`` attribute are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Bound the wait so a stalled connection cannot hang the notebook.
    req = requests.get(url, timeout=30)
    # Fail loudly instead of scraping an error page and returning nothing.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    links = soup.find_all(lambda tag: tag.name == "a" and "Volume" in tag.text)
    # An <a> without href yields None, which would break string
    # concatenation with the site root in the caller.
    hrefs = (link.get("href") for link in links)
    return [href for href in hrefs if href is not None]


# Walk every index page and collect the absolute URL of each volume found.
volume_urls = []
for page_number in range(total_pages):
    relative_links = get_volumes_on_page(f"{main_page}{page_number}")
    # hrefs come back as written in the page; prefix the site root.
    volume_urls += [root_url + link for link in relative_links]

display(volume_urls)

In [None]:
def get_urls(main_url):
    """
    Collect the "Venice:" link targets from the first table of a page.

    Parameters
    ----------
    main_url : str
        Full URL of the page to download.

    Returns
    -------
    list of str
        The ``href`` of every element inside the page's first ``<table>``
        whose text contains "Venice:" and that carries an ``href``
        attribute, exactly as written in the page. Returns an empty list
        when the page has no ``<table>``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        # Nothing to scan; avoids an AttributeError on `None.find_all`.
        return []
    # The text filter also matches enclosing rows/cells, not only anchors;
    # those have no href and are dropped by the None check below.
    interested_rows = table.find_all(lambda tag: "Venice:" in tag.text)
    urls = [row.get("href") for row in interested_rows]
    return [url for url in urls if url is not None]


data_links = []
for url in volume_urls:
    data_links.extend(get_urls(url))

display(data_links)

In [None]:
def parse_historical_table(html_content, df=None):
    """
    Download one page and parse its table into a structured DataFrame.

    The table is read as a flat sequence of ``<th>``/``<td>`` cells. Each
    ``<th>`` starts a new entry: the ``<th>`` text is the date, the cell
    right after it is the title, and all following cells up to the next
    ``<th>`` (or the end of the table) form the entry text.

    Parameters
    ----------
    html_content : str
        Site-relative path of the page to fetch. Despite the name this is a
        URL path, not HTML; it is appended to the site root before the
        download.
    df : optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``title`` and ``text``, one row per ``<th>`` cell,
        with newlines removed from ``title`` and ``text``. An empty frame
        with the same columns is returned when the page has no table inside
        an element of class ``table-wrap``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    base_url = "https://www.british-history.ac.uk"
    cols = ["date", "title", "text"]

    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(base_url + html_content, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    wrapper = soup.find(class_="table-wrap")
    table = wrapper.find("table") if wrapper is not None else None
    if table is None:
        # No table on this page: return an empty, correctly-shaped frame
        # instead of failing with AttributeError on None.
        return pd.DataFrame(columns=cols)

    cells = table.find_all(["th", "td"])
    header_positions = [i for i, cell in enumerate(cells) if cell.name == "th"]
    # Each entry spans from its <th> up to (not including) the next <th>;
    # the last entry runs to the end of the table.
    boundaries = header_positions[1:] + [len(cells)]

    lines = []
    for start, end in zip(header_positions, boundaries):
        date = cells[start].text
        # A <th> that is the very last cell has no title cell after it.
        title = cells[start + 1].text if start + 1 < len(cells) else ""
        body = "\n".join(cell.text for cell in cells[start + 2 : end])
        lines.append([date, title, body])

    df1 = pd.DataFrame(columns=cols, data=lines)
    # NOTE(review): the newline used as the join separator above is removed
    # here too, so cell texts end up concatenated with no separator between
    # them - confirm that is intended.
    df1["title"] = df1["title"].str.replace("\n", "")
    df1["text"] = df1["text"].str.replace("\n", "")

    return df1

In [None]:
def find_important_words(text, keywords=None):
    """
    Finds content that is of significance to me.

    Performs a case-insensitive substring search of ``text`` for each
    keyword and reports which keywords occur.

    Parameters
    ----------
    text : str
        The text to search. ``None`` or an empty string yields no matches.
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the built-in list of names and
        terms below.

    Returns
    -------
    list of str
        The keywords found in ``text``, spelled as given in the keyword
        list and in keyword-list order. Because matching is by substring,
        "Turk" also matches e.g. "Turkish".
    """
    if keywords is None:
        keywords = [
            "France",
            "Ottoman Empire",
            "Turk",
            "Osman",
            "Sultan",
            "french",
            "lewis",
            "louis",
            "de medici",
            "ambassador",
        ]

    if not text:
        return []

    lowered = text.lower()
    # Skip empty keywords: "" is a substring of every string and would
    # flag every text as a match.
    return [word for word in keywords if word and word.lower() in lowered]

In [None]:
# Single-volume sample of section links. It is not used by the loop below,
# which walks `data_links` collected from every volume.
urls = get_urls("https://www.british-history.ac.uk/cal-state-papers/venice/vol35")

dfs = []

for url in data_links:
    df = parse_historical_table(url)
    dfs.append(df)
    print(f"finished {url}")

if dfs:
    # ignore_index gives the combined frame one continuous 0..n-1 index;
    # otherwise every page contributes its own 0..k labels and the index
    # contains duplicates.
    main_df = pd.concat(dfs, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the columns produced by parse_historical_table.
    main_df = pd.DataFrame(columns=["date", "title", "text"])

display(main_df.shape)