In [3]:
import re
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd

# import display
from IPython.display import display, HTML

# Site root and the paginated index of the Venice calendar series.
# A page number is appended to main_page to address one listing page.
root_url = "https://www.british-history.ac.uk"
main_page = root_url + "/series/calendar-state-papers-venice?page="

# How many listing pages to request.
total_pages = 4


def get_volumes_on_page(url):
    """
    Get the urls for each volume linked from one series listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with the ``href`` value of every ``<a>`` element whose text
        contains "Volume", in document order. The values are returned exactly
        as written in the page markup. Anchors without an ``href`` attribute
        are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps one stalled connection from hanging the whole notebook.
    req = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page and returning nothing.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    links = soup.find_all(lambda tag: tag.name == "a" and "Volume" in tag.text)
    hrefs = (link.get("href") for link in links)
    # Callers concatenate these onto a root URL, so a None entry would crash.
    return [href for href in hrefs if href is not None]


# Visit every listing page and turn the hrefs found there into absolute
# addresses by prefixing the site root.
volume_urls = []
for page_number in range(total_pages):
    page_hrefs = get_volumes_on_page(main_page + str(page_number))
    volume_urls += [f"{root_url}{href}" for href in page_hrefs]

display(volume_urls)

['https://www.british-history.ac.uk/cal-state-papers/venice/vol1',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol2',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol3',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol4',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol5',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol6',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol7',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol8',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol9',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol10',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol11',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol12',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol13',
 'https://www.british-history.ac.uk/cal-state-papers/venice/vol14',
 'https://www.british-history.ac.uk/cal-state-papers/veni

In [4]:
def get_urls(main_url):
    """
    Collect the "Venice:" section links from a volume page.

    Only the first ``<table>`` on the page is searched. Every element inside
    it whose text contains "Venice:" is considered (rows and cells match as
    well as anchors), and only those carrying an ``href`` attribute
    contribute to the result.

    Args:
        main_url: Address of the volume page to download.

    Returns:
        A list of ``href`` values exactly as written in the page markup, in
        document order. The list is empty when the page has no table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        # No table on the page means there is nothing to collect.
        return []
    interested_rows = table.find_all(lambda tag: "Venice:" in tag.text)
    urls = [row.get("href") for row in interested_rows]
    return [url for url in urls if url is not None]


# Flatten the section links of every volume into one list, keeping the
# order in which the volumes were discovered.
data_links = [
    section_link
    for volume_page in volume_urls
    for section_link in get_urls(volume_page)
]

display(data_links)

['/cal-state-papers/venice/vol1/pp1-3',
 '/cal-state-papers/venice/vol1/pp3-39',
 '/cal-state-papers/venice/vol1/pp39-52',
 '/cal-state-papers/venice/vol1/pp52-61',
 '/cal-state-papers/venice/vol1/pp61-63',
 '/cal-state-papers/venice/vol1/pp63-65',
 '/cal-state-papers/venice/vol1/pp65-74',
 '/cal-state-papers/venice/vol1/pp74-92',
 '/cal-state-papers/venice/vol1/pp92-126',
 '/cal-state-papers/venice/vol1/pp126-140',
 '/cal-state-papers/venice/vol1/pp141-159',
 '/cal-state-papers/venice/vol1/pp159-203',
 '/cal-state-papers/venice/vol1/pp203-226',
 '/cal-state-papers/venice/vol1/pp226-252',
 '/cal-state-papers/venice/vol1/pp252-266',
 '/cal-state-papers/venice/vol1/pp267-276',
 '/cal-state-papers/venice/vol1/pp276-286',
 '/cal-state-papers/venice/vol1/pp286-289',
 '/cal-state-papers/venice/vol1/pp289-291',
 '/cal-state-papers/venice/vol1/pp291-295',
 '/cal-state-papers/venice/vol1/pp295-298',
 '/cal-state-papers/venice/vol1/pp298-300',
 '/cal-state-papers/venice/vol1/pp300-310',
 '/cal-s

In [20]:
def parse_historical_table(html_content, df=None):
    """
    Downloads one section page and parses its messy HTML table into a
    DataFrame.

    The table is read as a flat sequence of ``<th>`` and ``<td>`` cells.
    Every ``<th>`` starts a new entry and supplies its "date" value; the text
    of all cells up to the next ``<th>`` (or the end of the table) is
    concatenated into that entry's "text". Cells that appear before the first
    ``<th>`` are not part of any entry and are dropped.

    Args:
        html_content: Site-relative path of the section page, for example
            "/cal-state-papers/venice/vol1/pp1-3". It is appended to the site
            root before the request is made.
        df: Unused. Kept so existing calls that pass it keep working.

    Returns:
        A DataFrame with the columns "date", "text" and "citation", one row
        per ``<th>`` cell. All newline characters are removed from "text".
        "citation" is the stripped text of the element with class "chicago",
        repeated on every row, or an empty string when the page has no such
        element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no table inside a "table-wrap" element.
    """
    next_root = "https://www.british-history.ac.uk"
    page_url = next_root + html_content
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    wrapper = soup.find(class_="table-wrap")
    table = wrapper.find("table") if wrapper is not None else None
    if table is None:
        # Name the offending page instead of failing with an AttributeError.
        raise ValueError(f"no data table found at {page_url}")

    rows = table.find_all(["th", "td"])
    idx = [i for i, element in enumerate(rows) if element.name == "th"]

    citation_tag = soup.find(class_="chicago")
    citation = citation_tag.text.strip() if citation_tag is not None else ""

    cols = ["date", "text", "citation"]
    lines = []
    # Pair each header position with the position of the next header; the
    # last entry runs to the end of the cell list.
    for start, stop in zip(idx, idx[1:] + [len(rows)]):
        body = "".join(cell.text for cell in rows[start + 1 : stop])
        lines.append([rows[start].text, body.replace("\n", ""), citation])

    return pd.DataFrame(columns=cols, data=lines)

In [None]:
# parse_historical_table("/cal-state-papers/venice/vol1/pp1-3")

Unnamed: 0,date,text,citation
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape..."
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape..."
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape..."
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape..."
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape..."
5,1274. Oct. 9. “Comune II.” “Deliberazioni.” Gr...,6. Sterling Silver.Overseers of money gross (m...,"'Venice: 1202-1295', in Calendar of State Pape..."
6,"1287. Sept. 16. “Deliberazioni,” Grand Council.",7. Saffron.[Page 3]Act passed by the Grand Cou...,"'Venice: 1202-1295', in Calendar of State Pape..."
7,"1295. July 17. “Deliberazioni,” Grand Council....",8. Sterling Money.Whereas the Duke and Council...,"'Venice: 1202-1295', in Calendar of State Pape..."


In [None]:
def find_important_words(text):
    """
    Finds content that is of significance to me.

    Looks for each keyword of interest inside ``text``, ignoring case and
    matching substrings (so "Turk" also matches "Turkish").

    Args:
        text: The string to search. Anything that is not a string (for
            example a missing value) is treated as containing no keywords.

    Returns:
        A list of the keywords that occur in ``text``, spelled and ordered as
        in the keyword list below. The list is empty when nothing matches, so
        the result can be used directly as a truth value.
    """

    # The empty string is left out on purpose: it is a substring of every
    # text and would make every entry look significant.
    word_list = [
        "France",
        "Ottoman Empire",
        "Turk",
        "Osman",
        "Sultan",
        "french",
        "lewis",
        "louis",
        "de medici",
        "ambassador",
    ]

    if not isinstance(text, str):
        return []

    haystack = text.casefold()
    return [word for word in word_list if word.casefold() in haystack]

In [None]:
# NOTE(review): `urls` is not used by the loop below (which walks
# `data_links`); it looks like a spot check of a single volume. Confirm
# whether this extra request is still needed.
urls = get_urls("https://www.british-history.ac.uk/cal-state-papers/venice/vol35")

# display(urls)

dfs = []

# One request per section page; the print shows progress through the scrape.
for url in data_links:
    df = parse_historical_table(url)
    dfs.append(df)
    print(f"finished {url}")

# Every per-page frame is numbered from 0, so without ignore_index the
# combined frame would carry the same index labels many times over.
main_df = pd.concat(dfs, ignore_index=True)

display(main_df.shape)