In [21]:
import pandas as pd

# from cluster import embed_items, create_cluster
import requests
from bs4 import BeautifulSoup


def parse_historical_table(html_content, df=None):
    """
    Parses the messy HTML table into a structured list of dictionaries.
    """
    next_root = "https://www.british-history.ac.uk"
    html = requests.get(next_root + html_content).text
    soup = BeautifulSoup(html, "html.parser")
    next_row = soup.find("div", {"id": "block-componentpager"})
    # if next_row:
    #     next = next_row.find(lambda tag: "Next" in tag.text)
    #     if next:
    #         next_html = next.get("href")
    #         print(next_root + next_html)
    rows = soup.find(class_="table-wrap").find("table").find_all(["th", "td"])
    idx = [i for i, element in enumerate(rows) if element.name == "th"]
    citation = soup.find(class_="chicago").text.strip()
    cols = ["date", "text", "citation"]
    lines = []
    for index, i in enumerate(idx):
        row = []
        row.append(rows[i].text)
        if index == idx.index(idx[-1]):
            row.append("\n".join(t.text for t in rows[i + 1 :]))
        else:
            row.append("\n".join(t.text for t in rows[i + 1 : idx[index + 1]]))
        row.append(citation)
        lines.append(row)

    df1 = pd.DataFrame(columns=cols, data=lines)
    df1["text"] = df1["text"].str.replace("\n", "")

    return df1

In [22]:
# from cluster import embed_items
url_num = 4

with open("data_links.txt", "r") as f:
    data_links = f.read().splitlines()

data_links = data_links[:url_num]

dfs = []
for url in data_links:
    df = parse_historical_table(url)
    dfs.append(df)
    print(f"finished {url}")
table = pd.concat(dfs).reset_index(drop=True)
display(table.shape)

finished /cal-state-papers/venice/vol1/pp1-3
finished /cal-state-papers/venice/vol1/pp3-39
finished /cal-state-papers/venice/vol1/pp39-52
finished /cal-state-papers/venice/vol1/pp52-61


(239, 3)

In [23]:
# Run this in your notebook
print(f"Number of texts: {len(table['text'])}")
print(f"Average length: {table['text'].str.split(' ').str.len().mean():.0f} words")
print(f"Total characters: {table['text'].str.split(' ').str.len().sum():,}")

Number of texts: 239
Average length: 106 words
Total characters: 25,452


In [24]:
table.head()

Unnamed: 0,date,text,citation
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape..."
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape..."
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape..."
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape..."
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape..."


In [25]:
def get_year(row: pd.Series) -> float:
    match = pd.Series(row["date"]).str.extract(r"(\d{4})", expand=False)
    year = match.iloc[0]

    if not pd.isna(year):
        return float(year)
    idx = row.name  # row.name contains the index

    if pd.isna(year):
        while idx > 0:
            idx -= 1
            prev_date = table.iloc[idx]
            match = pd.Series(prev_date["date"]).str.extract(r"(\d{4})", expand=False)
            year = match.iloc[0]
            if not pd.isna(year):
                break
    return float(year)


table["year"] = table.apply(get_year, axis=1)
# table["year"] = table["date"].str.extract(r"(\d{4})").astype(float)
# table["year"]
# table.iloc[39:43]
table["year"]

0      1202.0
1      1224.0
2      1265.0
3      1272.0
4      1273.0
        ...  
234    1419.0
235    1419.0
236    1420.0
237    1420.0
238    1420.0
Name: year, Length: 239, dtype: float64

In [26]:
table["embeds"] = embed_items(table["text"])
table["embeds"]

0      [-0.050438251346349716, 0.015423907898366451, ...
1      [0.02090444043278694, 0.03915192559361458, 0.0...
2      [0.014525304548442364, 0.025349894538521767, 0...
3      [0.01985478401184082, 0.003296412993222475, 0....
4      [0.007098940201103687, 0.01995459944009781, 0....
                             ...                        
234    [0.021589407697319984, 0.03307503089308739, 0....
235    [0.01747804880142212, 0.01886497065424919, 0.0...
236    [-0.00818114634603262, 0.06117068976163864, 0....
237    [0.005689109209924936, 0.02060399204492569, 0....
238    [0.005258034914731979, 0.03298444673418999, 0....
Name: embeds, Length: 239, dtype: object

In [28]:
import importlib
import cluster

importlib.reload(cluster)
from cluster import create_cluster, spectral_clusters

In [30]:
# embeds = table["embeds"].to_numpy()
embeds = list(table["embeds"])
sims, clusters = create_cluster(embeds, num_clusters=3)
sims, clusters

# spectral_labels = spectral_clusters(embeds, n_clusters=3, random_state=42)
[len(c) for c in clusters]

[90, 64, 198]

In [20]:
# urls = get_urls("https://www.british-history.ac.uk/cal-state-papers/venice/vol35")
# dfs = []
# for url in data_links:
#     df = parse_historical_table(url)
#     dfs.append(df)
#     print(f"finished {url}")
# main_df = pd.concat(dfs)
# display(main_df.shape)