In [3]:
import pandas as pd

# from cluster import embed_items, create_cluster
import requests
from bs4 import BeautifulSoup


def parse_historical_table(html_content, df=None):
    """
    Parses the messy HTML table into a structured list of dictionaries.
    """
    next_root = "https://www.british-history.ac.uk"
    html = requests.get(next_root + html_content).text
    soup = BeautifulSoup(html, "html.parser")
    next_row = soup.find("div", {"id": "block-componentpager"})
    # if next_row:
    #     next = next_row.find(lambda tag: "Next" in tag.text)
    #     if next:
    #         next_html = next.get("href")
    #         print(next_root + next_html)
    rows = soup.find(class_="table-wrap").find("table").find_all(["th", "td"])
    idx = [i for i, element in enumerate(rows) if element.name == "th"]
    citation = soup.find(class_="chicago").text.strip()
    cols = ["date", "text", "citation"]
    lines = []
    for index, i in enumerate(idx):
        row = []
        row.append(rows[i].text)
        if index == idx.index(idx[-1]):
            row.append("\n".join(t.text for t in rows[i + 1 :]))
        else:
            row.append("\n".join(t.text for t in rows[i + 1 : idx[index + 1]]))
        row.append(citation)
        lines.append(row)

    df1 = pd.DataFrame(columns=cols, data=lines)
    df1["text"] = df1["text"].str.replace("\n", "")

    return df1

In [4]:
# from cluster import embed_items
url_num = 4

with open("data_links.txt", "r") as f:
    data_links = f.read().splitlines()

data_links = data_links[:url_num]

dfs = []
for url in data_links:
    df = parse_historical_table(url)
    dfs.append(df)
    print(f"finished {url}")
table = pd.concat(dfs).reset_index(drop=True)
display(table.shape)

finished /cal-state-papers/venice/vol1/pp1-3
finished /cal-state-papers/venice/vol1/pp3-39
finished /cal-state-papers/venice/vol1/pp39-52
finished /cal-state-papers/venice/vol1/pp52-61


(239, 3)

In [5]:
# Run this in your notebook
print(f"Number of texts: {len(table['text'])}")
print(f"Average length: {table['text'].str.split(' ').str.len().mean():.0f} words")
print(f"Total characters: {table['text'].str.split(' ').str.len().sum():,}")

Number of texts: 239
Average length: 106 words
Total characters: 25,452


In [6]:
table.head()

Unnamed: 0,date,text,citation
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape..."
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape..."
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape..."
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape..."
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape..."


In [7]:
def get_year(row: pd.Series) -> float:
    match = pd.Series(row["date"]).str.extract(r"(\d{4})", expand=False)
    year = match.iloc[0]

    if not pd.isna(year):
        return float(year)
    idx = row.name  # row.name contains the index

    if pd.isna(year):
        while idx > 0:
            idx -= 1
            prev_date = table.iloc[idx]
            match = pd.Series(prev_date["date"]).str.extract(r"(\d{4})", expand=False)
            year = match.iloc[0]
            if not pd.isna(year):
                break
    return float(year)


table["year"] = table.apply(get_year, axis=1)
# table["year"] = table["date"].str.extract(r"(\d{4})").astype(float)
# table["year"]
# table.iloc[39:43]
table["year"]

0      1202.0
1      1224.0
2      1265.0
3      1272.0
4      1273.0
        ...  
234    1419.0
235    1419.0
236    1420.0
237    1420.0
238    1420.0
Name: year, Length: 239, dtype: float64

In [8]:
from cluster import embed_items

table["embeds"] = embed_items(table["text"])
table["embeds"]

0      [-0.050486620515584946, 0.01540993433445692, 0...
1      [0.02098303660750389, 0.03918666020035744, 0.0...
2      [0.014511803165078163, 0.02529653161764145, 0....
3      [0.01985478401184082, 0.003296412993222475, 0....
4      [0.007098940201103687, 0.01995459944009781, 0....
                             ...                        
234    [0.021591104567050934, 0.033047910779714584, 0...
235    [0.017445694655179977, 0.01891718991100788, 0....
236    [-0.008166288956999779, 0.06116767227649689, 0...
237    [0.005722603760659695, 0.020581543445587158, 0...
238    [0.005258034914731979, 0.03298444673418999, 0....
Name: embeds, Length: 239, dtype: object

In [35]:
import numpy as np
import importlib
import cluster


importlib.reload(cluster)
from cluster import create_cluster, spectral_clusters

In [36]:
%%time

embeds = np.stack(table["embeds"].to_list()).astype(np.float64)
clusters, samples = create_cluster(embeds, num_clusters=3)
# sims, clusters

# spectral_labels = spectral_clusters(embeds, n_clusters=3, random_state=42)
# [len(c) for c in clusters]
clusters

CPU times: user 204 ms, sys: 13.8 ms, total: 217 ms
Wall time: 151 ms


[1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2]

In [37]:
table["cluster"] = clusters

table["cluster"]

0      1
1      0
2      2
3      2
4      2
      ..
234    2
235    0
236    2
237    2
238    2
Name: cluster, Length: 239, dtype: int64

In [38]:
table.head()

Unnamed: 0,date,text,citation,year,embeds,cluster
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",1
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",0
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014511803165078163, 0.02529653161764145, 0....",2
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",2
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",2


In [40]:
cluster_samples = [table.loc[sample, "text"] for sample in samples]
cluster_samples

[148    138. Letters Patent from Henry IV.Inspeximus a...
 52     42. Edward III. to Doge Andrea Contarini.Ackno...
 226    216. Decree of the Senate.That a letter be wri...
 140    130. Letters Patent for Venetian Subjects, the...
 51     41. Edward III. to Doge Andrea Contarini.Ackno...
 53     43. Reply of the Senate to the Demands made by...
 63     53. Letters patent from Edward III. to all adm...
 27     3. Similar release from Johan de Isle de Wyght...
 182    172. Commission to the nobleman “Ser” Antonio ...
 50     40. Letters patent from Edward III., acquainti...
 Name: text, dtype: object,
 69    59. Sir John Hawkwood to Lodovico Gonzaga, Lor...
 72    62. Sir John Hawkwood to Lodovico Gonzaga.In r...
 83    73. Sir John Hawkwood to the Lord Ludovico Gon...
 71    61. Sir John Hawkwood, Jacopo de Cavalli, and ...
 91    81. Sir John Hawkwood to Lodovico de Gonzaga, ...
 68    58. Sir John Hawkwood to Lodovico Gonzaga, Lor...
 85    75. Sir John Hawkwood to Lodovico de Gonzag

In [None]:
# urls = get_urls("https://www.british-history.ac.uk/cal-state-papers/venice/vol35")
# dfs = []
# for url in data_links:
#     df = parse_historical_table(url)
#     dfs.append(df)
#     print(f"finished {url}")
# main_df = pd.concat(dfs)
# display(main_df.shape)