In [1]:
import pandas as pd

# from cluster import embed_items, create_cluster
import requests
from bs4 import BeautifulSoup


def parse_historical_table(html_content, df=None):
    """
    Parses the messy HTML table into a structured list of dictionaries.
    """
    next_root = "https://www.british-history.ac.uk"
    html = requests.get(next_root + html_content).text
    soup = BeautifulSoup(html, "html.parser")
    next_row = soup.find("div", {"id": "block-componentpager"})
    # if next_row:
    #     next = next_row.find(lambda tag: "Next" in tag.text)
    #     if next:
    #         next_html = next.get("href")
    #         print(next_root + next_html)
    rows = soup.find(class_="table-wrap").find("table").find_all(["th", "td"])
    idx = [i for i, element in enumerate(rows) if element.name == "th"]
    citation = soup.find(class_="chicago").text.strip()
    cols = ["date", "text", "citation"]
    lines = []
    for index, i in enumerate(idx):
        row = []
        row.append(rows[i].text)
        if index == idx.index(idx[-1]):
            row.append("\n".join(t.text for t in rows[i + 1 :]))
        else:
            row.append("\n".join(t.text for t in rows[i + 1 : idx[index + 1]]))
        row.append(citation)
        lines.append(row)

    df1 = pd.DataFrame(columns=cols, data=lines)
    df1["text"] = df1["text"].str.replace("\n", "")

    return df1

In [2]:
# from cluster import embed_items
url_num = 4

with open("data_links.txt", "r") as f:
    data_links = f.read().splitlines()

data_links = data_links[:url_num]

dfs = []
for url in data_links:
    df = parse_historical_table(url)
    dfs.append(df)
    print(f"finished {url}")
table = pd.concat(dfs).reset_index(drop=True)
display(table.shape)

finished /cal-state-papers/venice/vol1/pp1-3
finished /cal-state-papers/venice/vol1/pp3-39
finished /cal-state-papers/venice/vol1/pp39-52
finished /cal-state-papers/venice/vol1/pp52-61


(239, 3)

In [3]:
# Run this in your notebook
print(f"Number of texts: {len(table['text'])}")
print(f"Average length: {table['text'].str.split(' ').str.len().mean():.0f} words")
print(f"Total characters: {table['text'].str.split(' ').str.len().sum():,}")

Number of texts: 239
Average length: 106 words
Total characters: 25,452


In [4]:
table.head()

Unnamed: 0,date,text,citation
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape..."
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape..."
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape..."
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape..."
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape..."


In [5]:
def get_year(row: pd.Series) -> float:
    match = pd.Series(row["date"]).str.extract(r"(\d{4})", expand=False)
    year = match.iloc[0]

    if not pd.isna(year):
        return float(year)
    idx = row.name  # row.name contains the index

    if pd.isna(year):
        while idx > 0:
            idx -= 1
            prev_date = table.iloc[idx]
            match = pd.Series(prev_date["date"]).str.extract(r"(\d{4})", expand=False)
            year = match.iloc[0]
            if not pd.isna(year):
                break
    return float(year)


table["year"] = table.apply(get_year, axis=1)
# table["year"] = table["date"].str.extract(r"(\d{4})").astype(float)
# table["year"]
# table.iloc[39:43]
table["year"]

0      1202.0
1      1224.0
2      1265.0
3      1272.0
4      1273.0
        ...  
234    1419.0
235    1419.0
236    1420.0
237    1420.0
238    1420.0
Name: year, Length: 239, dtype: float64

In [None]:
from calendar import month


def get_month(item: str) -> pd.Series[int | bool]:
    # extract three letters that exist in month names
    match = pd.Series([item]).str.extract(r"([A-Za-z]{3})(?:[ ]|[^A-Za-z]|$)")
    month_map = {
        "january": 1,
        "february": 2,
        "march": 3,
        "april": 4,
        "may": 5,
        "june": 6,
        "july": 7,
        "august": 8,
        "september": 9,
        "october": 10,
        "november": 11,
        "december": 12,
    }
    months = month_map.keys()
    if not match.empty and not pd.isna(match.iloc[0, 0]):
        month_abbrev = match.iloc[0, 0].lower()
        for m in months:
            if month_abbrev in m:
                return pd.Series([month_map[m.lower()], True])
        else:
            return pd.Series([1, False])
    else:
        return pd.Series([1, False])


table[["month", "is_month_precise"]] = table["date"].apply(get_month)
table

Unnamed: 0,date,text,citation,year,embeds,cluster,month,"(month, is_month_precise)",is_month_precise
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",18,10,"[10, True]",True
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",13,9,"[9, True]",True
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014525304548442364, 0.025349894538521767, 0...",18,11,"[11, True]",True
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",18,2,"[2, True]",True
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",13,12,"[12, True]",True
...,...,...,...,...,...,...,...,...,...
234,Nov. 16. “Misti Senato.” v. liii. p. 13.,224. Decrees of the Senate concerning Safecond...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.021604035049676895, 0.0330449603497982, 0.0...",18,11,"[11, True]",True
235,Ibid.,In consequence of the mission as ambassador to...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.017431549727916718, 0.018888896331191063, 0...",18,1,"[1, False]",False
236,1420. Jan. 12. “Misti Senato”,225. Decree of the Senate for fitting out four...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[-0.008195324800908566, 0.06116862967610359, 0...",12,1,"[1, True]",True
237,Jan. 12. “Misti Senato” v. liii. p. 22.,226. Appointment of “Ser” Marco Barbo by the S...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[0.0056990971788764, 0.02055922895669937, 0.03...",7,1,"[1, True]",True


In [95]:
def get_day(item: str) -> pd.Series:
    match = pd.Series([item]).str.extract(r"(?:^|\D)(\d{1,2})(?:\D|$)")
    if not match.empty and not pd.isna(match.iloc[0, 0]):
        day = int(match.iloc[0, 0])
        if 1 <= day <= 31:  # validate day range
            return pd.Series([day, True])
    return pd.Series([1, False])


table[["day", "is_day_precise"]] = table["date"].apply(get_day)
table

Unnamed: 0,date,text,citation,year,embeds,cluster,month,"(month, is_month_precise)",is_month_precise,day,is_day_precise
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",18,10,"[10, True]",True,1,False
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",13,9,"[9, True]",True,13,True
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014525304548442364, 0.025349894538521767, 0...",18,11,"[11, True]",True,6,True
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",18,2,"[2, True]",True,15,True
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",13,12,"[12, True]",True,13,True
...,...,...,...,...,...,...,...,...,...,...,...
234,Nov. 16. “Misti Senato.” v. liii. p. 13.,224. Decrees of the Senate concerning Safecond...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.021604035049676895, 0.0330449603497982, 0.0...",18,11,"[11, True]",True,16,True
235,Ibid.,In consequence of the mission as ambassador to...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.017431549727916718, 0.018888896331191063, 0...",18,1,"[1, False]",False,1,False
236,1420. Jan. 12. “Misti Senato”,225. Decree of the Senate for fitting out four...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[-0.008195324800908566, 0.06116862967610359, 0...",12,1,"[1, True]",True,12,True
237,Jan. 12. “Misti Senato” v. liii. p. 22.,226. Appointment of “Ser” Marco Barbo by the S...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[0.0056990971788764, 0.02055922895669937, 0.03...",7,1,"[1, True]",True,12,True


In [96]:
def get_precision(row: pd.Series) -> str:
    if row["is_day_precise"]:
        return "day"
    elif row["is_month_precise"]:
        return "month"
    else:
        return "year"


table["precision"] = table.apply(get_precision, axis=1)

In [97]:
table

Unnamed: 0,date,text,citation,year,embeds,cluster,month,"(month, is_month_precise)",is_month_precise,day,is_day_precise,precision
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",18,10,"[10, True]",True,1,False,month
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",13,9,"[9, True]",True,13,True,day
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014525304548442364, 0.025349894538521767, 0...",18,11,"[11, True]",True,6,True,day
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",18,2,"[2, True]",True,15,True,day
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",13,12,"[12, True]",True,13,True,day
...,...,...,...,...,...,...,...,...,...,...,...,...
234,Nov. 16. “Misti Senato.” v. liii. p. 13.,224. Decrees of the Senate concerning Safecond...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.021604035049676895, 0.0330449603497982, 0.0...",18,11,"[11, True]",True,16,True,day
235,Ibid.,In consequence of the mission as ambassador to...,"'Venice: 1411-1420', in Calendar of State Pape...",1419.0,"[0.017431549727916718, 0.018888896331191063, 0...",18,1,"[1, False]",False,1,False,year
236,1420. Jan. 12. “Misti Senato”,225. Decree of the Senate for fitting out four...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[-0.008195324800908566, 0.06116862967610359, 0...",12,1,"[1, True]",True,12,True,day
237,Jan. 12. “Misti Senato” v. liii. p. 22.,226. Appointment of “Ser” Marco Barbo by the S...,"'Venice: 1411-1420', in Calendar of State Pape...",1420.0,"[0.0056990971788764, 0.02055922895669937, 0.03...",7,1,"[1, True]",True,12,True,day


In [6]:
from cluster import embed_items

table["embeds"] = embed_items(table["text"])
table["embeds"]

0      [-0.050486620515584946, 0.01540993433445692, 0...
1      [0.02098303660750389, 0.03918666020035744, 0.0...
2      [0.014525304548442364, 0.025349894538521767, 0...
3      [0.01985478401184082, 0.003296412993222475, 0....
4      [0.007098940201103687, 0.01995459944009781, 0....
                             ...                        
234    [0.021604035049676895, 0.0330449603497982, 0.0...
235    [0.017431549727916718, 0.018888896331191063, 0...
236    [-0.008195324800908566, 0.06116862967610359, 0...
237    [0.0056990971788764, 0.02055922895669937, 0.03...
238    [0.005254759918898344, 0.032985880970954895, 0...
Name: embeds, Length: 239, dtype: object

In [68]:
import numpy as np
import importlib
import cluster


importlib.reload(cluster)
from cluster import create_cluster, spectral_clusters

In [69]:
%%time

embeds = np.stack(table["embeds"].to_list()).astype(np.float64)
clusters, samples, iters = create_cluster(embeds, num_clusters=20, soft_assign=True, soft_margin=0.1)
# sims, clusters

# spectral_labels = spectral_clusters(embeds, n_clusters=3, random_state=42)
# [len(c) for c in clusters]
clusters,iters

90.0
90.0
90.0
90.0
90.0
90.0
90.0
CPU times: user 793 ms, sys: 14.6 ms, total: 807 ms
Wall time: 70.6 ms


([18,
  13,
  18,
  18,
  13,
  18,
  17,
  18,
  12,
  12,
  12,
  4,
  12,
  12,
  13,
  8,
  12,
  3,
  18,
  12,
  0,
  0,
  3,
  3,
  18,
  13,
  10,
  10,
  10,
  15,
  18,
  12,
  12,
  12,
  13,
  13,
  12,
  2,
  12,
  18,
  19,
  12,
  12,
  10,
  12,
  12,
  12,
  12,
  18,
  10,
  10,
  10,
  10,
  18,
  12,
  3,
  18,
  18,
  19,
  12,
  12,
  18,
  10,
  10,
  12,
  12,
  18,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  1,
  14,
  14,
  9,
  1,
  9,
  9,
  9,
  14,
  14,
  9,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  1,
  14,
  14,
  14,
  14,
  1,
  14,
  14,
  14,
  16,
  16,
  16,
  16,
  19,
  12,
  12,
  12,
  13,
  19,
  18,
  12,
  18,
  12,
  12,
  12,
  19,
  3,
  12,
  11,
  12,
  12,
  11,
  12,
  19,
  12,
  19,
  10,
  12,
  12,
  7,
  12,
  7,
  16,
  10,
  12,
  7,
  12,
  12,
  10,
  16,
  12,
  7,
  3,
  3,
  11,
  13,
  10,
  12,
  11,
  12,
  12,
  7,
  12,
  6,
  12,
  12,
  12,
  6,
  11,
  12,
  12,
  12,
  19,
  18,
  12,
  7,
  12,
  11,
  18,
  1

In [44]:
(1 - 0.2) * 100

80.0

In [70]:
table["cluster"] = clusters

table["cluster"]

0      18
1      13
2      18
3      18
4      13
       ..
234    18
235    18
236    12
237     7
238    11
Name: cluster, Length: 239, dtype: int64

In [71]:
table.head()

Unnamed: 0,date,text,citation,year,embeds,cluster
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",18
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",13
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014525304548442364, 0.025349894538521767, 0...",18
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",18
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",13


In [72]:
cluster_samples = [table.loc[sample, "text"] for sample in samples]
cluster_samples

[20    3. Act that the Venetian ambassador in England...
 21    4. Act that an answer be sat to the Venetian a...
 Name: text, dtype: object,
 92    82. The Same to the Same.Announces the arrival...
 97    87. The Same to the Same.Requests Ludovico Gon...
 78    68. The Same to the nobleman the Captain Jacop...
 74    64. The Same to the Same.Credentials in favour...
 Name: text, dtype: object,
 220    210. Decree of the Senate.That certain bales o...
 223    213. Decree of the Senate.Conceding priority o...
 37     27. Decree of the Senate concerning the Flande...
 Name: text, dtype: object,
 17     13. Motion made in the Grand Council and cance...
 144    134. Motion made in the Senate by the two Sage...
 55     45. Motions made in the Senate concerning an E...
 23     16. Motion made in the Grand Council and repea...
 22     15. Motion made in the Grand Council and repea...
 145    135. Motion made in the Senate by the two Sage...
 118    108. Decree of the Senate.To honour the Earl

In [73]:
table.groupby("cluster").size()

cluster
0      2
1      4
2      3
3      7
4      1
5      3
6      3
7     11
8      1
9      5
10    14
11    14
12    83
13     9
14    25
15     1
16     6
17     1
18    33
19    13
dtype: int64

In [75]:
table.loc[table["cluster"] == 12, "text"].to_list()

['9. The Voyage to Flanders.Currants and masere, and all other things, may be sent by the Flanders galleys, duty free.',
 'Mission to the King of England (Edward III.); notwithstanding which, galleys may fit out for the Flanders voyage, but their captain to be appointed by the Republic.',
 'Instructions for the galleys deputed to that voyage.',
 'A medical surgeon (unum medicum cirurgise) was given to the captain of the galleys; his salary being paid by the State.',
 '[The galleys] not to go to England unless the agreement be stipulated.[Latin.]',
 '12. Negotiations with Flanders.That the ambassador on his way to England be commissioned, should he be in Flanders, to visit the community of Flanders, thank them for justice done at our suit, state the grievances to which our merchants and lieges are subjected, and ask redress.If he should not succeed in obtaining all these things or part of them, to decide whether the galleys should go in the first place to Antwerp, where our subjects wer

In [None]:
# urls = get_urls("https://www.british-history.ac.uk/cal-state-papers/venice/vol35")
# dfs = []
# for url in data_links:
#     df = parse_historical_table(url)
#     dfs.append(df)
#     print(f"finished {url}")
# main_df = pd.concat(dfs)
# display(main_df.shape)

In [40]:
import extract_names
import importlib

importlib.reload(extract_names)
from extract_names import get_db_people, get_db_places, Person

people: list[Person] = get_db_people()
places: list[str] = get_db_places()
people

[{'name': 'Baudier, Michel', 'birthYear': 1589.0, 'deathYear': 1645.0},
 {'name': "Noailles, Gilles de, Abbot of L'Isle",
  'birthYear': None,
  'deathYear': None},
 {'name': 'Maximilien II, Holy Roman Emperor (r. 1564-1619)',
  'birthYear': nan,
  'deathYear': nan},
 {'name': 'Guillard, André, seigneur du Mortier',
  'birthYear': nan,
  'deathYear': nan},
 {'name': 'Rohan, François de', 'birthYear': nan, 'deathYear': nan},
 {'name': 'Halil Pasha', 'birthYear': nan, 'deathYear': nan},
 {'name': 'William Harborne', 'birthYear': 1542.0, 'deathYear': 1617.0},
 {'name': "Cleutin, Henri, seigneur d'Oisel",
  'birthYear': nan,
  'deathYear': nan},
 {'name': 'Calvimont, Jean de', 'birthYear': nan, 'deathYear': nan},
 {'name': 'Castion, Jean-Jacques de', 'birthYear': nan, 'deathYear': nan},
 {'name': 'Monstiers, Jean de', 'birthYear': nan, 'deathYear': nan},
 {'name': 'Crassier, Bernard de', 'birthYear': nan, 'deathYear': nan},
 {'name': 'De Seurre, Michel', 'birthYear': nan, 'deathYear': nan}

In [41]:
places

['Languedoc',
 'Paris',
 'Lyon',
 'France',
 'Swiss Cantons',
 'Grisons',
 'Holy Roman Empire',
 'Venice',
 'Denmark',
 'Ferrara',
 'Geneva',
 'Netherlands',
 'Poland',
 'Portugal',
 'Rome',
 'Savoy',
 'Scotland',
 'Spain',
 'Tuscany',
 'Santa-Fiore',
 'Lorraine',
 'Urbino',
 'Electorate of the Palatine',
 'Brandenbourg',
 'Sweden',
 'Mantua',
 'Wurttemburg',
 'Spanish Netherlands',
 'Fribourg',
 'Hamburg',
 'Strasbourg',
 'County Palatinate',
 'Fontenay-en-Brie',
 'Milan',
 'Lazio',
 'Piedmont',
 'Ragusa',
 'Safavid Empire',
 'Naples',
 'Kingdom of Naples',
 'Alexandria',
 'Jerusalem',
 'Saxe',
 'Berlin',
 'Sicily',
 'Marmande',
 'Crete',
 'Corfu',
 'Morocco',
 'Fez',
 'Algiers',
 'Cairo',
 'Knights of Malta',
 'St. Pierre de Paladru',
 'Marseille',
 'Istanbul',
 'Aleppo',
 'Tunisia',
 'Hungary',
 'Ottoman Empire',
 'England',
 'Morea',
 'Albania',
 'Brittany',
 'Moldavia',
 'Wallachia',
 'London',
 'Rocherster, NY',
 'Arabia',
 'Avignon',
 'Vienna',
 'Egypt',
 'Tripoli',
 'Flore',
 '

In [42]:
from extract_names import (
    get_entities_from_text,
    ResponseSchema,
    update_people_entities,
    update_place_entities,
    config_gemini,
)

client, config = config_gemini()

auth = []
recp = []
auth_loc = []
recp_loc = []
people_ents = []
place_ents = []

for row in table.itertuples():
    row_text = row.text
    row_date = row.date
    row_text = f"Date: {row_date}: {row_text}"
    idx = row.Index
    people = people
    place = places
    try:
        res_json, res_list = get_entities_from_text(
            row_text, people, places, client, config
        )
        people = update_people_entities(people, res_json)
        places = update_place_entities(places, res_json)
        print(f"Processed row {idx}")
        auth.append(res_json.author)
        recp.append(res_json.recipient)
        auth_loc.append(res_json.auth_location)
        recp_loc.append(res_json.recp_location)
        people_ents.append(res_json.people_entities)
        place_ents.append(res_json.place_entities)
    except Exception as e:
        print(f"Error processing row {idx}: {e} \n {e.__traceback__.tb_lineno}")
        break

Processed row 0
Processed row 1
Processed row 2
Processed row 3
Processed row 4
Processed row 5
Processed row 6
Processed row 7
Processed row 8
Processed row 9
Processed row 10
Processed row 11
Processed row 12
Processed row 13
Processed row 14
Processed row 15
Processed row 16
Processed row 17
Processed row 18
Processed row 19
Processed row 20
Processed row 21
Processed row 22
Processed row 23
Processed row 24
Processed row 25
Processed row 26
Processed row 27
Processed row 28
Processed row 29
Processed row 30
Processed row 31
Processed row 32
Processed row 33
Processed row 34
Processed row 35
Processed row 36
Processed row 37
Processed row 38
Processed row 39
Processed row 40
Processed row 41
Processed row 42
Processed row 43
Processed row 44
Processed row 45
Processed row 46
Processed row 47
Processed row 48
Processed row 49
Processed row 50
Processed row 51
Processed row 52
Processed row 53
Processed row 54
Processed row 55
Processed row 56
Processed row 57
Processed row 58
Process

In [44]:
table["author"] = auth
table["recipient"] = recp
table["auth_location"] = auth_loc
table["recp_location"] = recp_loc
table["people_entities"] = people_ents
table["place_entities"] = place_ents
table.head()

Unnamed: 0,date,text,citation,year,embeds,cluster,author,recipient,auth_location,recp_location,people_entities,place_entities
0,1202. Oct.,1. Baldwin Count of Flanders and Hainault.Nota...,"'Venice: 1202-1295', in Calendar of State Pape...",1202.0,"[-0.050486620515584946, 0.01540993433445692, 0...",1,Baldwin Count of Flanders and Hainault,"Markisino Soranzo, Pietro Giuliano, Marino Gra...",Holy Roman Empire,Venice,"[Baldwin Count of Flanders and Hainault, Marki...","[Ligny, Venice, Island of St. Erasmus, Holy Ro..."
1,1224. 13 Sept.,2. Doge Pietro Ziani and his six Privy Counsel...,"'Venice: 1202-1295', in Calendar of State Pape...",1224.0,"[0.02098303660750389, 0.03918666020035744, 0.0...",0,Doge Pietro Ziani,Raymond,Venice,Venice,"[Doge Pietro Ziani, Agnes of Marseilles, Giles...","[England, Venice, Marseilles, Paris]"
2,"1265. Nov. 6. “Fractus.” “Deliberazioni,” Gran...","3. Tariff of Duties on Cloths, Linens, and Fus...","'Venice: 1202-1295', in Calendar of State Pape...",1265.0,"[0.014525304548442364, 0.025349894538521767, 0...",2,Anonymous,Anonymous,,,[],"[England, Milan, Monza]"
3,"1272. Feb. 15. “Comune I.” “Deliberazioni,” Gr...",4. “Cambium.”Concerning merchants who go (qui ...,"'Venice: 1202-1295', in Calendar of State Pape...",1272.0,"[0.01985478401184082, 0.003296412993222475, 0....",2,Republic of Venice,Anonymous,Venice,Venice,[],"[France, Venice]"
4,"1273. Dec. 13. “ Comune I.” “Deliberazioni,” G...",5. “Cambium.”Concerning merchants navigating t...,"'Venice: 1202-1295', in Calendar of State Pape...",1273.0,"[0.007098940201103687, 0.01995459944009781, 0....",2,Republic of Venice,Venetian and Alien Merchants,Venice,Venice,[Marin],"[Provence, Marseilles, Montpellier, Aigues-Mor..."


In [58]:
table.loc[123, ["text", "author", "recipient", "people_entities", "place_entities"]]

text               113. Decree of the Senate.That for the benefit...
author                                          The Senate of Venice
recipient                                            King of England
people_entities    [King of England, Lord Henry of Lancaster, Ear...
place_entities                  [London, England, Lancaster, Venice]
Name: 123, dtype: object

In [57]:
table.loc[123, "text"]

'113. Decree of the Senate.That for the benefit of the galley bound to London and of our merchants, the state do write letters of recommendation to the King of England, the Duke of Lancaster, and others.[Latin, 12 lines.]'

In [59]:
table.to_csv("./test_outcomes/nov-6.csv", index=False)