### Tokenize and Lemmatize Acts

In [20]:
# Using spacy.load().
# Load the "la_core_web_lg" pipeline once; the resulting `nlp` object is what the
# later cells call on Latin text. spacy.load() needs that model package to be
# installed in the kernel's environment.
import spacy
nlp = spacy.load("la_core_web_lg")

In [21]:
import json

# Read the stopword definitions: a JSON object mapping a category name either to
# a flat list of words or to a nested {lemma: [forms, ...]} dictionary.
with open("stopwords_latin.json", "r", encoding="utf-8") as stopword_file:
    stopwords_json = json.load(stopword_file)

# Flatten every category into one lowercase set for fast membership tests.
latin_stopwords = set()

for entries in stopwords_json.values():
    if isinstance(entries, list):
        # Flat category: every item is itself a stopword.
        latin_stopwords |= {entry.lower() for entry in entries}
    elif isinstance(entries, dict):
        # Nested category (e.g., "Pronouns"): keep the lemma and all of its forms.
        for headword, inflected in entries.items():
            latin_stopwords.add(headword.lower())
            latin_stopwords |= {form.lower() for form in inflected}
    # Categories of any other type are ignored.

In [33]:
# Lemmatize "ablatus" to show what the pipeline returns without the custom lookup.
# NOTE(review): the recorded output of this cell is "aufero", not "ablatus" —
# presumably it was last executed after the custom_lookup_lemmatizer component
# had already been added to `nlp`. Re-run on a fresh kernel to confirm.
token = nlp("ablatus")[0]
print(token.lemma_) #returns incorrect lemma (ablatus) before custom lemma lookup

aufero


In [34]:
#create custom lemma lookup for 'inquit' and 'ablatus'
from spacy import Language
from spacy.lookups import Lookups  # not used by this cell; import kept in case other cells rely on it

# Correct lemma -> surface forms that must be forced to that lemma.
custom_lemma_lookups = {
    "inquam": ["inquit"],
    "aufero": ["ablatus"]
}

# Inverted view (surface form -> lemma) so the component can do a direct lookup.
# No membership guard is needed here: the previous `value not in lookups` test
# ran against an empty Lookups() object and therefore never filtered anything.
custom_lookups = {
    form: lemma
    for lemma, forms in custom_lemma_lookups.items()
    for form in forms
}

@Language.component(name="custom_lookup_lemmatizer")
def make_lookup_lemmatizer_function(doc):
    """Override the lemma of every token whose text is a key of ``custom_lookups``.

    Args:
        doc: the spaCy Doc handed along the pipeline.

    Returns:
        The same ``doc``; tokens with a custom entry get that lemma, all other
        tokens keep the lemma they already had.
    """
    for token in doc:
        # Exact, case-sensitive match on the surface form ("Inquit" does not match "inquit").
        token.lemma_ = custom_lookups.get(token.text, token.lemma_)
    return doc

# Add the component only if it is not in the pipeline yet, so re-running this
# cell is harmless while genuine add_pipe errors still surface (a bare
# `except: pass` would silently hide them).
if "custom_lookup_lemmatizer" not in nlp.pipe_names:
    nlp.add_pipe("custom_lookup_lemmatizer", name="custom_lookup_lemmatizer")

In [35]:
# Lemmatize "ablatus" again; with the custom_lookup_lemmatizer component in the
# pipeline the lemma is expected to be overridden.
token = nlp("ablatus")[0]
print(token.lemma_) #returns "aufero" after custom lemma lookup

aufero


In [36]:
def process_latin_text(text):
    """Return the lowercased lemmas of the content words in ``text``.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only when all of the following hold:

    * ``token.is_alpha`` is true,
    * its lowercased lemma is not in ``latin_stopwords``,
    * its ``pos_`` tag is one of PROPN, NOUN, VERB or ADJ.

    Args:
        text: the raw string to analyse.

    Returns:
        A list of lowercased lemma strings, in document order.
    """
    content_pos = ["PROPN", "NOUN", "VERB", "ADJ"]
    lemmas = []
    for tok in nlp(text):
        # Drop anything that is not purely alphabetic according to spaCy.
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        # Stopwords are matched on the lowercased lemma, not the surface form.
        if lemma in latin_stopwords:
            continue
        if tok.pos_ in content_pos:
            lemmas.append(lemma)
    return lemmas

In [37]:
# Load the acts
with open("latin_tragedies_acts.json", "r", encoding="utf-8") as acts_file:
    acts = json.load(acts_file)

# Metadata fields copied unchanged from each act record, in output order.
metadata_keys = ("id", "play_slug", "title", "author", "act")

# Lemmatize each act's text and keep the result next to the act's metadata.
processed_acts = []
for act_record in acts:
    lemmas = process_latin_text(act_record["text"])
    entry = {key: act_record[key] for key in metadata_keys}
    entry["tokens"] = lemmas
    processed_acts.append(entry)

# `processed_acts` is now a list of dictionaries shaped like:
# {"id": ..., "play_slug": "...", "title": "...", "author": "...", "act": ..., "tokens": [...]}

In [3]:
import json

# Read the acts file and report how many act records it contains.
# NOTE(review): this re-reads the same file as the "Load the acts" cell, and its
# execution count ([3]) is out of sequence with the neighbouring cells —
# presumably a leftover from an earlier session.
with open("latin_tragedies_acts.json", "r", encoding="utf-8") as f:
    acts = json.load(f)

total_acts = len(acts)
print(f"Total acts across all plays: {total_acts}")

Total acts across all plays: 56


In [38]:
# Write the processed acts to processed_acts.json. ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes; indent=2 pretty-prints.
with open("processed_acts.json", "w", encoding="utf-8") as f:
    json.dump(processed_acts, f, ensure_ascii=False, indent=2)

In [39]:
import pickle

# Save the processed corpus to a .pkl file so it can be reloaded from Python
# without re-running the pipeline (the same data is also written to processed_acts.json).
with open("processed_acts.pkl", "wb") as f:
    pickle.dump(processed_acts, f)


In [40]:
# Load the processed acts from the .pkl file
# NOTE: unpickling can execute arbitrary code — only load pickle files that you
# created yourself or otherwise trust.
with open("processed_acts.pkl", "rb") as f:
    processed_acts = pickle.load(f)

# access tokens directly
# Sanity check on the first record: its "tokens" entry should come back as a list.
tokens_1 = processed_acts[0]["tokens"]
print(type(tokens_1))  # Should be <class 'list'>

<class 'list'>


In [41]:
# Number of tokens kept for the first record in processed_acts.
len(tokens_1)

389

In [42]:
# Title stored on the first processed record.
tragedy_act_title = processed_acts[0]['title']
print(tragedy_act_title)

Agamemnon


In [None]:
# Act value stored on the first processed record.
# NOTE(review): the recorded output is a NameError for `processed_acts`, so this
# cell was presumably run in a kernel where the cells that define it had not
# been executed. Run the notebook top to bottom before this cell.
tragedy_act_number = processed_acts[0]['act']
print(tragedy_act_number)

NameError: name 'processed_acts' is not defined