### Filtering out Names with NER

In [1]:
import spacy
from pprint import pprint
# Load the spaCy pipeline named 'la_core_web_lg' once; later cells reuse `nlp`.
# NOTE(review): presumably the LatinCy Latin model -- confirm it is installed
# in the kernel's environment before running.
nlp = spacy.load(name='la_core_web_lg')

In [25]:
import json
# Read the corpus JSON into memory. Later cells iterate over the loaded value
# and read the "title" and "text" keys of each item.
with open("latin_tragedies_corpus.json", mode="r", encoding="utf-8") as corpus_file:
    full_text_tragedies = json.load(corpus_file)

In [51]:
# Collect named-entity tokens from every tragedy.
#
# Results (both lists are rebuilt from scratch each time this cell runs):
#   entities_info  -- one [title, token text, entity label] row per kept token
#   named_entities -- the lowercased lemma of each kept token, in the same order
#
# Note that the loop works token by token, so an entity that spans several
# tokens contributes one row per token rather than a single row.
relevant_tags = {"PERSON", "LOC", "NORP"}  # entity labels we keep
entities_info = []
named_entities = []
for tragedy in full_text_tragedies:
    # Run the spaCy pipeline over the full text of this tragedy.
    doc = nlp(tragedy["text"])

    for token in doc:
        lemma = token.lemma_.lower()

        # Skip tokens that are not tagged PERSON, LOC, or NORP.
        if token.ent_type_ not in relevant_tags:
            continue
        # Skip tokens containing non-alphabetic characters or with an empty lemma.
        if not token.is_alpha or not lemma:
            continue

        entities_info.append([tragedy["title"], token.text, token.ent_type_])
        named_entities.append(lemma)

In [60]:
from tabulate import tabulate
# Show the first 20 extracted rows as a plain-text table for a quick sanity check.
entities_preview = entities_info[:20]
print(tabulate(entities_preview, headers=['Tragedy', 'Text', "Entity Type"]))

Tragedy     Text       Entity Type
----------  ---------  -------------
Phoenissae  Caeci      PERSON
Phoenissae  Cithaeron  LOC
Phoenissae  Actaeon    LOC
Phoenissae  Zethi      PERSON
Phoenissae  Cithaeron  LOC
Phoenissae  Laius      PERSON
Phoenissae  Labdaci    PERSON
Phoenissae  Thebana    LOC
Phoenissae  Argolicas  LOC
Phoenissae  Iuppiter   PERSON
Phoenissae  Phoebea    LOC
Phoenissae  Hesperus   PERSON
Phoenissae  Oedipodae  PERSON
Phoenissae  Ismenos    NORP
Phoenissae  Sphinx     PERSON
Phoenissae  Assyrio    LOC
Phoenissae  Cadmi      PERSON
Phoenissae  Dirce      PERSON
Phoenissae  Eurotan    PERSON
Phoenissae  Spartam    NORP


In [50]:
import pickle
# Persist the extracted [title, token text, entity label] rows to a .pkl file
# so they can be reloaded without repeating the NER pass.
with open("entities_info.pkl", mode="wb") as pkl_file:
    pickle.dump(entities_info, pkl_file)

In [58]:
# Reload the rows stored in entities_info.pkl.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that this notebook itself produced.
with open("entities_info.pkl", mode="rb") as pkl_file:
    entities_info = pickle.load(pkl_file)

In [59]:
# Keep only the rows whose first column (the tragedy title) is "Ecerinis".
ecerinis_entities = [entry for entry in entities_info if entry[0] == "Ecerinis"]

# Print the first 20 rows (adjust the slice to see more).
# NOTE(review): the captured output for this cell lists ordinary words such as
# "Dormire" and "Audire" as entities -- presumably capitalised non-names being
# mis-tagged; verify before relying on these labels for this text.
ecerinis_preview = ecerinis_entities[:20]
print(tabulate(ecerinis_preview, headers=['Tragedy', 'Text', "Entity Type"]))

Tragedy    Text      Entity Type
---------  --------  -------------
Ecerinis   Arcthoo   PERSON
Ecerinis   Gnati     PERSON
Ecerinis   Genui     PERSON
Ecerinis   Infausta  PERSON
Ecerinis   Latere    PERSON
Ecerinis   Devota    PERSON
Ecerinis   Antiqua   LOC
Ecerinis   Romanum   NORP
Ecerinis   Aetas     NORP
Ecerinis   Monachus  PERSON
Ecerinis   Ecerinus  PERSON
Ecerinis   Dormire   PERSON
Ecerinis   Supina    LOC
Ecerinis   Audire    NORP
Ecerinis   Imago     PERSON
Ecerinis   Frigore   PERSON
Ecerinis   Exangue   PERSON
Ecerinis   Albrice   LOC
Ecerinis   Recolo    LOC
Ecerinis   Natalis   PERSON


In [52]:
# Peek at the first 20 collected lemmas (duplicates are still present here).
named_entities_preview = named_entities[:20]
print(named_entities_preview)

['caecus', 'cithaeron', 'actaeon', 'zethus', 'cithaeron', 'laius', 'labdacus', 'thebanus', 'argolicus', 'iuppiter', 'phoebeus', 'hesperus', 'oedipoda', 'ismenus', 'sphinx', 'assyrius', 'cadmus', 'dirce', 'eurotas', 'sparta']


In [53]:
# Deduplicate the named entities while keeping first-appearance order:
# dict keys are unique and remember insertion order, so building a dict from
# the list and reading its keys back drops later repeats only.
unique_named_entities = list(dict.fromkeys(named_entities))
# Set of every lemma encountered, kept under its original name.
seen = set(unique_named_entities)

In [54]:
# Peek at the first 20 lemmas after deduplication.
unique_entities_preview = unique_named_entities[:20]
print(unique_entities_preview)

['caecus', 'cithaeron', 'actaeon', 'zethus', 'laius', 'labdacus', 'thebanus', 'argolicus', 'iuppiter', 'phoebeus', 'hesperus', 'oedipoda', 'ismenus', 'sphinx', 'assyrius', 'cadmus', 'dirce', 'eurotas', 'sparta', 'elis']


In [55]:
# Save the deduplicated lemma list to a .pkl file for later use.
with open("unique_named_entities.pkl", mode="wb") as pkl_file:
    pickle.dump(unique_named_entities, pkl_file)