In [1]:
import spacy
from pprint import pprint
import json
# Load the spaCy pipeline named 'la_core_web_lg'. The resulting `nlp` object is
# used below both to process the text and to reach the 'tagger' component.
nlp = spacy.load('la_core_web_lg')

In [2]:
# Parse the whole corpus JSON file in one go; the handle is closed when the
# `with` block exits.
with open("latin_tragedies_corpus.json", "r", encoding="utf-8") as corpus_file:
    full_text_tragedies = json.load(corpus_file)

In [3]:
# Run the pipeline over the "text" field of corpus entry 10
# (the Ecerinis text, per the original note -- confirm against the corpus order).
ecerinis_entry = full_text_tragedies[10]
doc = nlp(ecerinis_entry["text"])

In [12]:
from tabulate import tabulate

# One row per token: surface text, coarse POS label, and the tagger's tag.
data = [
    [tok.text, tok.pos_, tok.tag_]
    for tok in doc
]

# Only the first 20 rows are printed to keep the preview short.
print(tabulate(data[:20], headers=['Text', "POS", "TAG"]))


Text       POS    TAG
---------  -----  -----------
Quadnam    ADV    adjective
cruentum   ADJ    adjective
sidus      NOUN   noun
Arcthoo    PROPN  proper_noun
potens     ADJ    adjective
Regnavit   VERB   verb
orbe       NOUN   noun
,          PUNCT  punc
pestilens  ADJ    adjective
tantum     ADV    adverb
michi      PRON   pronoun
,          PUNCT  punc
Gnati      PROPN  proper_noun
,          PUNCT  punc
nefando    ADJ    adjective
flebiles   ADJ    adjective
cum        SCONJ  conjunction
vos        PRON   pronoun
thoro      NOUN   noun
Genui      PROPN  proper_noun


In [6]:
# Helper function to get tagging scores

def get_tagging_scores(doc, n=3):
    """Return the `n` highest-scoring tagger labels for every token in `doc`.

    Uses the 'tagger' component of the module-level `nlp` pipeline.
    cf. https://stackoverflow.com/a/69228515

    Args:
        doc: The processed document whose tokens should be scored. Tokens are
            looked up in the score matrix by their `.i` index.
        n: How many of the best-scoring labels to keep per token (default 3).

    Returns:
        A list with one entry per token, in document order. Each entry is a
        list of up to `n` `(label, score)` tuples sorted by descending score.
        Scores are the values returned by `tagger.model.predict`; they are not
        normalised here.
    """
    # An empty document has nothing to score; skip the model call entirely.
    if len(doc) == 0:
        return []

    tagger = nlp.get_pipe('tagger')
    labels = tagger.labels

    # Run the model ONCE for the whole document. The result does not depend on
    # the token being inspected, so calling it inside the loop would repeat a
    # full forward pass for every token.
    doc_scores = tagger.model.predict([doc])[0]

    scores = []
    for token in doc:
        # Pair each score with its label index, best score first.
        ranked = sorted(
            enumerate(doc_scores[token.i]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        scores.append([(labels[i], p) for i, p in ranked[:n]])
    return scores

In [7]:
# Score every token in Ecerinis (top 3 tags each), then display the ranking
# for the first token only -- the `break` stops after a single table.

tagging_probs = get_tagging_scores(doc)

for token in doc:
    print(f'Token: {token.text}', end='\n\n')
    # Convert the (label, score) tuples into table rows.
    data = [[label, prob] for label, prob in tagging_probs[token.i]]
    print(tabulate(data, headers=['Label', 'Score']))
    break

Token: Quadnam

Label        Score
---------  -------
adjective  4.32699
adverb     3.364
particle   1.56564


In [10]:
# Show the top 3 tags by score for the token at a chosen index in Ecerinis
tagging_probs = get_tagging_scores(doc)
token = doc[3]  # index 3 is "Arcthoo"

# The per-token entries are already (label, score) pairs, so they can be
# tabulated directly.
data = tagging_probs[token.i]
print(f'Token: {token.text}', end='\n\n')
print(tabulate(data, headers=['Label', 'Score']))

Token: Arcthoo

Label           Score
-----------  --------
proper_noun  12.079
adjective     6.933
noun          1.95643


In [None]:
# Find the top 3 tags by score for the first instance of a specific word in Ecerinis
token_text = "Gnati"  # token we're searching for

# First token whose text matches exactly (case-sensitive); None when absent
token = next((t for t in doc if t.text == token_text), None)

# Compare against None explicitly: a spaCy Token's truthiness comes from its
# length, not from whether the search found a match.
if token is not None:
    tagging_probs = get_tagging_scores(doc)
    print(f'Token: {token.text}\n')
    data = tagging_probs[token.i]
    print(tabulate(data, headers=['Label', 'Score']))
else:
    print(f"Token '{token_text}' not found in the document.")

Token: Gnati

Label          Score
-----------  -------
proper_noun  9.57062
noun         4.85036
verb         4.69239
