In [14]:
from cltk import NLP
from cltk.languages.utils import get_lang

#from unidecode import unidecode

from pathlib import Path

import pandas as pd

In [15]:
# Initialize the Latin pipeline.
# Creates the CLTK `NLP` object for language code "lat"; later cells reuse it
# through `nlp.analyze(...)`. Instantiation prints the citation banner shown in
# this cell's output.
nlp = NLP(language="lat")

𐤀 CLTK version '1.5.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.

⸖ ``LatinStanzaProcess`` using Stanza model from the Stanford NLP Group: https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/
⸖ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [16]:
# Location of the revised vocabulary CSV, relative to the working directory.
fn_ap = Path('..', 'Files', 'ap-latin-draft-course-framework-vocab-list-revised.csv')

# Read the full vocabulary list into a DataFrame.
df_ap_vocab = pd.read_csv(fn_ap)

In [17]:
# Prompt for the text to analyze. `input()` blocks until a line is typed, so an
# unattended "Run All" stops at this cell waiting for the user.
text_input = input("Enter the Text: ")

In [18]:
# Run the pipeline over the text the user entered.
doc = nlp.analyze(text=text_input)

# Walk the analyzed words once, collecting the surface form, the POS value and
# the lemma of each word into three parallel lists.
list_words, list_pos, list_lemmas = [], [], []
for token in doc.words:
    list_words.append(token.string)
    list_pos.append(token.pos)
    list_lemmas.append(token.lemma)

In [19]:
# One row per analyzed word; columns appear in the order Word, POS, Lemma.
df_input = pd.DataFrame(dict(Word=list_words, POS=list_pos, Lemma=list_lemmas))

In [20]:
# Plain-text dump of the vocabulary frame to check which columns were loaded
# (pandas abbreviates the middle rows with "..").
print(df_ap_vocab)

                                  Vocabulary Part of Speech  \
0                                 a, ab, abs    preposition   
1                     abeo, -ire, -ii, -itum           verb   
2                        absum, abesse, afui           verb   
3     accedo (adc-), -ere, -cessi, \n-cessum           verb   
4    accendo (adc-), -ere, -cendi, \n-censum           verb   
..                                       ...            ...   
995                           votum, -i (n.)           noun   
996                          vox, vocis (f.)           noun   
997   vulnero (volnero), -are, -avi, \n-atum           verb   
998              vulnus (volnus), -eris (n.)           noun   
999                vultus (voltus), -us (m.)           noun   

                                            Definition  Suggested Reading  \
0              (with abl.) from, away from, out of, by                1.1   
1    to go from, go away, go off, go forth, go, \nd...                1.1   
2           

In [21]:
# Left join: keep every input word and attach the vocabulary row whose
# "Base Word" equals the word's lemma (vocabulary columns stay NaN otherwise).
df_merged = df_input.merge(
    df_ap_vocab,
    how='left',
    left_on='Lemma',
    right_on='Base Word',
)

In [22]:
# Keep only the rows where the join found a vocabulary entry.
has_vocab_entry = df_merged["Base Word"].notna()
df_matches = df_merged[has_vocab_entry]

In [23]:
# Display the rows whose lemma matched a vocabulary entry.
df_matches

Unnamed: 0,Word,POS,Lemma,Vocabulary,Part of Speech,Definition,Suggested Reading,Base Word
0,Arma,8,arma,"arma, -orum (n. pl.)",noun,"implements, outfit, instruments, tools; \nimpl...",1.3,arma
4,qui,12,qui,"qui, quae, quod",pronoun/adjective,"who, which, that",1.1,qui
5,primus,1,primus,"primus, -a, -um",adjective,"the first, first",1.3,primus
8,sumus,4,sum,"sum, esse, fui",verb,to be,1.1,sum
10,maximux,1,magnus,"magnus, -a, -um",adjective,"great, large",1.1,magnus


In [24]:
# Narrow the matches to the three columns that go into the exported list.
df_output = df_matches.loc[:, ["Word", "Vocabulary", "Definition"]]

In [25]:
# Write the matched vocabulary to ../Files/Vocab_list.csv (row index included).
df_output.to_csv(Path("..", "Files", "Vocab_list.csv"))

In [26]:
# Print each matched entry as "<vocabulary>: <definition>", one per row.
for vocab_entry, definition in zip(df_output["Vocabulary"], df_output["Definition"]):
    print(vocab_entry + ": " + definition)

arma, -orum (n. pl.): implements, outfit, instruments, tools; 
implements of war, arms, weapons
qui, quae, quod: who, which, that
primus, -a, -um: the first, first
sum, esse, fui: to be
magnus, -a, -um: great, large
