In [1]:
import re
import os

# Read the file (excerpt of De Bello Gallico downloaded from the Latin Library)
with open('dbg.txt') as f:
    lines = f.readlines()
    text = ' '.join(lines)

# Find all words
words = re.findall(r'\b[A-Za-z]+\b', text)
print(f'Number of words: {len(words)}')
print(f'First 10 words: {words[:10]}')

Number of words: 830
First 10 words: ['C', 'IVLI', 'CAESARIS', 'COMMENTARIORVM', 'DE', 'BELLO', 'GALLICO', 'LIBER', 'PRIMVS', 'Gallia']


In [2]:
import pexpect
from tqdm import tqdm
import numpy as np

beginning_of_line = ['\r\n=>', '\r=>', '\n=>']

# Creates a dictionary of words and their meanings by querying whitaker's words
def get_word_outputs(words, progress=False):
    current_dir = os.getcwd()
    os.chdir("whitakers-words")
    child = pexpect.spawn('bin/meanings')

    results = {}

    child.expect(beginning_of_line)

    it = words if not progress else tqdm(words)
    for word in it:
        # Send the word
        child.sendline(word)
        # Wait for the prompt again, capturing everything in between
        try:
            child.expect(beginning_of_line, timeout=0.2)
            # Save everything (except the first and last line, which are the prompts)
            results[word] = child.before.decode().strip().split('\n')[1:-1]
        except:
            continue

    os.chdir(current_dir)
    return results


outputs = get_word_outputs(words, progress=True)

100%|██████████| 830/830 [00:44<00:00, 18.61it/s]


In [3]:
# See 10 random examples of words
import random

for w in random.sample(list(outputs), 10):
    print(w, outputs[w])

vi ['vis, vis  N (3rd) F   [XXXAX]  \r', 'strength (sg. only), force, power, might, violence;\r\r']
consciverit ['conscio, conscire, conscivi, -  V (4th)   [XXXEO]    uncommon\r', 'feel guilty; be conscious of (wrong); have on conscience; know well (late);\r\r', 'conscisco, consciscere, conscivi, conscitus  V (3rd) TRANS   [XXXCO]  \r']
quorum [' [XXXAO]  \r', 'who; that; which, what; of which kind/degree; person/thing/time/point that;\r\r', 'who/what/which?, what/which one/man/person/thing? what kind/type of?;\r\r', 'who/whatever, everyone who, all that, anything that;\r\r', 'any; anyone/anything, any such; unspecified some; (after si/sin/sive/ne);\r\r']
paratos ['paro, parare, paravi, paratus  V (1st) TRANS   [XXXAO]  \r', 'prepare; furnish/supply/provide; produce; obtain/get; buy; raise; put up; plan;\r', 'paratus, parata -um, paratior -or -us, paratissimus -a -um  ADJ   [XXXDX]    lesser\r']
mortem ['mors, mortis  N (3rd) F   [XXXAX]  \r']
quibus [' [XXXAO]  \r', 'who; that; which,

In [8]:
def clean_output(l):
    l_stripped = list(map(lambda x: x.strip(), l))
    l_concatenated = "\n".join(l_stripped)
    return l_concatenated

values = [clean_output(outputs[k]) for k in outputs.keys()]

# Get unique elements and their counts using numpy's unique and return_counts functions
unique_elements, counts = np.unique(values, return_counts=True)

# Sort the unique elements by their counts in descending order
sorted_indices = np.argsort(-counts)
sorted_unique_elements = unique_elements[sorted_indices]


In [9]:
# Top 10 most common words in the corpus
sorted_unique_elements[:10], -np.sort(-counts)[:10]

(array(['', 'civitas, civitatis  N (3rd) F   [XLXAO]',
        'sum, esse, fui, futurus  V   [XXXAX]',
        'cogo, cogere, coegi, coactus  V (3rd) TRANS   [XXXAO]',
        'mons, montis  N (3rd) M   [XXXAX]',
        'omnis, omnis, omne  ADJ   [XXXAC]\neach, every, every one (of a number); all (pl.); all/the whole of;\nomne, omnis  N (3rd) N   [XXXCC]\nall things (pl.); everything; a/the whole, entity, unit;\nomnis, omnis  N (3rd) C   [XXXBC]',
        'Rhodanus, Rhodani  N (2nd) M   [XXFDX]    lesser',
        'persuadeo, persuadere, persuasi, persuasus  V (2nd)   [XXXBX]',
        'obtineo, obtinere, obtinui, obtentus  V (2nd)   [XXXAO]',
        '[XXXAO]\nwho; that; which, what; of which kind/degree; person/thing/time/point that;\nwho/what/which?, what/which one/man/person/thing? what kind/type of?;\nwho/whatever, everyone who, all that, anything that;\nany; anyone/anything, any such; unspecified some; (after si/sin/sive/ne);'],
       dtype='<U1074'),
 array([11,  5,  5,  4,  4