# DataLoader

Working prototype of the data loader.  Will be converted to standard Python

In [22]:
from dataclasses import dataclass
from typing import Tuple, List, Optional, Generator
import textwrap
import re
import string
from glob import glob
import os.path
from collections import Counter, defaultdict

import pandas as pd

### Convert Packard markup to the Greek Alphabet

In [8]:
def make_markup_to_greek_converter(print_key=False):
    greek_lower = {}
    greek_upper = {}
    
    greek_lower['A'] = '\u03b1'
    greek_upper['A'] = '\u0391'
    
    greek_lower['B'] = '\u03b2'
    greek_upper['B'] = '\u0392'
    
    greek_lower['G'] = '\u03b3'
    greek_upper['G'] = '\u0393'
    
    greek_lower['D'] = '\u03b4'
    greek_upper['D'] = '\u0394'
    
    greek_lower['E'] = '\u03b5'
    greek_upper['E'] = '\u0395'
    
    greek_lower['V'] = '\u03dd'
    greek_upper['V'] = '\u03dc'
    
    greek_lower['Z'] = '\u03b6'
    greek_upper['Z'] = '\u0396'
    
    greek_lower['H'] = '\u03b7'
    greek_upper['H'] = '\u0397'
    
    greek_lower['Q'] = '\u03b8'
    greek_upper['Q'] = '\u0398'
    
    greek_lower['I'] = '\u03b9'
    greek_upper['I'] = '\u0399'
    
    greek_lower['K'] = '\u03ba'
    greek_upper['K'] = '\u039a'
    
    greek_lower['L'] = '\u03bb'
    greek_upper['L'] = '\u039b'
    
    greek_lower['M'] = '\u03bc'
    greek_upper['M'] = '\u039c'
    
    greek_lower['N'] = '\u03bd'
    greek_upper['N'] = '\u039d'
    
    greek_lower['C'] = '\u03be'
    greek_upper['C'] = '\u039e'
    
    greek_lower['O'] = '\u03bf'
    greek_upper['O'] = '\u039f'
    
    greek_lower['P'] = '\u03c0'
    greek_upper['P'] = '\u03a0'
    
    greek_lower['3'] = '\u03d9'
    greek_upper['3'] = '\u03d8'
    
    greek_lower['R'] = '\u03c1'
    greek_upper['R'] = '\u03a1'
    
    greek_lower['S'] = '\u03c3'
    greek_upper['S'] = '\u03a3'
    
    greek_lower['J'] = '\u03c2'
    greek_upper['J'] = '\u03a3'  
    
    greek_lower['T'] = '\u03c4'
    greek_upper['T'] = '\u03a4'
    
    greek_lower['U'] = '\u03c5'
    greek_upper['U'] = '\u03a5'
    
    greek_lower['F'] = '\u03c6'
    greek_upper['F'] = '\u03a6'
    
    greek_lower['X'] = '\u03c7'
    greek_upper['X'] = '\u03a7'
    
    greek_lower['Y'] = '\u03c8'
    greek_upper['Y'] = '\u03a8'
    
    greek_lower['W'] = '\u03c9'
    greek_upper['W'] = '\u03a9'
    
    greek_lower['5'] = '\u03e1'
    greek_upper['5'] = '\u03e0'
    
    greek_lower[')'] = '\u0313'
    greek_upper[')'] = '\u0313'
    
    greek_lower['('] = '\u0314'
    greek_upper['('] = '\u0314'
    
    greek_lower['|'] = '\u0345'
    greek_upper['|'] = '\u0345'
    
    greek_lower['/'] = '\u0301'
    greek_upper['/'] = '\u0301'
    
    greek_lower['\\'] = '\u0300'
    greek_upper['\\'] = '\u0300'
    
    greek_lower['='] = '\u0342'
    greek_upper['='] = '\u0342'
    
    greek_lower['+'] = '\u0308'
    greek_upper['+'] = '\u0308'
    
    greek_lower[':'] = '\u00B7'
    greek_upper[':'] = '\u00B7'
    
    greek_lower["'"] = '\u0384'
    greek_upper["'"] = '\u0384'

    greek_upper = {ord(k): ord(v) for k,v in greek_upper.items()}
    greek_lower = {ord(k): ord(v) for k,v in greek_lower.items()}
    if print_key:
        print([chr(k) for k,v in greek_lower.items()])
        print([chr(v) for k,v in greek_lower.items()])
        print([chr(k) for k,v in greek_lower.items()][-9:])
        print(['a'+chr(v) for k,v in greek_lower.items()][-9:])



    def converter(english_str: str) -> str:
        greek_str = ''
        greek_ = greek_lower
        for i,c in enumerate(english_str):
            if c == '*':
                greek_ = greek_upper
                continue
            if c == 'S' and ((i == len(english_str) -1) or english_str[i+1] == ' '):
                c = 'J'
            greek_str += c.translate(greek_)
            greek_ = greek_lower
        return greek_str
    
    return converter


greek_converter = make_markup_to_greek_converter()


['A', 'B', 'G', 'D', 'E', 'V', 'Z', 'H', 'Q', 'I', 'K', 'L', 'M', 'N', 'C', 'O', 'P', '3', 'R', 'S', 'J', 'T', 'U', 'F', 'X', 'Y', 'W', '5', ')', '(', '|', '/', '\\', '=', '+', ':', "'"]
['α', 'β', 'γ', 'δ', 'ε', 'ϝ', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ϙ', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ϡ', '̓', '̔', 'ͅ', '́', '̀', '͂', '̈', '·', '΄']
[')', '(', '|', '/', '\\', '=', '+', ':', "'"]
['a̓', 'a̔', 'aͅ', 'á', 'à', 'a͂', 'ä', 'a·', 'a΄']


In [9]:
print(greek_converter(r'E)N A)RXH=| E)POI/HSEN O( QEO\S TO\N '))
print(greek_converter(r'a/ a\ a= a) a( a)/ a)\ a)= a(= a| a)/| a)='))

ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν 
á à a͂ a̓ a̔ a̓́ a̓̀ a̓͂ a̔͂ aͅ a̓́ͅ a̓͂


### Greek Morphology

This first part loads the data into structures that are easier to process.  

In [4]:
test_file = '../../data/catss/greek_morph/01.Gen.1.mlxx'

In [54]:
BOOK_CHAPTER_VERSE_PATTERN = re.compile('^(\S+)\s+(\S*):(\S+)\s*$')
BOOK_CHAPTER_PATTERN = re.compile('^(\S+)\s+(\S*)$')
BOOK_PATTERN = re.compile('^(\S+)\s*$')
TOKEN_PATTERN = re.compile('(.{25})(.{4})(.{7})(\S{1,17})\s*(\S*)')
BOOKS = {
            '1Sam/K': {'english_name': 'I Samuel', 
                'alternate_english_name': 'Kings I',
                'greek_name': 'Βασιλειῶν Α' + '\u0374'},
            'Gen': {'english_name': 'Genesis', 
                'alternate_english_name': None,
                'greek_name': 'Γένεσις'},
            'DanTh': {'english_name': 'Daniel Th (Theodotion)', 
                'alternate_english_name': None,
                'greek_name': 'Δανιήλ'},
            '2Esdr': {'english_name': '2 Esdras', 
                'alternate_english_name': 'Ezra-Nehemiah',
                'greek_name': 'Ἔσδρας Β' + '\u0374'},
            '2/4Kgs': {'english_name': 'Kings IV', 
                'alternate_english_name': '2 Kings',
                'greek_name': 'Βασιλειῶν Δ' + '\u0374'},
            'Ezek': {'english_name': 'Ezekiel', 
                'alternate_english_name': None,
                'greek_name': 'Ἰεζεκιήλ'},
            'Jer': {'english_name': 'Jeremiah', 
                'alternate_english_name': None,
                'greek_name': 'Ἱερεμίας'},
            'EpJer': {'english_name': 'Letter of Jeremiah', 
                'alternate_english_name': None,
                'greek_name': 'Ἐπιστολὴ Ἰερεμίου'},
            'BelTh': {'english_name': 'Bel and the Dragon Th (Theodotion)', 
                'alternate_english_name': 'Daniel 14',
                'greek_name': 'Βὴλ καὶ Δράκων'},
            'Gen': {'english_name': 'Genesis', 
                'alternate_english_name': None,
                'greek_name': 'Γένεσις'},
            'PsSol': {'english_name': 'Psalms of Solomon', 
                'alternate_english_name': None,
                'greek_name': 'Ψαλμοί Σαλoμῶντος'},
            'Ps': {'english_name': 'Psalms', 
                'alternate_english_name': None,
                'greek_name': 'Ψαλμοί'},
            'Mal': {'english_name': 'Malachi', 
                'alternate_english_name': None,
                'greek_name': 'Μαλαχίας ΙΒ' + '\u0374'},
            '1Mac': {'english_name': 'Maccabees I', 
                'alternate_english_name': None,
                'greek_name': 'Μακκαβαίων Α' + '\u0374'},
            'Isa': {'english_name': 'Isaiah', 
                'alternate_english_name': None,
                'greek_name': 'Ἠσαΐας'},
            'Ezek': {'english_name': 'Ezekiel', 
                'alternate_english_name': None,
                'greek_name': 'Ἰεζεκιήλ'},
            '4Mac': {'english_name': '4 Maccabees', 
                'alternate_english_name': None,
                'greek_name': 'Μακκαβαίων Δ' + '\u0374' + ' Παράρτημα'},
            'Zeph': {'english_name': 'Zephaniah', 
                'alternate_english_name': None,
                'greek_name': 'Σοφονίας Θ' + '\u0374'},
            'Sir': {'english_name': 'Wisdom ofSirach', 
                'alternate_english_name': 'Ecclesiasticus',
                'greek_name': 'Σοφία Ἰησοῦ Σειράχ'},
            'Joel': {'english_name': 'Joel', 
                'alternate_english_name': None,
                'greek_name': 'Ἰωήλ Δ' + '\u0374'},
            'Wis': {'english_name': 'Wisdom of Solomon', 
                'alternate_english_name': 'Wisdom',
                'greek_name': 'Σοφία Σαλoμῶντος'},
            'JoshA': {'english_name': 'Joshua A (Codex Alexandrinus)', 
                'alternate_english_name': None,
                'greek_name': 'Ἰησοῦς Ναυῆ'},
            '1Esdr': {'english_name': '1 Esdras', 
                'alternate_english_name': None,
                'greek_name': 'Ἔσδρας Α' + '\u0374'},
            'Prov': {'english_name': 'Proverbs', 
                'alternate_english_name': None,
                'greek_name': 'Παροιμίαι'},
            'Jdt': {'english_name': 'Judith', 
                'alternate_english_name': None,
                'greek_name': 'Ἰουδίθ'},
            'Dan': {'english_name': 'Daniel OG', 
                'alternate_english_name': 'Daniel with Additions',
                'greek_name': 'Δανιήλ'},
            'TobBA': {'english_name': 'Tobit BA (Codices Vaticanus and Alexandrinus)', 
                'alternate_english_name': None,
                'greek_name': 'Τωβίτ'},
            'Lam': {'english_name': 'Lamentations', 
                'alternate_english_name': None,
                'greek_name': 'Θρῆνοι'},
            'Amos': {'english_name': 'Amos', 
                'alternate_english_name': None,
                'greek_name': 'Ἀμώς Β' + '\u0374'},
            'Hab': {'english_name': 'Habakkuk', 
                'alternate_english_name': None,
                'greek_name': 'Ἀμβακούμ Η' + '\u0374'},
            'Nah': {'english_name': 'Nahum', 
                'alternate_english_name': None,
                'greek_name': 'Ναούμ Ζ' + '\u0374'},
            'Deut': {'english_name': 'Deuteronomy', 
                'alternate_english_name': None,
                'greek_name': 'Δευτερονόμιον'},
            'JoshB': {'english_name': 'Joshua B (Codex Vaticanus)', 
                'alternate_english_name': None,
                'greek_name': 'Ἰησοῦς Ναυῆ'},
            'Job': {'english_name': 'Job', 
                'alternate_english_name': None,
                'greek_name': 'Ἰώβ'},
            'Obad': {'english_name': 'Obadiah', 
                'alternate_english_name': None,
                'greek_name': 'Ὀβδιού Ε' + '\u0374'},
            'TobS': {'english_name': 'Tobit S (Codex Sinaiticus)', 
                'alternate_english_name': None,
                'greek_name': 'Τωβίτ'},
            'Exod': {'english_name': 'Exodus', 
                'alternate_english_name': None,
                'greek_name': 'Ἔξοδος'},
            'Hos': {'english_name': 'Hosea', 
                'alternate_english_name': None,
                'greek_name': 'Ὡσηέ Α' + '\u0374'},
            'Od': {'english_name': 'Odes', 
                'alternate_english_name': None,
                'greek_name': 'Ωδαί'},
            'Mic': {'english_name': 'Micah', 
                'alternate_english_name': None,
                'greek_name': 'Μιχαίας Γ' + '\u0374'},
            '1/3Kgs': {'english_name': 'Kings III', 
                'alternate_english_name': '1 Kings',
                'greek_name': 'Βασιλειῶν Γ' + '\u0374'},
            'Jonah': {'english_name': 'Jonah', 
                'alternate_english_name': None,
                'greek_name': 'Ἰωνᾶς Ϛ' + '\u0374'},
            'JudgA': {'english_name': 'Judges A (Codex Alexandrinus)', 
                'alternate_english_name': None,
                'greek_name': 'Κριταί'},
            'Ps': {'english_name': 'Psalms', 
                'alternate_english_name': None,
                'greek_name': 'Ψαλμοί'},
            '3Mac': {'english_name': 'Maccabees III', 
                'alternate_english_name': None,
                'greek_name': 'Μακκαβαίων Γ' + '\u0374'},
            'Sus': {'english_name': 'Susanna OG (Old Greek)', 
                'alternate_english_name': 'Daniel 13',
                'greek_name': 'Σουσάννα'},
            '2Mac': {'english_name': 'Maccabees II', 
                'alternate_english_name': None,
                'greek_name': 'Μακκαβαίων Β' + '\u0374'},
            'Zech': {'english_name': 'Zachariah', 
                'alternate_english_name': None,
                'greek_name': 'Ζαχαρίας'},
            'Lev': {'english_name': 'Leviticus', 
                'alternate_english_name': None,
                'greek_name': 'Λευϊτικόν'},
            'Ruth': {'english_name': 'Ruth', 
                'alternate_english_name': None,
                'greek_name': 'Ῥούθ'},
            'Esth': {'english_name': 'Esther', 
                'alternate_english_name': None,
                'greek_name': 'Ἐσθήρ'},
            'JudgB': {'english_name': 'Judges B (Codex Vaticanus)', 
                'alternate_english_name': None,
                'greek_name': 'Κριταί'},
            'Jer': {'english_name': 'Jeremiah', 
                'alternate_english_name': None,
                'greek_name': 'Ἱερεμίας'},
            'Num': {'english_name': 'Numbers', 
                'alternate_english_name': None,
                'greek_name': 'Ἀριθμοί'},
            '2Sam/K': {'english_name': 'Kings II', 
                'alternate_english_name': 'II Samuel',
                'greek_name': 'Βασιλειῶν Β' + '\u0374'},
            'Bar': {'english_name': 'Baruch', 
                'alternate_english_name': None,
                'greek_name': 'Βαρούχ'},
            'Hag': {'english_name': 'Haggai', 
                'alternate_english_name': None,
                'greek_name': 'Ἀγγαῖος Ι' + '\u0374'},
            '2Chr': {'english_name': 'Chronicles II', 
                'alternate_english_name': None,
                'greek_name': 'Παραλειπομένων Β' + '\u0374'},
            'Qoh': {'english_name': 'Ecclesiastes', 
                'alternate_english_name': 'Qoheleth',
                'greek_name': 'Ἐκκλησιαστὴς'},
            'Cant': {'english_name': 'Song of Songs', 
                'alternate_english_name': 'Song of Solomon',
                'greek_name': 'Ἆσμα Ἀσμάτων'},
            '1Chr': {'english_name': 'Chronicles I', 
                'alternate_english_name': None,
                'greek_name': 'Παραλειπομένων Α' + '\u0374'},
            'Bel': {'english_name': 'Bel and the Dragon OG (Old Greek) ', 
                'alternate_english_name': 'Daniel 14',
                'greek_name': 'Βὴλ καὶ Δράκων'},
            'SusTh': {'english_name': 'Susanna Th (Theodotion)', 
                'alternate_english_name': 'Daniel 13',
                'greek_name': 'Σουσάννα'}

}

@dataclass
class Meta():
    name: str
    language: str
    source: str    
    filename: str
    raw: str
    encoding: str
    offsets: Tuple[int, int]
    description: str
    license: str
    
    def __repr__(self):
        repr = f'Meta(name={self.name}, language={self.language}, source={self.source}, filename={self.filename}, raw={textwrap.shorten(self.raw, width=20)}, encoding={self.encoding},'
        repr += f'offsets={self.offsets}, description={self.description}, license={self.license})'
        return repr

@dataclass
class Token():
    lemma_transliteration: str
    type_code: str
    parse_code: str
    dictionary_form_transliteration: str
    word_number: int = 0
    lemma_original: Optional[str] = None
    dictionary_form_original: Optional[str] = None
    other_transliteration: Optional[str] = None
    other_original: Optional[str] = None
    meta: Optional[Meta] = None
        
    def __post_init__(self):
        if self.lemma_transliteration:
            self.lemma_original = greek_converter(self.lemma_transliteration)
        if self.dictionary_form_transliteration:
            self.dictionary_form_original = greek_converter(self.dictionary_form_transliteration)
        if self.other_transliteration:
            self.other_original = greek_converter(self.other_transliteration)
    
@dataclass 
class Sentence():
    tokens: List[Token]
    sentence_number: int = 0
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        if transliterated:
            print(' '.join([t.lemma_transliteration for t in self.tokens]))
        if original:
            print(' '.join([t.lemma_original for t in self.tokens]))

    
@dataclass
class Verse():
    sentences: List[Sentence]
    verse_number: str
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        for sentence in self.sentences:
            sentence.print_all(transliterated=transliterated, original=original)
    
    
@dataclass
class Chapter():
    verses: List[Verse]
    chapter_number: str
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        for verse in self.verses:
            print('Verse:', verse.verse_number)
            verse.print_all(transliterated=transliterated, original=original)
    
    
@dataclass      
class Book():
    chapters: List[Chapter]
    book_name: str
    english_name: Optional[str] = None
    alternate_english_name: Optional[str] = None
    greek_name: Optional[str] = None
    meta: Optional[Meta] = None
    
    def __post_init__(self):
        if book := BOOKS.get(self.book_name):
            self.english_name = book['english_name']
            self.alternate_english_name = book['alternate_english_name']
            self.greek_name = book['greek_name']
    
    def print_all(self, transliterated=True, original=True):
        for chapter in self.chapters:
            print('Chapter:', chapter.chapter_number)
            chapter.print_all(transliterated=transliterated, original=original)
        
    
@dataclass        
class Document():
    books: List[Book]
    meta: Meta
    
    def print_all(self, transliterated=True, original=True):
        for book in self.books:
            print('Book:', book.book_name)
            book.print_all(transliterated=transliterated, original=original)
  
@dataclass
class Corpus():
    documents: List[Document]
    meta: Optional[Meta] = None
    
    def get_books(self):
        for doc in self.documents:
            for book in doc.books:
                yield book
                
    def get_book(self, book_name):
        for book in self.get_books():
            if book.book_name == book_name:
                return book
    
    def print_books(self):
        for book in self.get_books():
            if book.alternate_english_name:
                print(f'{book.book_name:8} {book.english_name:50}{book.alternate_english_name:25} {book.greek_name}')
            else:
                print(f'{book.book_name:8} {book.english_name:50}{" ":25} {book.greek_name}')
                
    def get_vocab_counter(self):
        vocab: defaultdict = defaultdict(int)
        for book in self.get_books():
            for chapter in book.chapters:
                for verse in chapter.verses:
                    for sentence in verse.sentences:
                        for token in sentence.tokens:
                            vocab[token.lemma_original] += 1
        return Counter(vocab)
    
    def get_sentences(self, dictionary_form:bool=False)->Generator[List[str], None, None]:
        """ Python generator to return raw sentences from the entire Rahlf Septuagint.  Includes all books, including those books with multiple
        translations.  Books are in no particular order.  Within each book, sentences are in order.  Each sentence is a list of strings.  Each string represents
        a token.  If dictionary_form is False (the default), the token will be the text as it appears in the book; if True it will be the dictionary
        form of the word.  There is no identification of books, chapter, or verse.  This function is designed for training on the entire corpus. This is
        a convenience function.  For other needs, the get_books function provides an easy way to iterate through the books of the corpus. 
        
        Parmameters:
            dictionary_form: If true, the dictionary form of each token will be returned.  Otherwise, the form of the word in the document will
            be returned
            
        Returns: Every sentence in the corpus.  Every sentence is a list of tokens.
            
        """
        for book in self.get_books():
            for chapter in book.chapters:
                for verse in chapter.verses:
                    for sentence in verse.sentences:
                        yield [t.dictionary_form_original if dictionary_form else t.lemma_original
                                  for t in sentence.tokens]
                        
    def get_tokens_glove(self, filename):
        """ Outputs a file conforming to the requirements of the Glove algorithm.  We'll start with the
        entire Septuagint as a single document """
        with open(filename, 'w', encoding='utf8') as fh:
            document = ''
            for book in self.get_books():
                for chapter in book.chapters:
                    for verse in chapter.verses:
                        for sentence in verse.sentences:
                            for token in sentence.tokens:
                                document += token.lemma_original + ' '
            fh.write(document)
                
                                

def load_morph_file(filename, name, language, source, encoding='UTF8', description=None, license=None, include_raw=False):
    raw = None
    if include_raw:
        with open(filename, 'r', encoding=encoding) as fd:
            raw = fd.read()
    
    document_meta = Meta(filename=filename, name=name, language=language, source=source, encoding=encoding, 
                offsets=(0,-1), description=description, license=None, raw=raw)
    
    books = []
    
    with open(filename, encoding=encoding) as fd:
        current_sentence = []
        current_book = []
        current_book_name = None
        current_sentence = []
        current_chapter_number = None
        current_chapter = []
        current_verse_number = None
        current_verse = []
        
        sentence_number = 0
        word_number = 0
        
        for i, line in enumerate(fd):
            line = line.strip()
            matched = False
            if match := BOOK_CHAPTER_VERSE_PATTERN.match(line):
                book_name, chapter_number, verse_number = match.groups()
                matched = True
            elif match := BOOK_CHAPTER_PATTERN.match(line):
                # print('BOOK_CHAPTER_PATTERN==>', line)
                book_name, chapter_number = match.groups()
                verse_number = None
                matched = True
            elif match := BOOK_PATTERN.match(line):
                # print('BOOK_PATTERN==>', line)
                book_name = match.group(1)
                chapter_number = verse_number = None
                matched = True
            if matched:
                word_number = 0
                if current_book_name is None:
                    current_book_name = book_name
                    current_chapter_number = chapter_number
                    current_verse_number = verse_number
                if current_verse_number != verse_number \
                        or current_chapter_number != chapter_number \
                        or current_book_name != book_name:
                    if current_sentence:
                        current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
                    current_sentence = []
                    current_chapter.append(Verse(sentences=current_verse, verse_number=current_verse_number))
                    current_verse = []
                    current_verse_number = verse_number
                if current_chapter_number != chapter_number or current_book_name != book_name:
                    current_book.append(Chapter(verses=current_chapter, chapter_number=current_chapter_number))
                    current_chapter = []
                    current_chapter_number = chapter_number
                if current_book_name != book_name:
                    if current_book_name:
                        books.append(Book(chapters=current_book, book_name=current_book_name))
                        current_book = []
                    current_book_name = book_name
                continue
            if len(line) == 0:
                if current_sentence:
                    current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
                    current_sentence = []
                    sentence_number += 1
                continue
            
            # current_sentence.append(line)
            if match := TOKEN_PATTERN.match(line):
                lemma_transliteration, type_code, parse_code, dictionary_form_transliteration, \
                    other_transliteration = match.groups()
                token = Token(lemma_transliteration=lemma_transliteration.strip(), 
                              type_code=type_code.strip(), 
                              parse_code=parse_code.strip(),
                              dictionary_form_transliteration=dictionary_form_transliteration,
                              other_transliteration=other_transliteration.strip())
                current_sentence.append(token)
                # print(token)
                word_number += 1
            else:
                print('NOT MATCH==>', line)
            
        if current_sentence:
            current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
        if current_verse:
            current_chapter.append(Verse(sentences=current_verse, verse_number=current_verse_number))
        if current_chapter:
            current_book.append(Chapter(verses=current_chapter, chapter_number=current_chapter_number))
        if current_book:
            books.append(Book(chapters=current_book, book_name=current_book_name))
            
        return Document(books=books, meta=document_meta)
                

def load_corpus(path='../../data/catss/greek_morph'):
    files = glob(os.path.join(path, '*.mlxx'))
    documents = []
    for file in files:
        name = re.match(r'\S*[/\\]\d\d\.(\S+).mlxx', file).group(1)
        print(file, name)
        documents.append(load_morph_file(file, name, 'greek', 'catss'))
    return Corpus(documents=documents)
        
                

In [55]:
corpus = load_corpus()

../../data/catss/greek_morph\01.Gen.1.mlxx Gen.1
../../data/catss/greek_morph\02.Gen.2.mlxx Gen.2
../../data/catss/greek_morph\03.Exod.mlxx Exod
../../data/catss/greek_morph\04.Lev.mlxx Lev
../../data/catss/greek_morph\05.Num.mlxx Num
../../data/catss/greek_morph\06.Deut.mlxx Deut
../../data/catss/greek_morph\07.JoshB.mlxx JoshB
../../data/catss/greek_morph\08.JoshA.mlxx JoshA
../../data/catss/greek_morph\09.JudgesB.mlxx JudgesB
../../data/catss/greek_morph\10.JudgesA.mlxx JudgesA
../../data/catss/greek_morph\11.Ruth.mlxx Ruth
../../data/catss/greek_morph\12.1Sam.mlxx 1Sam
../../data/catss/greek_morph\13.2Sam.mlxx 2Sam
../../data/catss/greek_morph\14.1Kings.mlxx 1Kings
../../data/catss/greek_morph\15.2Kings.mlxx 2Kings
../../data/catss/greek_morph\16.1Chron.mlxx 1Chron
../../data/catss/greek_morph\17.2Chron.mlxx 2Chron
../../data/catss/greek_morph\18.1Esdras.mlxx 1Esdras
../../data/catss/greek_morph\19.2Esdras.mlxx 2Esdras
../../data/catss/greek_morph\20.Esther.mlxx Esther
../../data/c

In [56]:
corpus.print_books()

Gen      Genesis                                                                     Γένεσις
Gen      Genesis                                                                     Γένεσις
Exod     Exodus                                                                      Ἔξοδος
Lev      Leviticus                                                                   Λευϊτικόν
Num      Numbers                                                                     Ἀριθμοί
Deut     Deuteronomy                                                                 Δευτερονόμιον
JoshB    Joshua B (Codex Vaticanus)                                                  Ἰησοῦς Ναυῆ
JoshA    Joshua A (Codex Alexandrinus)                                               Ἰησοῦς Ναυῆ
JudgB    Judges B (Codex Vaticanus)                                                  Κριταί
JudgA    Judges A (Codex Alexandrinus)                                               Κριταί
Ruth     Ruth                                            

In [57]:
corpus.get_tokens_glove('embeddings/lxx.txt')

In [44]:
chr_1 = corpus.get_book('1Chr')

In [20]:
doc = load_morph_file(test_file, 'Genesis 1:1', 'greek', 'catss')

In [31]:
counter = corpus.get_vocab_counter()

In [34]:
sum(counter.values())

623685

In [43]:
for i, l in enumerate(corpus.get_sentences()):
    if i > 100:
        break
    print(' '.join(l))
    # print(l)

ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν
ἡ δὲ γῆ ἦν ἀόρατος καὶ ἀκατασκεύαστος καὶ σκότος ἐπάνω τῆς ἀβύσσου καὶ πνεῦμα θεοῦ ἐπεφέρετο ἐπάνω τοῦ ὕδατος
καὶ εἶπεν ὁ θεός γενηθήτω φῶς καὶ ἐγένετο φῶς
καὶ εἶδεν ὁ θεὸς τὸ φῶς ὅτι καλόν καὶ διεχώρισεν ὁ θεὸς ἀνὰ μέσον τοῦ φωτὸς καὶ ἀνὰ μέσον τοῦ σκότους
καὶ ἐκάλεσεν ὁ θεὸς τὸ φῶς ἡμέραν καὶ τὸ σκότος ἐκάλεσεν νύκτα καὶ ἐγένετο ἑσπέρα καὶ ἐγένετο πρωί ἡμέρα μία
καὶ εἶπεν ὁ θεός γενηθήτω στερέωμα ἐν μέσῳ τοῦ ὕδατος καὶ ἔστω διαχωρίζον ἀνὰ μέσον ὕδατος καὶ ὕδατος καὶ ἐγένετο οὕτως
καὶ ἐποίησεν ὁ θεὸς τὸ στερέωμα καὶ διεχώρισεν ὁ θεὸς ἀνὰ μέσον τοῦ ὕδατος ὃ ἦν ὑποκάτω τοῦ στερεώματος καὶ ἀνὰ μέσον τοῦ ὕδατος τοῦ ἐπάνω τοῦ στερεώματος
καὶ ἐκάλεσεν ὁ θεὸς τὸ στερέωμα οὐρανόν καὶ εἶδεν ὁ θεὸς ὅτι καλόν καὶ ἐγένετο ἑσπέρα καὶ ἐγένετο πρωί ἡμέρα δε

In [45]:
chr_1

Book(chapters=[Chapter(verses=[Verse(sentences=[Sentence(tokens=[Token(lemma_transliteration='*ADAM', type_code='N', parse_code='NSM', dictionary_form_transliteration='*ADAM', word_number=0, lemma_original='Αδαμ', dictionary_form_original='Αδαμ', other_transliteration='', other_original=None, meta=None), Token(lemma_transliteration='*SHQ', type_code='N', parse_code='NSM', dictionary_form_transliteration='*SHQ', word_number=0, lemma_original='Σηθ', dictionary_form_original='Σηθ', other_transliteration='', other_original=None, meta=None), Token(lemma_transliteration='*ENWS', type_code='N', parse_code='NSM', dictionary_form_transliteration='*ENWS', word_number=0, lemma_original='Ενως', dictionary_form_original='Ενως', other_transliteration='', other_original=None, meta=None)], sentence_number=0, meta=None)], verse_number='1', meta=None), Verse(sentences=[Sentence(tokens=[Token(lemma_transliteration='*KAINAN', type_code='N', parse_code='NSM', dictionary_form_transliteration='*KAINAN', word