# DataLoader

Working prototype of the data loader.  Will be converted to standard Python

In [8]:
from dataclasses import dataclass
from typing import Tuple, List, Optional
import textwrap
import re
import string

import pandas as pd

### Convert Packard markup to the Greek Alphabet

In [15]:
def make_markup_to_greek_converter():
    english = string.ascii_uppercase + r"35)(|/\=+:'"

    greek_lower = [''] * len(english)
    greek_upper = [''] * len(english)
    greek_lower[english.find('A')] = '\u03b1'
    greek_upper[english.find('A')] = '\u0391'
    
    greek_lower[english.find('B')] = '\u03b2'
    greek_upper[english.find('B')] = '\u0392'
    
    greek_lower[english.find('G')] = '\u03b3'
    greek_upper[english.find('G')] = '\u0393'
    
    greek_lower[english.find('D')] = '\u03b4'
    greek_upper[english.find('D')] = '\u0394'
    
    greek_lower[english.find('E')] = '\u03b5'
    greek_upper[english.find('E')] = '\u0395'
    
    greek_lower[english.find('V')] = '\u03dd'
    greek_upper[english.find('V')] = '\u03dc'
    
    greek_lower[english.find('Z')] = '\u03b6'
    greek_upper[english.find('Z')] = '\u0396'
    
    greek_lower[english.find('H')] = '\u03b7'
    greek_upper[english.find('H')] = '\u0397'
    
    greek_lower[english.find('Q')] = '\u03b8'
    greek_upper[english.find('Q')] = '\u0398'
    
    greek_lower[english.find('I')] = '\u03b9'
    greek_upper[english.find('I')] = '\u0399'
    
    greek_lower[english.find('K')] = '\u03ba'
    greek_upper[english.find('K')] = '\u039a'
    
    greek_lower[english.find('L')] = '\u03bb'
    greek_upper[english.find('L')] = '\u039b'
    
    greek_lower[english.find('M')] = '\u03bc'
    greek_upper[english.find('M')] = '\u039c'
    
    greek_lower[english.find('N')] = '\u03bd'
    greek_upper[english.find('N')] = '\u039d'
    
    greek_lower[english.find('C')] = '\u03be'
    greek_upper[english.find('C')] = '\u039e'
    
    greek_lower[english.find('O')] = '\u03bf'
    greek_upper[english.find('O')] = '\u039f'
    
    greek_lower[english.find('P')] = '\u03c0'
    greek_upper[english.find('P')] = '\u03a0'
    
    greek_lower[english.find('3')] = '\u03d9'
    greek_upper[english.find('3')] = '\u03d8'
    
    greek_lower[english.find('R')] = '\u03c1'
    greek_upper[english.find('R')] = '\u03a1'
    
    greek_lower[english.find('S')] = '\u03c3'
    greek_upper[english.find('S')] = '\u03a3'
    
    greek_lower[english.find('J')] = '\u03c2'
    greek_upper[english.find('J')] = '\u03a3'  
    
    greek_lower[english.find('T')] = '\u03c4'
    greek_upper[english.find('T')] = '\u03a4'
    
    greek_lower[english.find('U')] = '\u03c5'
    greek_upper[english.find('U')] = '\u03a5'
    
    greek_lower[english.find('F')] = '\u03c6'
    greek_upper[english.find('F')] = '\u03a6'
    
    greek_lower[english.find('X')] = '\u03c7'
    greek_upper[english.find('X')] = '\u03a7'
    
    greek_lower[english.find('Y')] = '\u03c8'
    greek_upper[english.find('Y')] = '\u03a8'
    
    greek_lower[english.find('W')] = '\u03c9'
    greek_upper[english.find('W')] = '\u03a9'
    
    greek_lower[english.find('5')] = '\u03e1'
    greek_upper[english.find('5')] = '\u03e0'
    
    greek_lower[english.find(')')] = '\u0313'
    greek_upper[english.find(')')] = '\u0313'
    greek_lower[english.find('(')] = '\u0314'
    greek_upper[english.find('(')] = '\u0314'
    greek_lower[english.find('|')] = '\u0345'
    greek_upper[english.find('|')] = '\u0345'
    greek_lower[english.find('/')] = '\u0301'
    greek_upper[english.find('/')] = '\u0301'
    greek_lower[english.find('\\')] = '\u0300'
    greek_upper[english.find('\\')] = '\u0300'
    greek_lower[english.find('=')] = '\u0342'
    greek_upper[english.find('=')] = '\u0342'
    greek_lower[english.find('+')] = '\u0308'
    greek_upper[english.find('+')] = '\u0308'
    greek_lower[english.find(':')] = '\u00B7'
    greek_upper[english.find(':')] = '\u00B7'
    greek_lower[english.find("'")] = '\u0384'
    greek_upper[english.find("'")] = '\u0384'

    greek_lower = ''.join(greek_lower)
    greek_upper = ''.join(greek_upper)
    

    def converter(english_str: str) -> str:
        greek_str = ''
        greek_ = greek_lower
        for i in range(len(english_str)):
            es = english_str[i]
            if es == '*':
                greek_ = greek_upper
                continue
            if es == 'S' and ((i == len(english_str) -1) or english_str[i+1] == ' '):
                es = 'J'
            if es in english:
                greek_str += greek_[english.find(es)]
            elif es == ' ':
                greek_str += ' '
            else:
                greek_str += f'__{es}__'
            greek_ = greek_lower
        return greek_str
                                   
                                   
    
    return converter

greek_converter = make_markup_to_greek_converter()

In [16]:
greek_converter(r'E)N A)RXH=| E)POI/HSEN O( QEO\S TO\N ')

'ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν '

### Greek Morphology

This first part loads the data into structures that are easier to process.  

In [11]:
test_file = '../../source_files/catss/greek_morph/01.Gen.1.mlxx'

In [12]:
BOOK_CHAPTER_VERSE_PATTERN = re.compile('^(\S+)\s+(\d+):(\d+)\s*$')
TOKEN_PATTERN = re.compile('(.{25})(.{4})(.{7})(\S{1,17})\s*(\S*)')


@dataclass
class Meta():
    name: str
    language: str
    source: str    
    filename: str
    raw: str
    encoding: str
    offsets: Tuple[int, int]
    description: str
    license: str
    
    def __repr__(self):
        repr = f'Meta(name={self.name}, language={self.language}, source={self.source}, filename={self.filename}, raw={textwrap.shorten(self.raw, width=20)}, encoding={self.encoding},'
        repr += f'offsets={self.offsets}, description={self.description}, license={self.license})'
        return repr

@dataclass
class Token():
    lemma_transliteration: str
    type_code: str
    parse_code: str
    dictionary_form_transliteration: str
    word_number: int = 0
    lemma_original: Optional[str] = None
    dictionary_form_original: Optional[str] = None
    other_transliteration: Optional[str] = None
    other_original: Optional[str] = None
    meta: Optional[Meta] = None
    
    def to_greek(str):
        pass
    
    def __post_init__(self):
        if self.lemma_transliteration:
            self.lemma_original = greek_converter(self.lemma_transliteration)
        if self.dictionary_form_transliteration:
            self.dictionary_form_original = greek_converter(self.dictionary_form_transliteration)
        if self.other_transliteration:
            self.other_original = greek_converter(self.other_transliteration)
    
@dataclass 
class Sentence():
    tokens: List[Token]
    sentence_number: int = 0
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        if transliterated:
            print(' '.join([t.lemma_transliteration for t in self.tokens]))
        if original:
            print(' '.join([t.lemma_original for t in self.tokens]))

    
@dataclass
class Verse():
    sentences: List[Sentence]
    verse_number: str
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        for sentence in self.sentences:
            sentence.print_all(transliterated=transliterated, original=original)
    
    
@dataclass
class Chapter():
    verses: List[Verse]
    chapter_number: str
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        for verse in self.verses:
            print('Verse:', verse.verse_number)
            verse.print_all(transliterated=transliterated, original=original)
    
    
@dataclass      
class Book():
    chapters: List[Chapter]
    book_name: str
    meta: Optional[Meta] = None
    
    def print_all(self, transliterated=True, original=True):
        for chapter in self.chapters:
            print('Chapter:', chapter.chapter_number)
            chapter.print_all(transliterated=transliterated, original=original)
        
    
@dataclass        
class Document():
    books: List[Book]
    meta: Meta
    
    def print_all(self, transliterated=True, original=True):
        for book in self.books:
            print('Book:', book.book_name)
            book.print_all(transliterated=transliterated, original=original)
  

def load_morph_file(filename, name, language, source, encoding='UTF8', description=None, license=None, include_raw=True):
    raw = None
    if include_raw:
        with open(filename, 'r', encoding=encoding) as fd:
            raw = fd.read()
    
    document_meta = Meta(filename=filename, name=name, language=language, source=source, encoding=encoding, 
                offsets=(0,-1), description=description, license=None, raw=raw)
    
    books = []
    
    with open(test_file, encoding=encoding) as fd:
        current_sentence = []
        current_book = []
        current_book_name = None
        current_sentence = []
        current_chapter_number = None
        current_chapter = []
        current_verse_number = None
        current_verse = []
        
        sentence_number = 0
        word_number = 0
        
        for i, line in enumerate(fd):
            line = line.strip()
            if match := BOOK_CHAPTER_VERSE_PATTERN.search(line):
                book_name, chapter_number, verse_number = match.groups()
                word_number = 0
                if current_verse_number != verse_number \
                        or current_chapter_number != chapter_number \
                        or current_book_name != book_name:
                    current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
                    current_sentence = []
                    current_chapter.append(Verse(sentences=current_verse, verse_number=current_verse_number))
                    current_verse = []
                    current_verse_number = verse_number
                if current_chapter_number != chapter_number or current_book_name != book_name:
                    current_book.append(Chapter(verses=current_chapter, chapter_number=current_chapter_number))
                    current_chapter = []
                    current_chapter_number = chapter_number
                if current_book_name != book_name:
                    if current_book_name:
                        books.append(Book(chapters=current_book, book_name=current_book_name))
                        current_book = []
                    current_book_name = book_name
                continue
            if len(line) == 0:
                if current_sentence:
                    current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
                    current_sentence = []
                    sentence_number += 1
                continue
            
            # current_sentence.append(line)
            if match := TOKEN_PATTERN.match(line):
                lemma_transliteration, type_code, parse_code, dictionary_form_transliteration, \
                    other_transliteration = match.groups()
                token = Token(lemma_transliteration=lemma_transliteration.strip(), 
                              type_code=type_code.strip(), 
                              parse_code=parse_code.strip(),
                              dictionary_form_transliteration=dictionary_form_transliteration,
                              other_transliteration=other_transliteration.strip())
                current_sentence.append(token)
                # print(token)
                word_number += 1
            else:
                print('NOT MATCH==>', line)
            
            # print(sentence_number, word_number, i,  line)
            
            
        if current_sentence:
            current_verse.append(Sentence(tokens=current_sentence, sentence_number=sentence_number))
        if current_verse:
            current_chapter.append(Verse(sentences=current_verse, verse_number=current_verse_number))
        if current_chapter:
            current_book.append(Chapter(verses=current_chapter, chapter_number=current_chapter_number))
        if current_book:
            books.append(Book(chapters=current_book, book_name=current_book_name))
            
        return Document(books=books, meta=document_meta)
                
        
                

In [13]:
doc = load_morph_file(test_file, 'Genesis 1:1', 'greek', 'catss')

In [14]:
doc.print_all()

Book: Gen
Chapter: None
Verse: None


Chapter: 1
Verse: 1
E)N A)RXH=| E)POI/HSEN O( QEO\S TO\N OU)RANO\N KAI\ TH\N GH=N
ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν


Verse: 2
H( DE\ GH= H)=N A)O/RATOS KAI\ A)KATASKEU/ASTOS KAI\ SKO/TOS E)PA/NW TH=S A)BU/SSOU KAI\ PNEU=MA QEOU= E)PEFE/RETO E)PA/NW TOU= U(/DATOS
ἡ δὲ γῆ ἦν ἀόρατος καὶ ἀκατασκεύαστος καὶ σκότος ἐπάνω τῆς ἀβύσσου καὶ πνεῦμα θεοῦ ἐπεφέρετο ἐπάνω τοῦ ὕδατος


Verse: 3
KAI\ EI)=PEN O( QEO/S GENHQH/TW FW=S KAI\ E)GE/NETO FW=S
καὶ εἶπεν ὁ θεός γενηθήτω φῶς καὶ ἐγένετο φῶς


Verse: 4
KAI\ EI)=DEN O( QEO\S TO\ FW=S O(/TI KALO/N KAI\ DIEXW/RISEN O( QEO\S A)NA\ ME/SON TOU= FWTO\S KAI\ A)NA\ ME/SON TOU= SKO/TOUS
καὶ εἶδεν ὁ θεὸς τὸ φῶς ὅτι καλόν καὶ διεχώρισεν ὁ θεὸς ἀνὰ μέσον τοῦ φωτὸς καὶ ἀνὰ μέσον τοῦ σκότους


Verse: 5
KAI\ E)KA/LESEN O( QEO\S TO\ FW=S H(ME/RAN KAI\ TO\ SKO/TOS E)KA/LESEN NU/KTA KAI\ E)GE/NETO E(SPE/RA KAI\ E)GE/NETO PRWI/ H(ME/RA M

In [20]:
for book in doc.books:
    print(book.book_name)

Gen


In [86]:
doc.meta

Meta(name=Genesis 1:1, language=greek, source=catss, filename=../../source_files/catss/greek_morph/01.Gen.1.mlxx, raw=Gen 1:1 E)N P [...], encoding=UTF8,offsets=(0, -1), description=None, license=None)

In [None]:
with open(test_file) as fd:
    for line in fd:
        print(line.strip())

In [100]:
'\u03b1\u0313', '\u0313\u03b1'

('ἀ', '̓α')

In [99]:
'\u1f00'

'ἀ'