In [167]:
import os
import re
import glob
import string

from lxml import etree

In [168]:
# Sanity check: how many .docx transcriptions are available in docx/?
# (glob.glob already returns a list, so no extra list() call is needed.)
print(len(glob.glob('docx/*.docx')))

264


In [169]:
from docx import Document

In [170]:
# Folio to convert; the same value is later passed to parse_line, where it
# becomes part of the xml:id of every <lb/> element.
page_num = '111r'

# Load the Word transcription of that folio (opened in binary mode and handed
# to Document as a file object).
with open(f'docx/{page_num}.docx', 'rb') as f:
    document = Document(f)

In [171]:
# Expansions that correspond to exactly one abbreviation sign.
_ABBREVIATION_SIGNS = {
    # horizontal bar ('nm' is kept for backward compatibility: earlier
    # versions of this function accepted it as well)
    'n': '<g ref="#bar"/>',
    'm': '<g ref="#bar"/>',
    'nm': '<g ref="#bar"/>',
    'de': '<g ref="#bar"/>',
    'et': '<g ref="#etfin"/>',
    'at': '<g ref="#etfin"/>',
    'pro': '<g ref="#pflour"/>',
    'par': '<g ref="#pbardes"/>',
    'per': '<g ref="#pbardes"/>',
    'con': '<g ref="#usmod"/>',
    'us': '<g ref="#usmod"/>',
    'com': '<g ref="#usmod"/>',
    # superscript letters
    'rv': '<hi rend="superscript">v</hi>',
    'ri': '<hi rend="superscript">i</hi>',
    'ur': '<hi rend="superscript">z</hi>',
    'ue': '<hi rend="superscript">e</hi>',
    'ro': '<hi rend="superscript">o</hi>',
    'ua': '<hi rend="superscript">u</hi>',
    'ra': '<hi rend="superscript">u</hi>',
    're': '<hi rend="superscript">e</hi>',
    'eit': '<hi rend="superscript">t</hi>',
    'iet': '<hi rend="superscript">t</hi>',
}


def reverse_engineer_abbreviation(solution):
    """Guess the abbreviation sign that was expanded to ``solution``.

    Parameters
    ----------
    solution : str
        The expanded letters of an abbreviation (the italic text of the
        transcription), e.g. ``'n'``, ``'er'`` or ``'at'``.

    Returns
    -------
    tuple of (str, str)
        ``(abbr, expan)``: the markup for the abbreviation sign and the
        expansion wrapped in ``<ex>...</ex>``.

    Notes
    -----
    Exact matches are looked up first; only afterwards is the generic
    "ends in r" rule applied, so that 'par', 'per' and 'ur' get their own
    sign instead of being swallowed by that rule. Anything that is still
    unknown (including the empty string) is rendered as a superscript and
    reported on stdout so it can be checked by hand.
    """
    expan = f'<ex>{solution}</ex>'

    abbr = _ABBREVIATION_SIGNS.get(solution)
    if abbr is None:
        if solution.endswith('r'):
            abbr = '<g ref="#apomod"/>'
        else:
            abbr = '<hi rend="superscript">' + solution + '</hi>'
            print(f'-> unsolvable abbreviation: {solution}')

    return abbr, expan

In [172]:
def parse_word(word):
    """Encode one whitespace-delimited word of a transcription line.

    Inside ``word`` a ``%`` opens and an ``@`` closes a stretch of expanded
    abbreviation letters (the italic runs of the Word document).

    Parameters
    ----------
    word : str
        A single token, possibly containing ``%...@`` stretches.

    Returns
    -------
    str
        The word unchanged (markers removed) when it holds no expansion;
        otherwise ``<choice><abbr>...</abbr><expan>...</expan></choice>``
        in which every expanded stretch is replaced by the result of
        ``reverse_engineer_abbreviation``.
    """
    parts, flags = [], []
    part, flag = '', False

    for char in word:
        if char == '%' or char == '@':
            # A marker closes the part collected so far. Empty parts are
            # skipped so that an empty italic run ('%@') does not turn into
            # an abbreviation of nothing.
            if part:
                parts.append(part)
                flags.append(flag)
            part, flag = '', char == '%'
        else:
            part += char

    # dangling bit:
    if part:
        parts.append(part)
        flags.append(flag)

    # Only words that really contain an expansion need <choice> markup; this
    # is decided on the flags, not on the number of parts, so a fully
    # abbreviated word keeps its markup and a stray '@' does not create any.
    if not any(flags):
        return ''.join(parts)

    solutions, abbrevs = [], []
    for part, flag in zip(parts, flags):
        if flag:
            abbr, solution = reverse_engineer_abbreviation(part)
            solutions.append(solution)
            abbrevs.append(abbr)
        else:
            solutions.append(part)
            abbrevs.append(part)

    abbr = "<abbr>" + ''.join(abbrevs) + "</abbr>"
    expan = "<expan>" + ''.join(solutions) + "</expan>"
    return '<choice>' + abbr + expan + '</choice>'

In [175]:
# Rubricated initial: &<capital>_<size digit>&, e.g. '&W_1&'.
# Raw string: '\&' and '\_' are not valid escapes in a normal string literal
# (SyntaxWarning on recent Python versions); the regex itself is unchanged.
rubric = re.compile(r'\&([A-Z])\_([0-9])\&')
# Tokens that are encoded as <pc> (verse punctuation) instead of as words.
verse_punct = '. : | |. .| |: :|'.split()

def parse_line(line, line_num=None, page_num=None, kind=None):
    """Encode one transcription line as a string of XML markup.

    Parameters
    ----------
    line : str
        The line text. A trailing ``%#@`` or ``-`` marks a word that is
        split over the line end; a leading ``¶`` marks a paragraph sign.
    line_num, page_num : optional
        Only used for ``kind='normal'``, where they end up in the ``n`` and
        ``xml:id`` attributes of the ``<lb/>`` element.
    kind : {'header', 'title', 'normal', None}
        Decides how the encoded line is wrapped: ``<fw type="header">``,
        ``<head>``, a leading ``<lb/>``, or (for any other value) no
        wrapper and no trailing newline.

    Returns
    -------
    str
        The encoded line.
    """
    word_split = None
    if line.endswith('%#@'):
        word_split = 'shy'
        line = line.replace('%#@', '')
    elif line.endswith('-'):
        word_split = 'explicit'
        line = line[:-1]

    para_left = False
    if line.startswith('¶'):
        # assumption: if line starts with para sign, it will be placed in left margin
        para_left = True
        line = line[1:].strip()

    encoded_tokens = []
    for word in line.split():
        if word in verse_punct:
            encoded_tokens.append('<pc>' + word + '</pc>')
        else:
            encoded_tokens.append(parse_word(word))

    # deal with rubrics: substitute every occurrence anywhere in the token.
    # An abbreviated word is wrapped in <choice> by parse_word and contains
    # the rubric twice (in <abbr> and in <expan>), so anchoring the search at
    # the start of the token would leave raw '&X_n&' markers in the output.
    rubric_repl = r'<hi rend="capitalsize\2">\1</hi>'
    encoded_tokens = [rubric.sub(rubric_repl, token) for token in encoded_tokens]

    encoded = ' '.join(encoded_tokens)
    if para_left:
        encoded = '<note place="left" resp="scribe"><g ref="#para"/></note> ' + encoded
    if word_split == 'shy':
        encoded += '<choice><sic></sic><corr><c type="shy">-</c></corr></choice>'
    elif word_split == 'explicit':
        encoded += '<c type="shy">-</c>'

    if kind == 'header':
        encoded = f'<fw type="header">{encoded}</fw>\n'
    elif kind == 'title':
        encoded = f'<head>{encoded}</head>\n'
    elif kind == 'normal':
        encoded = f'<lb n="{line_num}" xml:id="HB.f{page_num}.{line_num}"/>' + encoded + '\n'

    return encoded

In [176]:
# A manually added line number: digits at the very start of the line,
# followed by whitespace. (Raw string: '\s' is not a valid escape in a
# normal string literal.)
line_number_presence = re.compile(r'^\s*[0-9]+\s+')

for idx, para in enumerate(document.paragraphs):
    # Italic runs hold the expanded abbreviations: wrap them in %...@ so that
    # parse_word can find them again. Empty italic runs are skipped, they
    # would otherwise show up as abbreviations of nothing.
    line = ''
    for run in para.runs:
        if run.italic and run.text:
            line += f'%{run.text}@'
        else:
            line += run.text

    line = line.strip()
    if not line:
        continue

    # remove manually added line numbers (no-op when there is none):
    line = line_number_presence.sub('', line)

    if line.startswith('$') and line.endswith('$'):
        # $...$ marks a running header
        line = line[1:-1]
        parsed = parse_line(line, kind='header')
    elif line.startswith('!') and line.endswith('!'):
        # !...! marks a title
        line = line[1:-1]
        parsed = parse_line(line, kind='title')
    else:
        # NOTE(review): the line number is the paragraph position in the
        # document (headers and skipped empty paragraphs included) -- confirm
        # that this is the intended numbering.
        parsed = parse_line(line, line_num=idx + 1, page_num=page_num, kind='normal')

    print(parsed)
        

<fw type="header">i</fw>

<lb n="3" xml:id="HB.f111r.3"/>70 <hi rend="capitalsize1">W</hi>at mochte eua <choice><abbr>d<g ref="#etfin"/></abbr><expan>d<ex>at</ex></expan></choice> <choice><abbr>ada<g ref="#bar"/></abbr><expan>ada<ex>m</ex></expan></choice> <pc>:</pc> Dor <choice><abbr>hare<g ref="#bar"/></abbr><expan>hare<ex>n</ex></expan></choice> wille den

<lb n="4" xml:id="HB.f111r.4"/>appel <choice><abbr>na<g ref="#bar"/></abbr><expan>na<ex>m</ex></expan></choice> <pc>:</pc> <choice><abbr>D<g ref="#etfin"/></abbr><expan>D<ex>at</ex></expan></choice> wi noch <choice><abbr>best<g ref="#apomod"/>ue<g ref="#bar"/></abbr><expan>best<ex>er</ex>ue<ex>n</ex></expan></choice> <pc>:</pc> Jc <choice><abbr>wae<g ref="#bar"/>re</abbr><expan>wae<ex>n</ex>re</expan></choice> ons

<lb n="5" xml:id="HB.f111r.5"/>vordeel af <choice><abbr>qua<g ref="#bar"/></abbr><expan>qua<ex>m</ex></expan></choice> <pc>:</pc> Want <choice><abbr>hi<g ref="#apomod"/>om<g ref="#bar"/>e</abbr><expan>hi<ex>er</ex>om<ex

### TODO
- Small caps (bv. nummers) worden nog niet opgevangen
- Strofenummers worden niet onderscheiden van regelnummers
- Onderlijnde stukken worden nog niet opgevangen

### Nota's en vragen:
- Ook eenregelige letters in rood met accolade
- Toegevoegde woordafbrekingen (#) staan cursief: hebben we dit consequent gedaan?
- : - zou :- moeten worden