In [1]:
import os
import re
import glob
import string

from lxml import etree

In [2]:
# Report how many .docx files sit in the docx/ directory.
print(len(glob.glob('docx/*.docx')))

161


In [3]:
from docx import Document

In [4]:
# Open docx/1r.docx in binary mode and pass the file object to Document.
# The resulting `document` is what the paragraph loop further down iterates.
with open('docx/1r.docx', 'rb') as f:
    document = Document(f)

In [5]:
# Expansions that map to one specific abbreviation mark. They are looked up
# before the generic "ends with r" rule so that 'par', 'per' and 'ur' reach
# their own marks instead of being swallowed by that rule.
_EXACT_ABBREVIATION_MARKS = {
    'n': '<g ref="#bar"/>',
    'm': '<g ref="#bar"/>',
    'de': '<g ref="#bar"/>',
    'et': '<g ref="#etfin"/>',
    'at': '<g ref="#etfin"/>',
    'pro': '<g ref="#pflour"/>',
    'par': '<g ref="#pbardes"/>',
    'per': '<g ref="#pbardes"/>',
    'con': '<g ref="#usmod"/>',
    'us': '<g ref="#usmod"/>',
    'com': '<g ref="#usmod"/>',
    'rv': '<hi rend="superscript">v</hi>',
    'ri': '<hi rend="superscript">i</hi>',
    'ur': '<hi rend="superscript">z</hi>',
    'ue': '<hi rend="superscript">e</hi>',
    'ro': '<hi rend="superscript">o</hi>',
    'ua': '<hi rend="superscript">u</hi>',
    'ra': '<hi rend="superscript">u</hi>',
    're': '<hi rend="superscript">e</hi>',
    'eit': '<hi rend="superscript">t</hi>',
    'iet': '<hi rend="superscript">t</hi>',
}


def reverse_engineer_abbreviation(solution):
    """Guess the abbreviation mark that stands for an expanded string.

    Parameters
    ----------
    solution : str
        The expanded letters (the text of an italic run).

    Returns
    -------
    tuple of (str, str)
        ``(abbr, expan)``: ``abbr`` is the markup for the abbreviation mark
        (a ``<g ref="..."/>`` glyph reference or a superscript ``<hi>``) and
        ``expan`` is ``solution`` wrapped in ``<ex>...</ex>``.

    Notes
    -----
    Resolution order:

    1. an exact match in ``_EXACT_ABBREVIATION_MARKS``;
    2. any other string ending in ``'r'`` gets the ``#apomod`` glyph;
    3. anything else is reported on stdout as unsolvable and rendered as
       the solution itself in superscript.

    Differences from the previous version:

    * ``'par'``, ``'per'`` and ``'ur'`` now reach their dedicated marks; the
      suffix rule used to run first and made those branches unreachable.
    * The bar mark is chosen only for ``'n'``, ``'m'`` and ``'de'``. The old
      test ``solution in ('nm')`` was a substring check against the string
      ``'nm'`` and therefore also matched ``''`` and ``'nm'``.
    * The bare ``try/except`` around the return was dropped: ``abbr`` is
      always bound, so it could only ever hide an unrelated error.
    """
    expan = f'<ex>{solution}</ex>'

    abbr = _EXACT_ABBREVIATION_MARKS.get(solution)
    if abbr is None:
        if solution.endswith('r'):
            abbr = '<g ref="#apomod"/>'
        else:
            # Unknown expansion: keep the letters visible and flag it so the
            # transcription can be checked by hand.
            abbr = '<hi rend="superscript">' + solution + '</hi>'
            print(f'-> unsolvable abbreviation: {solution}')

    return abbr, expan

In [6]:
def parse_word(word):
    """Encode one whitespace-free token, expanding '%...$' marked stretches.

    The token is cut into segments at the markers: '%' opens an expansion
    and '$' closes it. Text inside the markers is handed to
    ``reverse_engineer_abbreviation``; text outside is copied as is.

    Returns
    -------
    str
        ``<choice><abbr>...</abbr><expan>...</expan></choice>`` when the
        token splits into two or more segments, otherwise the bare text.

    NOTE(review): the ``<choice>`` wrapper depends on the number of segments,
    not on whether any segment is an expansion. A token that is one single
    expansion (e.g. ``'%en$'``) therefore comes back as plain ``'en'``, and a
    token with a stray ``'$'`` but no expansion is still wrapped. Confirm that
    this is intended.
    """
    segments = []                      # (text, is_expansion) in reading order
    buffer, in_expansion = [], False

    for symbol in word:
        if symbol not in ('%', '$'):
            buffer.append(symbol)
            continue

        pending = ''.join(buffer)
        # '$' always closes a segment, even an empty one; '%' only flushes
        # text that has actually accumulated.
        if symbol == '$' or pending:
            segments.append((pending, in_expansion))
        buffer, in_expansion = [], symbol == '%'

    # Whatever follows the last marker.
    leftover = ''.join(buffer)
    if leftover:
        segments.append((leftover, in_expansion))

    if len(segments) <= 1:
        return ''.join(text for text, _ in segments)

    abbr_pieces, expan_pieces = [], []
    for text, is_expansion in segments:
        if is_expansion:
            mark, expanded = reverse_engineer_abbreviation(text)
        else:
            mark = expanded = text
        abbr_pieces.append(mark)
        expan_pieces.append(expanded)

    abbr_xml = '<abbr>' + ''.join(abbr_pieces) + '</abbr>'
    expan_xml = '<expan>' + ''.join(expan_pieces) + '</expan>'
    return '<choice>' + abbr_xml + expan_xml + '</choice>'

In [7]:
def parse_normal_line(line, line_num, page_num):
    """Encode one transcribed line as a string starting with an ``<lb/>``.

    Parameters
    ----------
    line : str
        Raw line text; expansions are expected to be marked as ``%...$``.
    line_num : int or str
        Line number, used for ``n`` and as the last part of ``xml:id``.
    page_num : str
        Page label, used in ``xml:id`` as ``HB.f{page_num}.{line_num}``.

    Returns
    -------
    str
        The ``<lb/>`` element followed by the space-joined encoded tokens.
        A trailing ``-`` on the stripped line is removed from the text and
        re-emitted at the very end as ``<c type="shy">-</c>``.

    Notes
    -----
    A token becomes ``<pc>`` when ``token in string.punctuation`` holds.
    That is a substring test, so a multi-character token only matches if
    its characters appear contiguously in that constant. Every other token
    goes through ``parse_word``.
    """
    text = line.strip()
    hyphenated = text.endswith('-')
    if hyphenated:
        text = text[:-1]

    def encode(token):
        # Punctuation is wrapped directly; anything else may hold expansions.
        if token in string.punctuation:
            return '<pc>' + token + '</pc>'
        return parse_word(token)

    body = ' '.join(encode(token) for token in text.split())
    line_break = f'<lb n="{line_num}" xml:id="HB.f{page_num}.{line_num}"/>'
    soft_hyphen = '<c type="shy">-</c>' if hyphenated else ''
    return line_break + body + soft_hyphen

In [8]:
page_num = '1r'

# Rebuild every paragraph as one marked-up string: the text of italic runs is
# wrapped in '%...$' so the line parser can recognise it. Each encoded line is
# printed after a '::::' separator, with lines numbered from 1.
for idx, para in enumerate(document.paragraphs):
    print('::::')
    pieces = []
    for run in para.runs:
        pieces.append(f'%{run.text}$' if run.italic else run.text)
    line = ''.join(pieces)
    print(parse_normal_line(line, line_num=idx + 1, page_num=page_num))

::::
<lb n="1" xml:id="HB.f1r.1"/>Dit <choice><abbr>sij<g ref="#bar"/></abbr><expan>sij<ex>n</ex></expan></choice> dier liede <choice><abbr>wo<g ref="#apomod"/>de</abbr><expan>wo<ex>er</ex>de</expan></choice> <choice><abbr>va<g ref="#bar"/></abbr><expan>va<ex>n</ex></expan></choice> <choice><abbr>galle<g ref="#bar"/></abbr><expan>galle<ex>n</ex></expan></choice>
::::
<lb n="2" xml:id="HB.f1r.2"/>Die gode die eweleke <choice><abbr>leue<g ref="#bar"/></abbr><expan>leue<ex>n</ex></expan></choice> <pc>:</pc> <choice><abbr>En<g ref="#bar"/></abbr><expan>En<ex>de</ex></expan></choice> jare <choice><abbr>gifte<g ref="#bar"/></abbr><expan>gifte<ex>n</ex></expan></choice> bi rede<c type="shy">-</c>
::::
<lb n="3" xml:id="HB.f1r.3"/><choice><abbr>ne<g ref="#bar"/></abbr><expan>ne<ex>n</ex></expan></choice> <choice><abbr>geue<g ref="#bar"/></abbr><expan>geue<ex>n</ex></expan></choice> <pc>:</pc> Als di ane liede <choice><abbr>wille<g ref="#bar"/></abbr><expan>wille<ex>n</ex></expan></choice> <cho