In [1]:
import os
import re
import glob
import string

from lxml import etree

Regular expressions etc.:

In [2]:
line_number_presence = re.compile('^\s*([0-9]+)\s+')
rubric = re.compile('\&([A-Z])\_([0-9])\&')
verse_punct = '. : | |. .| |: :| :–'.split()

In [3]:
print(len(list(glob.glob('docx/*.docx'))))

0


In [4]:
from docx import Document

In [5]:
page_num = '2v'

with open(f'docx/{page_num}.docx', 'rb') as f:
    document = Document(f)

FileNotFoundError: [Errno 2] No such file or directory: 'docx/2v.docx'

In [114]:
def reverse_engineer_abbreviation(solution):
    expan = f'<ex>{solution}</ex>'
    if solution in 'nm' or solution in ('de', 'ab', 'i', 'um'):
        abbr = '<g ref="#bar"/>'
    elif solution.endswith('r') or solution in ('eu', 'u', 'ne'):
        abbr = '<g ref="#apomod"/>'
    elif solution.endswith(('ente', 'else')):
        abbr = '<g ref="#apomod"/>'
    elif solution in ("et", "at"):
        abbr = '<g ref="#etfin"/>'
    elif solution == "pro":
        abbr = '<g ref="#pflour"/>'
    elif solution == "en":
        abbr = '<g ref="#bar"/>'
    elif solution == "par":
        abbr = '<g ref="#pbardes"/>'
    elif solution == "per":
        abbr = '<g ref="#pbardes"/>'
    elif solution == "con" or solution == "us" or solution == "com":
        abbr = '<g ref="#usmod"/>'
    elif solution == "rv":
        abbr = '<hi rend="superscript">v</hi>'
    elif solution in ("ri", 'ui'):
        abbr = '<hi rend="superscript">i</hi>'
    elif solution == "ur":
        abbr = '<hi rend="superscript">z</hi>'
    elif solution == "aec":
        abbr = '<hi rend="superscript">c</hi>'
    elif solution == "ue":
        abbr = '<hi rend="superscript">e</hi>'
    elif solution == "ro":
        abbr = '<hi rend="superscript">o</hi>'
    elif solution == "ua":
        abbr = '<hi rend="superscript">u</hi>'
    elif solution in ("ra", 'na'):
        abbr = '<hi rend="superscript">u</hi>'
    elif solution == "re":
        abbr = '<hi rend="superscript">e</hi>'
    elif solution == "uae":
        abbr = '<g ref="#bar"/>'
    elif solution in ("eit", "iet", "otest"):
        abbr = '<hi rend="superscript">t</hi>'
    else:
        abbr = '<hi rend="superscript">'+solution+'</hi>'
        print(f'-> unsolvable abbreviation: {solution}')
    try:
        return abbr, expan
    except:
        return '', expan

In [115]:
def parse_word(word):
    parts, flags = [], []
    part, flag = '', False
    
    for char in word:
        if char == '%':
            if part:
                parts.append(part)
                flags.append(flag)
            part, flag = '', True
        elif char == '@':
            parts.append(part)
            flags.append(flag)
            part, flag = '', False
        else:
            part += char
    
    # dangling bit:
    if part:
        parts.append(part)
        flags.append(flag)
        
    if len(parts) > 1:
        solutions, abbrevs = [], []
        for part, flag in zip(parts, flags):
            if flag:
                if part == 'a':
                    print(solutions)
                abbr, solution = reverse_engineer_abbreviation(part)
                solutions.append(solution)
                abbrevs.append(abbr)
            else:
                solutions.append(part)
                abbrevs.append(part)

        abbr = "<abbr>"+''.join(abbrevs)+"</abbr>"
        expan = "<expan>"+''.join(solutions)+"</expan>"
        encoded = '<choice>'+abbr+expan+'</choice>'
    else:
        encoded = ''.join(parts)
    
    return encoded

In [116]:
def parse_line(line, line_num=None, page_num=None, kind=None):
    word_split = None
    if line.endswith('%#@'):
        word_split = 'shy'
        line = line.replace('%#@', '')
    elif line.endswith('-'):
        word_split = 'explicit'
        line = line[:-1]
    
    para_left = False
    if line.startswith('¶'):
        # assumption: if line starts with para sign, it will be placed in left margin
        para_left = True
        line = line[1:].strip()
    
    encoded_tokens = []
    for word in line.split():
        if word in verse_punct:
            encoded_tokens.append(f'<c type="guide">{word}</c></l> <l>')
        else:
            encoded_tokens.append(parse_word(word))
    
    # deal with rubrics:
    for idx, token in enumerate(encoded_tokens):
        m = re.search(rubric, token)
        if m:
            repl = f'<hi rend="capitalsize{m.group(2)}">{m.group(1)}</hi>'
            encoded_tokens[idx] = rubric.sub(repl, token)
    
    encoded = ' '.join(encoded_tokens)
    if para_left:
        encoded = '<note place="left" resp="scribe"><g ref="#para"/></note> ' + encoded
    if word_split == 'shy':
        encoded += '<supplied><c type="shy">-</c></supplied>'
    elif word_split == 'explicit':
        encoded += '<c type="shy">-</c>'
    
    if kind == 'header':
        encoded = f'<fw type="header">{encoded}</fw>'
    elif kind == 'title':
        encoded = f'</lg></body></text><text>\n<body>\n<lb n="???"/>\n<head>{encoded}</head>\n<lg>\n<l>'
    elif kind == 'normal':
        encoded = f'<lb n="{line_num}" xml:id="HB.f{page_num}.{line_num}"/>' + encoded
        if str(line_num) == '1' and str(page_num) == '1r': # only for very first line of MS
            encoded = encoded.replace('<lb n="1" xml:id="HB.f1r.1"/>', '<lb n="1" xml:id="HB.f1r.1"/><l>')
            
    encoded = encoded.replace('[...]', '<gap/>')
    encoded = encoded.replace(' <l> ', '\n<l>')
    
    return encoded

In [117]:
start = """<?xml version="1.0" encoding="UTF-8"?>
<MVN xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Het Heber-Serrurehandschrift</title>
            </titleStmt>
            <publicationStmt>
                <p>Diplomatische, digitale uitgave van handschrift Gent, Universiteitsbibliotheek,
                    1374, in de reeks Middelnederlandse Verzamelhandschriften uit de Nederlanden</p>
            </publicationStmt>
            <sourceDesc>
                <p>De transcriptie is vervaardigd op basis van een fotografische reproductie van het
                    handschrift, die vervolgens in situ is gecollationeerd met het origineel.</p>
            </sourceDesc>
        </fileDesc>
        <encodingDesc>
            <xi:include href="charDecl.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
            <editorialDecl>
                <p>Voor alle andere opmerkingen: zie de inleiding bij de editie.</p>
            </editorialDecl>
        </encodingDesc>
        <profileDesc>
            <handNotes>
                <handNote xml:id="TweedePartie-kopiist">
                    <p>Het hele handschrift is vervaardigd door een enkele kopiist die vermoedelijk
                        ook de rubricator van het geheel is. Deze scribent is door Erik Kwakkel de
                        Tweede Partie-kopiist genoemd (Engelse benaming: Speculum scribe).</p>
                </handNote>
            </handNotes>
        </profileDesc>
    </teiHeader>
    <text xml:id="HBSR">
        <group>
            <text xml:id="Gallen" n="1.1.1">
                <body>
                    <lg>
"""

end = """
</lg>
                </body>
            </text>
        </group>
    </text>
</MVN>
"""

In [118]:
def parse_page(page_num):
    with open(f'/Users/mikekestemont/Dropbox/Heber-Serrure/docx/{page_num}.docx', 'rb') as f:
        document = Document(f)
    
    ln_num = 0
    
    xml_str = f'<pb xml:id="HBSRf{page_num}" n="{page_num}"/>'
    for idx, para in enumerate(document.paragraphs):
        line = ''
        for run in para.runs:
            if run.italic:
                line += f'%{run.text}@'
            else:
                line += run.text

        line = line.strip()
        if not line:
            continue

        if line.startswith('$') and line.endswith('$'):
            line = line[1:-1]
            parsed = parse_line(line, kind='header')
        elif line.startswith('!') and line.endswith('!'):
            line = line[1:-1]
            parsed = parse_line(line, kind='title')
        else:                    
            ln_num_match = re.match(line_number_presence, line)
            if ln_num_match:
                ln_num = int(ln_num_match.group(1))
                line = line_number_presence.sub('', line)
            else:
                try:
                    ln_num += 1
                except TypeError:
                    ln_num = f'???'
            parsed = parse_line(line, line_num=ln_num, page_num=page_num, kind='normal')
        
        parsed = parsed.replace('[', '<unclear>')
        parsed = parsed.replace(']', '</unclear>')
        xml_str += '\n' + parsed
    
    if xml_str.endswith(' <l>'):
        xml_str = xml_str[:-4]
        
    return xml_str

In [119]:
#fns = [os.path.basename(fn).replace('.docx', '') for fn in sorted(glob.glob('docx/*.docx'))[:10]]
#fns = []
#for idx in range(1, 11):
#    fns.extend([f'{idx}r', f'{idx}v'])
fns = ['6r', '6v']

In [120]:
page_strs = []
for page_num in fns:
    print('====', page_num)
    page_strs.append(parse_page(page_num))
    
xml_str = start + '\n'.join(page_strs) + end

with open('tmp.xml', 'w') as f:
    f.write(xml_str)

==== 6r
==== 6v


### TODO
- Small caps (bv. nummers worden nog niet opgevangen)
- Strofenummers worden niet onderscheiden van regelnummers
- Onderlijnde stukken wordne nog niet opgevangen

### Notas en vragen:
- Ook eenregelige letters in rood met accolade
- Toegevoegde woordafbrekingen (#) staan cursief: hebben we dit consequent gedaan?
- : - zou :- moeten worden

- Voor de regelnummering komen we echt in de problemen bij de hoofdstuktitels, die we op aparte lijnen weergeven.
- ```<c type="guide">.</c> moet eigenlijk <pc type="verse">.</pc>```+ zijn, maar het framework laat dit niet toe
- ```<choice><sic></sic><corr><c type="shy">-</c></corr></choice>``` -> wordst niet toegelaten door framework