# Preprocessing

In [None]:
import re
import os
import glob
import html
import shutil

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Matches two or more consecutive dots; lines containing such a run are
# skipped (presumably the dots mark a lacuna, i.e. a gap in the source text).
LACUNA = re.compile(r'\.\.+')

# Corpus-specific entities, replaced by a plain-text stand-in before the
# remaining (standard) entities are decoded with html.unescape().
# '&lt;' / '&gt;' are dropped so that no stray angle brackets are created.
_ENTITY_REPLACEMENTS = {
    '&oudpond;': '',
    '&supm;': 'm',
    '&supM;': 'm',
    '&supc;': 'c',
    '&supt;': 't',
    '&supn;': 'n',
    '&sups;': 's',
    '&supd;': 'd',
    '&uring;': 'u',
    '&lt;': '',
    '&gt;': '',
}


def _interp_value(soup, group_type, default='<UNK>'):
    """Return the 'value' of the first <interp> inside the <interpgrp> of
    type *group_type*, or *default* when the group, the <interp> element or
    its 'value' attribute is missing."""
    group = soup.find('interpgrp', {'type': group_type})
    if group is None:
        return default
    interp = group.find('interp')
    if interp is None:
        return default
    return interp.get('value', default)


def load_file(path):
    """Parse one XML file into a dict with metadata and cleaned verse lines.

    Parameters
    ----------
    path : str
        Path to the XML file; the file name without '.xml' becomes the id.

    Returns
    -------
    dict
        Keys, in order: 'id', 'title', 'author', 'date' (a 'from-to'
        string), 'provenance' and 'lines' (list of str). Missing date or
        provenance information is represented as '<UNK>'.

    Raises
    ------
    AttributeError
        If the document has no <title> or <author> element.
    """
    # NOTE(review): the file is read with the platform default encoding --
    # confirm the encoding of the corpus files and pass it explicitly.
    with open(path) as f:
        xml_text = f.read()

    for entity, replacement in _ENTITY_REPLACEMENTS.items():
        xml_text = xml_text.replace(entity, replacement)
    xml_text = html.unescape(xml_text)

    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed (lxml first when available).
    soup = BeautifulSoup(xml_text, 'lxml')

    data = {}

    # extract metadata:
    data['id'] = os.path.basename(path).replace('.xml', '')
    data['title'] = soup.find('title').text
    data['author'] = soup.find('author').text

    postquem = _interp_value(soup, 'witnessYear_from')
    antequem = _interp_value(soup, 'witnessYear_to')
    data['date'] = f'{postquem}-{antequem}'

    data['provenance'] = _interp_value(soup, 'corpusProvenance')

    # extract and clean lines:
    lines = []
    for node in soup.find_all('l'):
        text = node.get_text().strip()
        if not text or LACUNA.search(text):
            continue
        # keep letters and whitespace only (drops digits and punctuation)
        cleaned = ''.join(c for c in text if c.isalpha() or c.isspace()).strip()
        if cleaned:
            lines.append(cleaned)

    data['lines'] = lines

    return data

In [None]:
# Parse every XML file of the rhyme corpus with load_file(), showing progress.
texts = [
    load_file(xml_file)
    for xml_file in tqdm(glob.glob('../data/cdrom_rhyme/*.xml'))
]

In [None]:
# Token markup in the CG1 files: '<C code_annotation>word'; only the word
# (third group) is used. Compiled once, outside the per-line loop.
_CG1_TOKEN = re.compile(r'<C ([0-9#\*@\+]+)_([^>]+)>\s*([^ \n\t\r<]+)')

# Append every CG1 file to `texts`, using the same dict layout that
# load_file() produces ('id', 'title', 'date', 'provenance', 'author', 'lines').
for fn in tqdm(glob.glob('../data/cdrom_CG1/*.xml')):
    # NOTE(review): read with the platform default encoding -- confirm the
    # encoding of the CG1 files and pass it explicitly.
    with open(fn) as f:
        text = f.read()

    # metadata
    data = {}
    data['id'] = os.path.basename(fn).replace('.xml', '')
    # [0] raises IndexError when a file lacks the expected header fields.
    data['title'] = re.findall(r'\<bron_oms\>(.*)\<\/bron_oms\>', text)[0]
    # jaar_van ("year from") is the lower bound and jaar_tot ("year to") the
    # upper bound, so the range reads 'from-to' like the one built by load_file().
    postquem = re.findall(r"jaar\_van\=\'([0-9]+)'", text)[0]
    antequem = re.findall(r"jaar\_tot\=\'([0-9]+)'", text)[0]
    data['date'] = f'{postquem}-{antequem}'

    data['provenance'] = 'CG1'
    data['author'] = 'Onbekend'

    lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        line = line.replace('<A >', '').replace('</A>', '')
        # skip lines that are empty after removing the <A> tags or that
        # contain a run of dots (LACUNA)
        if not line or LACUNA.search(line):
            continue

        words = []
        for match in _CG1_TOKEN.findall(line):
            # keep letters (and whitespace) only, as for the rhyme corpus
            word = ''.join(c for c in match[-1] if c.isalpha() or c.isspace()).strip()
            if word:
                words.append(word)
        if words:
            lines.append(' '.join(words))

    data['lines'] = lines

    texts.append(data)

In [None]:
# One metadata record per text: every field except the verse lines themselves.
metadata = [
    {field: value for field, value in text.items() if field != 'lines'}
    for text in texts
]

In [None]:
# Metadata table with one row per text, indexed by the text id.
mdf = pd.DataFrame(metadata).set_index('id')
mdf

Clean up author labels:

In [None]:
import numpy as np

# Both placeholder labels mean "no known author"; store them as missing values.
mdf['author'] = mdf['author'].replace(['Niet van toepassing', 'Onbekend'], np.nan)
# Normalise the uncertain attribution 'Jacob van Maerlant?' to the '(?)'
# spelling. This is a plain substring replacement, so regex=False is passed
# explicitly: the default of `regex` differs between pandas versions, and
# the previous escaped pattern only matched when it was treated as a regex.
mdf['author'] = mdf['author'].str.replace(
    'Jacob van Maerlant?', 'Jacob van Maerlant(?)', regex=False)
mdf['author'].value_counts(dropna=True)

Set some (uninformative) values for missing dates:

In [None]:
# Hand-assigned 'from-to' date ranges, keyed by text id.
manual_date_ranges = {
    'jan_splinters_testament': '1550-1550',
    'borchgrave_van_couchi_fragm_dp': '1300-1325',
    'grimbergse_oorlog': '1300-1350',
}
for text_id, date_range in manual_date_ranges.items():
    mdf.loc[text_id, 'date'] = date_range

In [None]:
# Keep the original 'from-to' string in 'date_range' and replace 'date' by
# the numeric midpoint of that range.
mdf['date_range'] = mdf['date']
midpoints = []
for date_range in mdf['date_range']:
    start, end = (int(year) for year in date_range.split('-'))
    midpoints.append(start + ((end - start) / 2))
mdf['date'] = midpoints

In [None]:
# Plot a kernel density estimate of the 'date' column. In a notebook the
# trailing semicolon keeps the call's return value from being echoed
# below the cell.
mdf['date'].plot.kde();

In [None]:
# Add empty genre columns and export the table (with header and index) to Excel.
for empty_column in ('genre', 'subgenre'):
    mdf[empty_column] = None
mdf.to_excel('../data/metadata_extract.xlsx', index=True, header=True)

## Enrich using PIE

Taken from the [PIE NLP Taggers documentation](https://github.com/hipster-philology/nlp-pie-taggers):

In [None]:
from typing import List
from pie_extended.cli.utils import get_tagger, get_model, download
import lxml.etree

In [None]:
do_download = False # set to True if necessary
if do_download:
    # download() is iterated to completion; the items it yields are not
    # needed here (presumably progress information -- verify against the
    # pie_extended documentation).
    for _ in download("dum"):
        pass

In [None]:
# Start from an empty output directory: remove any existing one (with all
# of its contents) and create it again.
xml_path = '../data/xml'
try:
    shutil.rmtree(xml_path)
except FileNotFoundError:
    # nothing to remove: the directory does not exist yet
    pass
os.mkdir(xml_path)

In [None]:
from pie_extended.models.dum.imports import get_iterator_and_processor
from collections import defaultdict

iterator, processor = get_iterator_and_processor()
tagger = get_tagger('dum', batch_size=1024, device="cpu", model_path=None)

# Tag every text line by line and write one XML file per text:
#   <text id=...>
#     <l n=... tokens=...>
#       <w><form/><lemma/><pos/></w> ...
for text in tqdm(texts):
    title = text['id']
    root = lxml.etree.Element('text')
    root.attrib['id'] = title

    for nb, line in enumerate(text['lines'], start=1):
        l_node = lxml.etree.SubElement(root, 'l')
        l_node.attrib['n'] = str(nb)
        # 'tokens' keeps the line in its original casing; the tagger
        # receives the lowercased line
        l_node.attrib['tokens'] = line

        for w in tagger.tag_str(line.lower(), iterator=iterator, processor=processor):
            w_node = lxml.etree.SubElement(l_node, 'w')
            for tag in ('form', 'lemma', 'pos'):
                lxml.etree.SubElement(w_node, tag).text = w[tag]

    # Write the serialized bytes unchanged: text mode would re-encode them
    # with the platform default encoding, which may not match the UTF-8
    # named in the XML declaration.
    with open(f'{xml_path}/{title}.xml', 'wb') as f:
        f.write(lxml.etree.tostring(root, xml_declaration=True,
                                    pretty_print=True, encoding='utf-8'))
    