In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local scripts.parse helpers used further down) are reloaded
# before each cell executes and edits to them are picked up without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

## Extracting the Coding Scheme

In [2]:
import requests
from bs4 import BeautifulSoup as bsoup

coding_scheme_url = 'https://middelnederlands.nl/codering/'

# Without a timeout requests.get() can block forever on an unresponsive
# server; 30 seconds is generous for a single HTML page.
response = requests.get(coding_scheme_url, timeout=30)
# Fail here on an HTTP error status instead of later, when the HTML table
# cannot be found in an error page.
response.raise_for_status()
response.status_code

200

In [3]:
# Parse the downloaded HTML and take the first <table> on the page, which
# is used below as the coding-scheme table.
page_soup = bsoup(response.text, 'lxml')
code_table = page_soup.find('table')

# Column names come from the <th> cells of the table head, in page order.
headers = [th.text for th in code_table.find('thead').find_all('th')]
headers

['Code', 'Vertaling', 'CQL']

In [6]:
# Maps the column names used on the web page to the keys used in the
# stored coding scheme. Order matters: it fixes the key order per entry.
column_to_key = {'Vertaling': 'pos', 'Code': 'code', 'CQL': 'cql'}

# code -> {'pos': ..., 'code': ..., 'cql': ...}, one entry per table row.
code_scheme = {}

for row in code_table.find('tbody').find_all('tr'):
    cell_texts = [cell.text for cell in row.find_all('td')]
    # Pair every header with the cell in the same column position.
    by_header = {header: cell_texts[hi] for hi, header in enumerate(headers)}
    # Keep any column that is not renamed, then append the renamed ones.
    entry = {header: text for header, text in by_header.items()
             if header not in column_to_key}
    for column, key in column_to_key.items():
        entry[key] = by_header[column]
    code_scheme[entry['code']] = entry

code_scheme

{'615': {'pos': 'PronAdv(dem)',
  'code': '615',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '645': {'pos': 'PronAdv(indef)',
  'code': '645',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '635': {'pos': 'PronAdv(inter)',
  'code': '635',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '605': {'pos': 'PronAdv(pers)',
  'code': '605',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '655': {'pos': 'PronAdv(prep)',
  'code': '655',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '625': {'pos': 'PronAdv(rel)',
  'code': '625',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '100': {'pos': 'Adj()', 'code': '100', 'cql': 'pos=ADJ, feat.getal=ev'},
 '101': {'pos': 'Adj(forme)',
  'code': '101',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-e'},
 '104': {'pos': 'Adj(formn)',
  'code': '104',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-n'},
 '105': {'pos': 'Adj(formr)',
  'code': '105',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-r/-re'},
 '102': {'pos': 'Adj(forms)',
  'code': '102',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-s/-th'},
 '

In [7]:
import json

# Persist the scraped coding scheme as a single JSON object keyed by code.
code_scheme_file = '../data/CMNL-code_scheme.json'

serialised_scheme = json.dumps(code_scheme)
with open(code_scheme_file, 'wt') as json_out:
    json_out.write(serialised_scheme)

In [8]:
# Export the coding scheme as TSV as well: the "Parsing Corpus" section
# loads '../data/CMNL-code_scheme.tsv' through read_code_scheme, and this
# export is the only code in the notebook that produces that file.
# NOTE(review): the column layout is the one from the previously
# commented-out export; confirm it is what read_code_scheme expects.
code_scheme_tsv_file = '../data/CMNL-code_scheme.tsv'

# Separate name on purpose: reusing `headers` would overwrite the column
# names scraped from the web page and break a re-run of the cell above.
tsv_columns = ['code', 'pos', 'cql']

try:
    # Mode 'x' creates the file only when it does not exist yet, so an
    # already present (possibly hand-edited) TSV is never overwritten.
    with open(code_scheme_tsv_file, 'xt') as tsv_out:
        tsv_out.write('\t'.join(tsv_columns) + '\n')
        for code in sorted(code_scheme):
            fields = [code_scheme[code][column] for column in tsv_columns]
            tsv_out.write('\t'.join(fields) + '\n')
except FileExistsError:
    pass


In [240]:
import os

# Root directory for the per-document JSON files, one sub-directory per corpus.
charter_dir = '../data/charters'

corpora = ['CGR', 'CRM']

# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the directory is already there, which replaces the
# separate "exists?" check followed by mkdir.
os.makedirs(charter_dir, exist_ok=True)

for corpus in corpora:
    corpus_dir = os.path.join(charter_dir, corpus)
    os.makedirs(corpus_dir, exist_ok=True)



## Parsing Corpus

In [262]:
import json

from scripts.parse import read_code_scheme
from scripts.parse import extend_code_scheme
from scripts.parse import read_docs
from scripts.parse import parse_metadata_line
from scripts.parse import parse_token_line

# Alternative CRM14 source that was assigned here and immediately
# overwritten, kept for reference only: '../data/CRM14OorknrVoorElkeVorm'
crm14_file = '../data/CRM14ZESCORPORA.txt'
cgr13_file = '../data/CGR13.txt'

code_scheme_file = '../data/CMNL-code_scheme.tsv'

# Corpus name -> path of the corpus source file.
corpus_file = {
    'CGR': cgr13_file,
    'CRM': crm14_file
}

code_scheme = read_code_scheme(code_scheme_file)
code_scheme = extend_code_scheme(code_scheme)

# NOTE(review): this cell uses `os` and `charter_dir` from the directory
# setup cell and `make_doc`, which is defined in a cell further down the
# notebook; on a fresh kernel that cell has to be run before this one.

# Running total of source lines over all documents of all corpora.
line_num = 0

for corpus, corpus_path in corpus_file.items():
    # Document ids seen so far in this corpus, to report duplicates.
    doc_ids = set()
    corpus_dir = os.path.join(charter_dir, corpus)
    for di, doc_lines in enumerate(read_docs(corpus_path)):
        # The first line of a document is its metadata line, the remaining
        # lines are token lines.
        header_line = doc_lines[0]
        try:
            metadata = parse_metadata_line(header_line['line'])
        except AttributeError:
            # Show the offending line before the error propagates.
            print(f"{header_line['line_num']}\t{header_line['line']}")
            raise
        # A repeated id is only reported; make_doc derives the file name
        # from doc_id, so the later document overwrites the earlier file.
        if metadata['doc_id'] in doc_ids:
            print('DOUBLE ID:', metadata['doc_id'])
        doc_ids.add(metadata['doc_id'])

        doc_tokens = []
        for line in doc_lines[1:]:
            try:
                token = parse_token_line(line['line'], code_scheme)
                doc_tokens.append(token)
            except (TypeError, ValueError, IndexError) as err:
                print(f'ERROR - {di}\t{line["line_num"]}\t{line["line"]}')
                # Lines with an unexpected number of elements are reported
                # and skipped; every other parse error stops the run.
                if 'unexpected number of elements' not in str(err):
                    raise
        line_num += len(doc_lines)

        filename, content = make_doc(metadata, doc_tokens, corpus_dir)
        with open(filename, 'wt') as fh:
            json.dump(content, fh)
        # print('number of tokens:', len(doc_tokens))



Removing trailing characters from pos_code Punc(lpt)iijpunc(lpw)
ERROR - 190	97549	300 sceventich - - -
Removing trailing characters from pos_code Punc(lpt)iij'hcpunc(lpw)
Removing trailing characters from pos_code Punc(lpt)iij'hcpunc(lpw)
Removing trailing characters from pos_code Punc(lpt)iij'hcpunc(lpw)
Removing trailing characters from pos_code Punc(lpt)iij'hcpunc(lpw)
Removing trailing characters from pos_code Punc(lpt)iijpunc(lpw)
Adding sent_sign field to line willen willen willen willen 214 - -
Adding sent_sign field to line worde worde worde worden 261 - -
Adding sent_sign field to line worde worde worde worden 261 - -
Adding field 7 - to line waer~om_~dat waer~om_~dat waerommedat waaromdat 520 - 5
Removing X from pos_code X101
Adding sent_sign field to line der der der de 475 - -
Removing @ from line @ @ @ _n:sic bedoeld.is.godes? Markup(sic) - - -
Adding missing digit 9 to pos_code 23
Adding sent_sign field to line worde worde worde worden 261 - -
Adding sent_sign field to l

In [206]:
# Display the `metadata` dict currently in the kernel, i.e. the value left
# behind by the most recent run of the corpus parsing loop.
metadata

{'kloeke_letter': 'K',
 'kloeke_number': '720',
 'separator': 'r',
 'year': '1349',
 'serial_number': '01',
 'archive_ref': 'ArchiefPostel.www.bhic.nl',
 'doc_id_prefix': '_o',
 'doc_id': '_o_K720r34901.ArchiefPostel.www.bhic.nl'}

In [251]:
# Replacement text for '&...;' entity tokens, keyed by the token's POS label.
_ENTITY_REPLACEMENT_BY_POS = {
    'Punc(period)': '.',
    'Punc(comma)': ',',
    'Punc(gcomma)': ',',
    'Punc(colon)': ':',
    'Punc(hyph)': '-',
    'Punc(semicolon)': ';',
    'Punc(tilde)': '~',
    'Punc(other)': '_',
    'Punc(unknown)': '_',
    'Misc(unrdbl)': '_',
    'Markup(sep)': '+',
}


def map_token(token, form='orig'):
    """Return the text of ``token`` for ``form``, with entities replaced.

    A value that starts with '&' and ends with ';' is treated as an entity
    and replaced by a single character chosen from the token's POS label;
    any other value is returned unchanged.

    Args:
        token: dict with at least the key ``form``; entity tokens also need
            'orig' and, unless 'orig' is '&unreadable;', 'pos'.
        form: key of the token representation to return.

    Returns:
        The replacement character for an entity token, otherwise
        ``token[form]``.

    Raises:
        ValueError: for an entity token whose POS label has no replacement
            (the token is printed first).
    """
    text = token[form]
    if not (text.startswith('&') and text.endswith(';')):
        return text
    # The unreadable marker is recognised on the original form, whichever
    # form was requested, and before the POS label is consulted.
    if token['orig'] == '&unreadable;':
        return '_'
    pos = token['pos']
    if pos in _ENTITY_REPLACEMENT_BY_POS:
        return _ENTITY_REPLACEMENT_BY_POS[pos]
    print(token)
    raise ValueError('unmapped punctuation')

    
def make_representation(doc_tokens, form='orig'):
    """Render a document's tokens as one space-separated string.

    For ``form == 'pos'`` every token is written as ``orig/lemma/pos``,
    with the orig and lemma parts passed through ``map_token``. For any
    other ``form`` every token is written as ``map_token(token, form)``.
    """
    if form != 'pos':
        return ' '.join(map_token(token, form=form) for token in doc_tokens)
    return ' '.join(
        f"{map_token(token, form='orig')}/{map_token(token, form='lemma')}/{token['pos']}"
        for token in doc_tokens
    )


def make_doc(metadata, doc_tokens, output_dir):
    """Build the output path and JSON content for one document.

    The file name is ``metadata['doc_id']`` with every '/' replaced by '-'
    plus a '.json' suffix, placed in ``output_dir``. The content maps each
    of the forms 'orig', 'lower', 'full' and 'lemma' to the string
    produced by ``make_representation`` for that form.

    Returns:
        A ``(filename, content)`` tuple; nothing is written to disk here.
    """
    safe_doc_id = metadata["doc_id"].replace('/', '-')
    filename = os.path.join(output_dir, f'{safe_doc_id}.json')
    content = {}
    for form in ('orig', 'lower', 'full', 'lemma'):
        content[form] = make_representation(doc_tokens, form)
    return filename, content
    
    
# Try make_doc on the `metadata`, `doc_tokens` and `corpus_dir` values that
# are currently in the kernel (they are assigned by the corpus parsing
# loop); the bare `filename` displays the resulting output path.
filename, content = make_doc(metadata, doc_tokens, corpus_dir)
filename

'../data/charters/CRM/_o_H014p35301.RABruggeOBN.11679-11673.json'

In [217]:
# Display the token dicts currently held in `doc_tokens`, the list that the
# corpus parsing loop fills for each document.
doc_tokens

[{'orig': 'Wj',
  'lower': 'wj',
  'full': 'wj',
  'lemma': 'wij',
  'pos_code': '404',
  'pos': 'Pron(pers,1,plu)',
  'cql': 'pos=VNW, feat.getal=mv, feat.persoon=1',
  'sent_sign': 'start_main_sent'},
 {'orig': 'danel',
  'lower': 'danel',
  'full': 'danel',
  'lemma': 'daniel',
  'pos_code': '020',
  'pos': 'N(sing,prop)',
  'cql': 'pos=N, feat.ntype=eigen',
  'sent_sign': None},
 {'orig': 'die',
  'lower': 'die',
  'full': 'die',
  'lemma': 'de',
  'pos_code': '471',
  'pos': 'Art(def,forme)',
  'cql': 'pos=LID, feat.form=-e',
  'sent_sign': None},
 {'orig': 'Rademeker',
  'lower': 'rademeker',
  'full': 'rademeker',
  'lemma': 'radmaker',
  'pos_code': '020',
  'pos': 'N(sing,prop)',
  'cql': 'pos=N, feat.ntype=eigen',
  'sent_sign': None},
 {'orig': '&duitsekomma;',
  'lower': '&duitsekomma;',
  'full': '&duitsekomma;',
  'lemma': '&duitsekomma;',
  'pos_code': 'Punc(ldk)',
  'pos': 'Punc(gcomma)',
  'cql': '~',
  'sent_sign': None},
 {'orig': 'gherit',
  'lower': 'gherit',
  'fu