In [1]:
%reload_ext autoreload
%autoreload 2

## Extracting the Coding Scheme

In [2]:
import requests
from bs4 import BeautifulSoup as bsoup

coding_scheme_url = 'https://middelnederlands.nl/codering/'

response = requests.get(coding_scheme_url)
response.status_code

200

In [3]:
page_soup = bsoup(response.text, 'lxml')
code_table = page_soup.find('table')
headers = [header_col.text for header_col in code_table.find('thead').find_all('th')]
headers

['Code', 'Vertaling', 'CQL']

In [6]:
code_scheme = {}

for row in code_table.find('tbody').find_all('tr'):
    row_vals = [cell.text for cell in row.find_all('td')]
    # print(row_vals)
    row_json = {header: row_vals[hi] for hi, header in enumerate(headers)}
    row_json['pos'] = row_json['Vertaling']
    row_json['code'] = row_json['Code']
    row_json['cql'] = row_json['CQL']
    del row_json['Code']
    del row_json['Vertaling']
    del row_json['CQL']
    # print(row_json)
    code_scheme[row_json['code']] = row_json

code_scheme

{'615': {'pos': 'PronAdv(dem)',
  'code': '615',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '645': {'pos': 'PronAdv(indef)',
  'code': '645',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '635': {'pos': 'PronAdv(inter)',
  'code': '635',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '605': {'pos': 'PronAdv(pers)',
  'code': '605',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '655': {'pos': 'PronAdv(prep)',
  'code': '655',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '625': {'pos': 'PronAdv(rel)',
  'code': '625',
  'cql': 'pos=BW, feat.form=-r/-re'},
 '100': {'pos': 'Adj()', 'code': '100', 'cql': 'pos=ADJ, feat.getal=ev'},
 '101': {'pos': 'Adj(forme)',
  'code': '101',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-e'},
 '104': {'pos': 'Adj(formn)',
  'code': '104',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-n'},
 '105': {'pos': 'Adj(formr)',
  'code': '105',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-r/-re'},
 '102': {'pos': 'Adj(forms)',
  'code': '102',
  'cql': 'pos=ADJ, feat.getal=ev, feat.form=-s/-th'},
 '

In [7]:
import json

code_scheme_file = '../data/CMNL-code_scheme.json'
with open(code_scheme_file, 'wt') as fh:
    json.dump(code_scheme, fh)

In [8]:
# code_scheme_tsv_file = '../data/CMNL-code_scheme.tsv'

# headers = ['code', 'pos', 'cql']

# with open(code_scheme_tsv_file, 'wt') as fh:
#     fh.write('\t'.join(headers) + '\n')
#     for code in sorted(code_scheme):
#         row = [code_scheme[code][header] for header in headers]
#         fh.write('\t'.join(row) + '\n')


In [240]:
import os

charter_dir = '../data/charters'

corpora = ['CGR', 'CRM']

if os.path.exists(charter_dir) is False:
    os.mkdir(charter_dir)
    
for corpus in corpora:
    corpus_dir = os.path.join(charter_dir, corpus)
    if os.path.exists(corpus_dir) is False:
        os.mkdir(corpus_dir)



## Parsing Corpus

In [286]:
import json

from scripts.parse import read_code_scheme
from scripts.parse import extend_code_scheme
from scripts.parse import read_docs
from scripts.parse import parse_metadata_line
from scripts.parse import parse_token_line

crm14_file = '../data/CRM14OorknrVoorElkeVorm'
crm14_file = '../data/CRM14ZESCORPORA.txt'
cgr13_file = '../data/CGR13.txt'

code_scheme_file = '../data/CMNL-code_scheme.tsv'

corpus_file = {
    'CGR': cgr13_file, 
    'CRM': crm14_file
}



code_scheme = read_code_scheme(code_scheme_file)
code_scheme = extend_code_scheme(code_scheme)

line_num = 0

example = {}

for corpus in corpus_file:
    print(f"INFO Parsing {corpus} file {corpus_file[corpus]}")
    doc_ids = set()
    corpus_dir = os.path.join(charter_dir, corpus)
    for di, doc_lines in enumerate(read_docs(corpus_file[corpus])):
        #print('number of lines:', len(doc_lines))
        #print(doc_lines[:2])
        doc_tokens = []
        try:
            metadata = parse_metadata_line(doc_lines[0])
            if metadata['doc_id'] in doc_ids:
                print('DOUBLE ID:', metadata['doc_id'])
            doc_ids.add(metadata['doc_id'])
        except AttributeError:
            print(f"{doc_lines[0]['num']}\t{doc_lines[0]['text']}")
            raise
        for line in doc_lines[1:]:
            try:
                token = parse_token_line(line, code_scheme)
                if 'unknown' in token['pos']:
                    if token['pos_code'] not in example:
                        example[token['pos_code']] = token
                        if '9' in token['pos_code']:
                            print(f"WARNING line {line['num']} - uncategorisale code in {line['text']}")
                        else:
                            print(f"WARNING line {line['num']} - unknown code in {line['text']}")
                doc_tokens.append(token)
            except (TypeError, ValueError, IndexError) as err:
                print(f'ERROR line {line["num"]} - {line["text"]}')
                if 'unexpected number of elements' in str(err):
                    pass
                else:
                    raise
        line_num += len(doc_lines)
        filename, content = make_doc(metadata, doc_tokens, corpus_dir)
        with open(filename, 'wt') as fh:
            json.dump(content, fh)
        # print('number of tokens:', len(doc_tokens))



INFO Parsing CGR file ../data/CGR13.txt
ERROR line 97549 - 300 sceventich - - -


INFO Parsing CRM file ../data/CRM14ZESCORPORA.txt


ERROR line 426657 - naesten naesten naesten naesten naesten 104 - - -
ERROR line 676887 - er~op er~op~ erop erop 510 0 - - -
ERROR line 732854 - bode bode bode bode 001 - 1X
ERROR line 844197 - daer daer daer daar 625 - ba4 - 6
ERROR line 941731 - ghene ghene ghene11 - - -
ERROR line 946245 - ghene ghene ghene11 - - -
ERROR line 946365 - ghene ghene ghene11 - - 2
ERROR line 949422 - ghene ghene ghene11 - - -
ERROR line 953405 - ghens ghens ghens12 - - -
ERROR line 954513 - gheens gheens gheens12 - - -
ERROR line 956204 - ghene ghene ghene11 - - -
ERROR line 964216 - ghene ghene ghene11 - - -
ERROR line 965122 - ghene ghene ghene11 - - -
ERROR line 973148 - ghene ghene ghene11 - - -
ERROR line 974885 - ghene ghene ghene11 - - -
ERROR line 976596 - ghene ghene ghene11 - - -
ERROR line 977939 - ghene ghene ghene11 - - -
ERROR line 978761 - ghene ghene ghene11 - - -
ERROR line 980201 - ghene ghene ghene11 - - -
ERROR line 981273 - ghene ghene ghene11 - - -
ERROR line 983596 - ghene ghene g

In [206]:
metadata

{'kloeke_letter': 'K',
 'kloeke_number': '720',
 'separator': 'r',
 'year': '1349',
 'serial_number': '01',
 'archive_ref': 'ArchiefPostel.www.bhic.nl',
 'doc_id_prefix': '_o',
 'doc_id': '_o_K720r34901.ArchiefPostel.www.bhic.nl'}

In [265]:
def map_token(token, form='orig'):
    if token[form].startswith('&') and token[form].endswith(';'):
        if token['orig'] == '&unreadable;':
            return '_'
        if token['pos'] == 'Punc(period)':
            return '.'
        elif token['pos'] == 'Punc(comma)':
            return ','
        elif token['pos'] == 'Punc(gcomma)':
            return ','
        elif token['pos'] == 'Punc(colon)':
            return ':'
        elif token['pos'] == 'Punc(hyph)':
            return '-'
        elif token['pos'] == 'Punc(semicolon)':
            return ';'
        elif token['pos'] == 'Punc(tilde)':
            return '~'
        elif token['pos'] == 'Punc(other)':
            return '_'
        elif token['pos'] == 'Punc(unknown)':
            return '_'
        elif token['pos'] == 'Misc(unrdbl)':
            return '_'
        elif token['pos'] == 'Markup(sep)':
            return '+'
        elif token['pos'] == 'Markup(sep)':
            return '+'
        else:
            print(token)
            raise ValueError('unmapped punctuation')
        print(token)
    else:
        return token[form]

    
def make_representation(doc_tokens, form='orig'):
    if form == 'pos':
        pos_tokens = []
        for token in doc_tokens:
            orig = map_token(token, form='orig')
            lemma = map_token(token, form='lemma')
            pos_tokens.append(f"{orig}/{lemma}/{token['pos']}")
        return ' '.join(pos_tokens)
    else:
        return ' '.join([map_token(token, form=form) for token in doc_tokens])


def make_doc(metadata, doc_tokens, output_dir):
    filename = metadata["doc_id"].replace('/', '-')
    filename = os.path.join(output_dir, f'{filename}.json')
    forms = ['orig', 'lower', 'full', 'lemma']
    doc = {
        'metadata': metadata,
        'tokens': doc_tokens,
        'text': {form: make_representation(doc_tokens, form) for form in forms}
    }
    return filename, doc
    
    
filename, doc = make_doc(metadata, doc_tokens, corpus_dir)
filename

'../data/charters/CRM/_o_P041p31901Weckx.deel2.oudestijl.gecollationeerd.json'

In [217]:
doc_tokens

[{'orig': 'Wj',
  'lower': 'wj',
  'full': 'wj',
  'lemma': 'wij',
  'pos_code': '404',
  'pos': 'Pron(pers,1,plu)',
  'cql': 'pos=VNW, feat.getal=mv, feat.persoon=1',
  'sent_sign': 'start_main_sent'},
 {'orig': 'danel',
  'lower': 'danel',
  'full': 'danel',
  'lemma': 'daniel',
  'pos_code': '020',
  'pos': 'N(sing,prop)',
  'cql': 'pos=N, feat.ntype=eigen',
  'sent_sign': None},
 {'orig': 'die',
  'lower': 'die',
  'full': 'die',
  'lemma': 'de',
  'pos_code': '471',
  'pos': 'Art(def,forme)',
  'cql': 'pos=LID, feat.form=-e',
  'sent_sign': None},
 {'orig': 'Rademeker',
  'lower': 'rademeker',
  'full': 'rademeker',
  'lemma': 'radmaker',
  'pos_code': '020',
  'pos': 'N(sing,prop)',
  'cql': 'pos=N, feat.ntype=eigen',
  'sent_sign': None},
 {'orig': '&duitsekomma;',
  'lower': '&duitsekomma;',
  'full': '&duitsekomma;',
  'lemma': '&duitsekomma;',
  'pos_code': 'Punc(ldk)',
  'pos': 'Punc(gcomma)',
  'cql': '~',
  'sent_sign': None},
 {'orig': 'gherit',
  'lower': 'gherit',
  'fu

In [270]:
for code in code_scheme:
    if 'unknown' in code_scheme[code]['pos']:
        print(f"{code}\t{code_scheme[code]['pos']}")

009	N(unknown)
019	N(unknown)
023	N(unknown)
029	N(unknown)
046	N(unknown)
091	N(unknown)
103	Adj(unknown)
109	Adj(unknown)
110	Adj(unknown)
111	Adj(unknown)
114	Adj(unknown)
115	Adj(unknown)
191	Adj(unknown)
202	V(unknown)
209	V(fin,pres,lex,unknown)
219	V(unknown)
239	V(fin,past,aux_cop,unknown)
260	V(unknown)
261	V(unknown)
264	V(unknown)
270	V(unknown)
279	V(unknown)
283	V(unknown)
299	V(unknown)
329	Num(unknown)
373	Num(unknown)
399	Num(unknown)
409	Pron(unknown)
413	Pron(unknown)
419	Pron(unknown)
429	Pron(unknown)
449	Pron(unknown)
459	Pron(unknown)
479	Art(unknown)
483	Art(unknown)
499	Art(unknown)
501	Adv(unknown)
502	Adv(unknown)
504	Adv(unknown)
505	Adv(unknown)
509	Adv(unknown)
511	Adv(unknown)
519	Adv(unknown)
521	Adv(unknown)
524	Adv(unknown)
551	Adv(unknown)
590	Adv(unknown)
600	PronAdv(unknown)
601	PronAdv(unknown)
604	PronAdv(unknown)
610	PronAdv(unknown)
611	PronAdv(unknown)
614	PronAdv(unknown)
620	PronAdv(unknown)
621	PronAdv(unknown)
630	PronAdv(unknown)
641	PronAd

Kloekenummers:
- p = plaats
- r = regio
- a/b/c = in de buurt van



_n = note
_l = nieuwe lijn


ba4 - ea0 = opsplitsing, b komt voor e
ba0 - ea2 = opsplitsing, e komt voor b