## Inladen

In [246]:
import re
import os
import glob
import html
import shutil

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, show
output_notebook()

In [250]:
# Two or more consecutive dots; lines matching this are skipped by load_file.
LACUNA = re.compile(r'\.\.+')

# Placeholder used when a witness year cannot be read from the document.
_UNKNOWN = '<UNK>'

# Entities replaced by hand before html.unescape() runs.  `&lt;`/`&gt;` are
# dropped rather than unescaped, so no literal angle brackets are introduced
# into the text handed to the parser (presumably to keep them from being read
# as markup).
_ENTITY_REPLACEMENTS = {
    '&oudpond;': '',
    '&supm;': 'm',
    '&supM;': 'm',
    '&supc;': 'c',
    '&supt;': 't',
    '&supn;': 'n',
    '&sups;': 's',
    '&supd;': 'd',
    '&uring;': 'u',
    '&lt;': '',
    '&gt;': '',
}


def _interp_value(soup, group_type, default=_UNKNOWN):
    """Return the `value` of the first <interp> inside <interpgrp type=group_type>.

    Falls back to `default` when the group, the <interp> element or its
    `value` attribute is absent.
    """
    group = soup.find('interpgrp', {'type': group_type})
    if group is None:
        return default
    interp = group.find('interp')
    if interp is None:
        return default
    return interp.get('value', default)


def load_file(path):
    """Read one XML file and return its metadata and cleaned text lines.

    Parameters
    ----------
    path : str
        Path to an ``.xml`` file.

    Returns
    -------
    dict
        ``id`` (file name with ``.xml`` removed), ``title``, ``author``,
        ``date`` (``"<from>-<to>"``; either side is ``'<UNK>'`` when the
        corresponding witness year is missing), ``size`` (number of kept
        lines) and ``lines`` (the kept lines joined with newlines).

    Raises
    ------
    AttributeError
        If the document has no ``title`` or ``author`` element.
    """
    # NOTE(review): no explicit encoding, so the platform default is used --
    # confirm the corpus encoding and pin it here.
    with open(path) as f:
        xml_text = f.read()

    for entity, replacement in _ENTITY_REPLACEMENTS.items():
        xml_text = xml_text.replace(entity, replacement)
    xml_text = html.unescape(xml_text)

    # NOTE(review): no parser is named, so BeautifulSoup chooses among the
    # installed ones; results may differ between environments -- confirm
    # which parser the analysis was run with and pass it explicitly.
    soup = BeautifulSoup(xml_text)

    data = {}

    # extract metadata:
    data['id'] = os.path.basename(path).replace('.xml', '')
    data['title'] = soup.find('title').text
    data['author'] = soup.find('author').text

    # A missing year yields '<UNK>' (never the string 'None').
    postquem = _interp_value(soup, 'witnessYear_from')
    antequem = _interp_value(soup, 'witnessYear_to')
    data['date'] = f'{postquem}-{antequem}'

    # extract and clean lines: skip empty <l> elements and those containing
    # a lacuna, then keep only letters and whitespace.
    lines = []
    for element in soup.find_all('l'):
        text = element.get_text().strip()
        if not text or LACUNA.search(text):
            continue
        cleaned = ''.join(c for c in text if c.isalpha() or c.isspace()).strip()
        if cleaned:
            lines.append(cleaned)

    data['size'] = len(lines)
    data['lines'] = '\n'.join(lines)

    return data

In [251]:
# Directory that this cell wipes and recreates empty.
outdir = '../data/CMN-TEI/raw'

# Remove the whole tree if it exists; a missing directory is not an error
# (e.g. on a first run).  Any other failure still propagates.
try:
    shutil.rmtree(outdir)
except FileNotFoundError:
    pass

# Recreate the directory.  os.mkdir does not create missing parent
# directories, so '../data/CMN-TEI' must already exist.
os.mkdir(outdir)

In [252]:
# Parse every XML file under rijm/ into a record (see load_file).
# glob.glob() returns paths in arbitrary, OS-dependent order; sorting keeps
# the order of `texts` -- and of everything derived from it -- stable
# between runs and machines.
texts = [load_file(fn) for fn in sorted(glob.glob('../data/CMN-TEI/rijm/*.xml'))]

In [253]:
    # NOTE(review): disabled code, apparently left over from a loop body (it
    # relies on `fn` and on a list `lines`).  It would write, per text line,
    # one token per row followed by a blank line to a .tsv file in `outdir`.
    # Delete it or restore it as a real step.
    #new_fn = os.path.basename(fn).replace('.xml', '.tsv')
    #with open(f'{outdir}/{new_fn}', 'w') as f:
    #    for line in lines:
    #        f.write('\n'.join(line.split()) + '\n\n')

## Rauwe aanpak

In [254]:
# Split every record into its text (one newline-joined string per document)
# and the remaining metadata fields.  Both lists keep the order of `texts`.
lines = [record['lines'] for record in texts]
metadata = [
    {field: value for field, value in record.items() if field != 'lines'}
    for record in texts
]

In [278]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over a vocabulary capped at 5000 terms.  The token pattern
# accepts single-character words as well.
vectorizer = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',
    max_features=5000,
)
tfidf_matrix = vectorizer.fit_transform(lines)
# Convert to a dense array, which is what the PCA step is fed.
X = tfidf_matrix.toarray()
print(X.shape)

(262, 5000)


In [279]:
from sklearn.decomposition import PCA

# Project the TF-IDF matrix onto its first two principal components.
# For a matrix of this size scikit-learn's default svd_solver='auto' may
# select the randomized solver, so the seed is pinned to make the projection
# (and the clustering built on it) reproducible across runs.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X)
print(X_pca.shape)

(262, 2)


In [305]:
from sklearn.cluster import AgglomerativeClustering

# Number of clusters to produce; the colouring cell also uses it to pick the
# palette size.
num_clust = 6

# Cluster the documents on their 2-D PCA coordinates (not on the full
# TF-IDF matrix); `labels` holds one cluster index per document.
clust = AgglomerativeClustering(n_clusters=num_clust)
labels = clust.fit_predict(X_pca)

In [306]:
# Category20 is the palette indexed below, so it must be imported here for
# the cell to run on a fresh kernel.  Accent stays imported in case a later
# cell uses it.
from bokeh.palettes import Accent, Category20

# Category20[n] is a list of n colours (defined for n = 3..20); map each
# document's cluster label to its colour.
palette = Category20[num_clust]
colors = [palette[label] for label in labels]

In [307]:
# Columns for the plot: PCA coordinates and cluster colour first, then the
# per-document metadata.
data_dict = {
    'x': X_pca[:, 0],
    'y': X_pca[:, 1],
    'color': colors,
}
# (column name in the plot data, key in the metadata records)
for column, key in [('Title', 'title'), ('Author', 'author'),
                    ('Date', 'date'), ('Length', 'size')]:
    data_dict[column] = [meta[key] for meta in metadata]
# Natural log of the number of lines; used as the marker size in the plot.
data_dict['loglen'] = [np.log(length) for length in data_dict['Length']]
data_df = pd.DataFrame(data_dict)

title = "Cd-rom Middelnederlands (rijm)"
x_label = "PC 1"
y_label = "PC 2"

In [310]:
# Interactive scatter plot of the documents in PCA space: one marker per
# document, coloured by cluster and sized by log line count.
source = ColumnDataSource(data_dict)

p = figure(
    tools="hover,pan,wheel_zoom,save",
    toolbar_location="above",
    title=title,
    # `plot_width`/`plot_height` were deprecated in Bokeh 2.4 and removed in
    # 3.0; `width`/`height` are the supported names.
    width=800, height=800,
)
p.xaxis.axis_label = x_label
p.yaxis.axis_label = y_label

# "@Name" pulls the value from the column of that name in `source`.
p.hover.tooltips = [("Title", "@Title"),
                    ("Author", "@Author"),
                    ("Date", "@Date")]

# String arguments refer to columns of `source`.
p.scatter('x', 'y', source=source,
          fill_color='color', fill_alpha=0.8,
          line_color=None, size='loglen')
show(p)

In de PCA-plot krijgen we mooi de driehoek van Petersen te zien: lyriek vs. epiek vs. dramatiek.