In [1]:
from itertools import combinations
import pandas as pd
import lxml.etree
import networkx as nx
import numpy as np
np.random.seed(18012023)
import scipy.spatial.distance as scidist
import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the corpus metadata and drop every row flagged with 'x' in the
# 'exclude' column; rows with an empty 'exclude' cell are kept.
metadata = pd.read_excel('../data/metadata_corrected.xlsx')
keep_mask = metadata['exclude'] != 'x'
df = metadata[keep_mask]
df.sample(10)

Unnamed: 0,id,title,author,date,provenance,date_range,genre,subgenre,exclude
265,van_sente_brandane,Van sente Brandane,,1402.5,cdrom-mnl,1380-1425,Epiek,Heiligenleven,
63,florigout_fragm_l,Florigout,,1387.5,cdrom-mnl,1375-1400,Epiek,Ridder,
111,loyhier_en_malaert_fragm_a,Loyhier en Malaert,,1375.0,cdrom-mnl,1350-1400,Epiek,Karel,
86,historie_van_gaver_capeel,Historie van Gaver Capeel,,1510.0,cdrom-mnl,1500-1520,Epiek,Ridder,
167,Reinout van Montalbaen,Reinout van Montalbaen,,1288.0,CG1,1300-1276,Epiek,Karel,
210,spiegel_der_jongers,Spiegel der jongers,Lambertus Goetman,1515.0,cdrom-mnl,1510-1520,Epiek,Didactiek,
209,spel_van_de_antichrist,Spel van de Antichrist,,1425.0,cdrom-mnl,1400-1450,Dramatiek,,
123,madelgijs_fragm_p,Madelgijs,,1387.5,cdrom-mnl,1375-1400,Epiek,Karel,
165,queeste_van_den_grale,Queeste van den Grale,,1325.0,cdrom-mnl,1300-1350,Epiek,Arthur,
238,strofische_gedichten,Strofische gedichten,Hadewijch,1350.0,cdrom-mnl,1340-1360,Lyriek,,


In [3]:
# Build `works`: a mapping from title to the flat list of lemmas found in all
# XML files (one per `id`) that share that title. Lemmas tagged as proper
# nouns ('n(prop)') are left out.
works = {}

for title, group in df.groupby('title'):
    words = []
    for id_ in group['id']:
        try:
            tree = lxml.etree.parse(f'../data/xml/{id_}.xml')
        except OSError:
            # Missing/unreadable file: report it and move on to the next id.
            print(f'- Could not load {id_}')
            continue

        # Use the relative path './/l'. On a tree-level search lxml rewrites a
        # leading '/' to './' anyway, but emits a FutureWarning while doing so.
        for line_node in tree.iterfind('.//l'):
            lemmas = [node.text for node in line_node.iterfind('.//lemma')]
            pos_tags = [node.text for node in line_node.iterfind('.//pos')]

            for lemma, pos_tag in zip(lemmas, pos_tags):
                # An empty <lemma/> or <pos/> element has text None; skip the
                # pair instead of failing on None.split().
                if lemma is None or pos_tag is None:
                    continue
                # Compound annotations are joined with '+'; pair each lemma
                # part with its own POS tag.
                for l, p in zip(lemma.split('+'), pos_tag.split('+')):
                    if p != 'n(prop)':
                        words.append(l)

    works[title] = words

- Could not load ab_recht_ende_averecht
- Could not load alexanders_geesten
- Could not load Alexiuslegende
- Could not load anatomie_van_de_mens
- Could not load antwerps_liedboek
- Could not load arturs_doet
- Could not load aubri_de_borgengoen
- Could not load baghynken_van_parys
- Could not load barlaam_en_josaphat_fragm_go
- Could not load barlaam_en_josaphat_fragm_ge
- Could not load beatrijs
- Could not load bediedenisse_van_der_missen
- Could not load beerte_metten_breden_voeten
- Could not load beginsel_der_oorlogen_van_luyck
- Could not load berlijnse_liederenhandschrift
- Could not load boec_exemplaer
- Could not load boec_van_den_houte
- Could not load dat_boec_vander_wraken
- Could not load Boeve van Hamtone
- Could not load borchgrave_van_couchi_fragm_a
- Could not load borchgrave_van_couchi_fragm_br
- Could not load borchgrave_van_couchi_fragm_dp
- Could not load borchgravinne_van_vergi__1
- Could not load borchgravinne_van_vergi__2
- Could not load boudewijn_van_seborch

- Could not load tien_plaghen
- Could not load roman_van_torec
- Could not load Tristant
- Could not load truwanten
- Could not load tvveeste_musyck_boexken
- Could not load valentijn_en_nameloos_fragm_be
- Could not load valentijn_en_nameloos_fragm_ge
- Could not load van_maskeroen
- Could not load van_saladijn
- Could not load van_den_vii_vroeden_van_binnen_rome
- Could not load van_den_derden_eduwaert
- Could not load van_den_lande_van_over_zee
- Could not load van_den_neghen_besten__kort
- Could not load van_den_neghen_besten__lang
- Could not load van_den_verkeerden_martijn
- Could not load van_den_vijf_vrouden
- Could not load van_den_vos_reynaerde
- Could not load maanzodiologium
- Could not load van_der_wive_wonderlijcheit_lang
- Could not load van_ghevene
- Could not load van_ons_heren_wonden
- Could not load van_sente_brandane
- Could not load van_ses_vaerwen
- Could not load van_smeinscen_lede
- Could not load korte_kroniek_van_brabant_korte_versie_2
- Could not load vanden_

In [4]:
# Cut every work into consecutive, non-overlapping segments of exactly
# `length` lemmas. A trailing remainder shorter than `length` is discarded,
# so works with fewer than `length` lemmas contribute no segment.
length = 3000

titles, texts = [], []

for title, words in works.items():
    # The stop value `len(words) - length + 1` admits every start offset that
    # still leaves a full segment, including the final one when len(words) is
    # an exact multiple of `length` (a strict `<` comparison would drop it).
    for si in range(0, len(words) - length + 1, length):
        titles.append(title)
        texts.append(' '.join(words[si:si + length]))

# NOTE: `df` is rebound here; from this point on it holds one row per segment.
df = pd.DataFrame(zip(titles, texts), columns=('title', 'text'))
df.sample(3)

Unnamed: 0,title,text
7,Theophilus,bedie het zijn veel riezen al leven zij een st...
1,Borchgrave van Couchi,hij van de moeder zijde maar onder de keizer t...
4,Spiegel der jongers,o hemels koning god almachtig vader zoon heili...


In [5]:
# TF-IDF representation of the segments: one row per segment in `df`.
# The token pattern also accepts single-character tokens, and min_df=2 keeps
# only terms that occur in at least two segments.
vec = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    min_df=2,
)
tfidf_sparse = vec.fit_transform(df['text'])
X = tfidf_sparse.toarray()
X.shape

(9, 964)

In [6]:
# Symmetric title-by-title matrix: entry (i, j) is the mean cosine distance
# over all segment pairs drawn from work i and work j. The diagonal stays 0.
titles = sorted(df['title'].unique())

# Select each work's segment rows once, rather than re-masking X for every
# pair of titles inside the loop.
rows_by_title = {t: X[df['title'] == t] for t in titles}

distances = np.zeros((len(titles), len(titles)))
# Enumerating before pairing yields the matrix indices directly and avoids
# repeated linear `titles.index(...)` lookups.
pairs = list(combinations(enumerate(titles), 2))
for (i, t1), (j, t2) in tqdm.tqdm(pairs):
    distance = scidist.cdist(rows_by_title[t1], rows_by_title[t2], metric='cosine').mean()
    distances[i, j] = distance
    distances[j, i] = distance

100%|██████████| 6/6 [00:00<00:00, 1680.30it/s]


In [7]:
# Nearest-neighbour network: connect every work to its closest works. An edge
# found from both endpoints ends up with weight 2, otherwise weight 1.
G = nx.Graph()
top_k = 3  # number of nearest neighbours per work: top-3 (like Eder)
for i, (t1, v) in enumerate(zip(titles, distances)):
    ranked = v.argsort()
    # Drop the work's own index explicitly. Relying on it sorting first breaks
    # when another distance ties with (or falls below) the 0 on the diagonal,
    # which would create a self-loop and lose a real neighbour.
    tops = ranked[ranked != i][:top_k]
    for j in tops:
        t2 = titles[j]
        if G.has_edge(t1, t2):
            G[t1][t2]['weight'] += 1
        else:
            G.add_edge(t1, t2, weight=1)
print(f"N nodes = {G.number_of_nodes()}, N edges = {G.number_of_edges()}")
nx.write_gexf(G, "../network.gexf")

N nodes = 4, N edges = 6
