In [9]:
from itertools import combinations
import pandas as pd
import lxml.etree
import networkx as nx
import numpy as np
np.random.seed(18012023)
import scipy.spatial.distance as scidist
import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Load the corpus metadata and keep only the rows that are not flagged
# with 'x' in the 'exclude' column.
metadata = pd.read_excel('../data/metadata_corrected.xlsx')
keep = metadata['exclude'] != 'x'
df = metadata[keep]
# Show ten randomly drawn rows as a sanity check.
df.sample(10)

Unnamed: 0,id,title,author,date,provenance,date_range,genre,subgenre,exclude
204,Lutgard K,Sinte Lutgard,,1267.5,CG1,1270-1265,Epiek,Heiligenleven,
63,florigout_fragm_l,Florigout,,1387.5,cdrom-mnl,1375-1400,Epiek,Ridder,
111,loyhier_en_malaert_fragm_a,Loyhier en Malaert,,1375.0,cdrom-mnl,1350-1400,Epiek,Karel,
86,historie_van_gaver_capeel,Historie van Gaver Capeel,,1510.0,cdrom-mnl,1500-1520,Epiek,Ridder,
235,spiegel_historiael__4_velthem__fragm_l,Spiegel historiael (P4 [Velthem]),Lodewijk van Velthem,1350.0,cdrom-mnl,1340-1360,Epiek,Historiografie,
153,ongeidentificeerd_6,Ongeïdentificeerd (6),,1387.5,cdrom-mnl,1375-1400,Epiek,Karel,
211,spiegel_der_sonden,Spiegel der sonden,,1450.0,cdrom-mnl,1440-1460,Epiek,Didactiek,
123,madelgijs_fragm_p,Madelgijs,,1387.5,cdrom-mnl,1375-1400,Epiek,Karel,
167,Reinout van Montalbaen,Reinout van Montalbaen,,1288.0,CG1,1300-1276,Epiek,Karel,
241,tien_plaghen,Tien plaghen ende die tien ghebode,,1400.0,cdrom-mnl,1390-1410,Epiek,Didactiek,


In [11]:
# Collect, for every title, one flat list of lemmas taken from all XML
# files that belong to that title (files are visited in sorted id order).
works = {}

for title, group in df.groupby('title'):
    words = []
    for id_ in sorted(group['id']):
        try:
            tree = lxml.etree.parse(f'../data/xml/{id_}.xml')
        except OSError:
            # Missing/unreadable file: report it and carry on with the rest.
            print(f'- Could not load {id_}')
            continue

        # Drop every <interpolation> element (and its subtree) before
        # reading the lines, so its content never reaches `words`.
        for interpolation in tree.xpath("//interpolation"):
            interpolation.getparent().remove(interpolation)

        # './/l' is the explicit form of what lxml already did for '//l' on
        # an ElementTree (it rewrote the path and emitted a FutureWarning).
        for line_node in tree.iterfind('.//l'):
            lemmas = [node.text for node in line_node.iterfind('.//lemma')]
            pos_tags = [node.text for node in line_node.iterfind('.//pos')]

            # NOTE(review): zip() silently truncates if a line has a different
            # number of <lemma> and <pos> elements -- confirm the data is aligned.
            for lemma, pos_tag in zip(lemmas, pos_tags):
                # An empty <lemma/> or <pos/> has text None; skip it instead
                # of crashing on .split().
                if lemma is None or pos_tag is None:
                    continue
                # Annotations may hold several '+'-joined parts; pair them up
                # and keep every part whose tag is not 'n(prop)'
                # (presumably proper nouns -- TODO confirm against the tagset).
                for part, tag in zip(lemma.split('+'), pos_tag.split('+')):
                    if tag != 'n(prop)':
                        words.append(part)

    works[title] = words

In [4]:
# Cut every work into consecutive, non-overlapping segments of `length`
# lemmas and store them as one row per segment.
length = 3000

titles, texts = [], []

for title, words in works.items():
    # Only complete windows are kept; a shorter trailing remainder is dropped.
    # The `+ 1` keeps the final window when len(words) is an exact multiple
    # of `length` (the previous `ei < len(words)` test discarded it, so e.g.
    # a work of exactly 3000 lemmas produced no segment at all).
    for si in range(0, len(words) - length + 1, length):
        titles.append(title)
        texts.append(' '.join(words[si:si + length]))

# NOTE: this rebinds `df` (previously the metadata table) to the segment table.
df = pd.DataFrame({'title': titles, 'text': texts})
df.sample(3)

Unnamed: 0,title,text
871,Queeste van den Grale,en ontbieden de koning ne doen hij niet een di...
683,Lancelot,hand dat gij het bij uw vromigheid hebben winn...
756,Luiks Diatessaron,hij zijn oog op te hemel waart en zeggen aldus...


In [5]:
# TF-IDF representation of the segments, one row per segment.
# The token pattern also accepts single-character tokens, and min_df=2
# discards terms that occur in fewer than two segments.
vec = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    min_df=2,
)
tfidf = vec.fit_transform(df['text'])
# Densify so the rows can be sliced and handed to scipy's cdist later on.
X = tfidf.toarray()
X.shape

(1623, 16227)

In [6]:
# Symmetric work-by-work matrix holding the mean cosine distance between
# all segment pairs of two works. The diagonal is left at 0.
titles = sorted(df['title'].unique())

# Look up the segment rows of every work once, instead of rebuilding the
# boolean mask (and calling titles.index) for each of the pairs.
title_column = df['title'].to_numpy()
rows_by_title = [np.flatnonzero(title_column == title) for title in titles]

distances = np.zeros((len(titles), len(titles)))
for i, j in tqdm.tqdm(list(combinations(range(len(titles)), 2))):
    A = X[rows_by_title[i]]
    B = X[rows_by_title[j]]
    # Average over every (segment of work i, segment of work j) combination.
    distance = scidist.cdist(A, B, metric='cosine').mean()
    distances[i, j] = distance
    distances[j, i] = distance

100%|██████████| 6670/6670 [00:38<00:00, 171.53it/s]


In [7]:
# Nearest-neighbour network: every work is linked to its closest works;
# an edge's weight counts how many times it was proposed (so it is 2 when
# two works pick each other).
n_neighbors = 3  # top-3 (like Eder)

G = nx.Graph()
for i, (t1, v) in enumerate(zip(titles, distances)):
    # Exclude the work itself by index rather than assuming its zero
    # self-distance sorts first: with another distance of exactly 0 the
    # old `[1:4]` slice could keep the self index and create a self-loop.
    tops = [j for j in v.argsort() if j != i][:n_neighbors]
    for j in tops:
        t2 = titles[j]
        if G.has_edge(t1, t2):
            G[t1][t2]['weight'] += 1
        else:
            G.add_edge(t1, t2, weight=1)
print(f"N nodes = {G.number_of_nodes()}, N edges = {G.number_of_edges()}")
# Export the graph as a GEXF file one directory above the notebook.
nx.write_gexf(G, "../network.gexf")

N nodes = 116, N edges = 312
