In [1]:
import glob
import os
from collections import Counter
from itertools import combinations, product

import numpy as np
np.random.seed(18012023)

import pandas as pd
pd.set_option('display.max_colwidth', 0)

import seaborn as sb

from sklearn.metrics import pairwise_distances
import lxml.etree
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import matplotlib
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.family'] = 'Arial'
from IPython.display import display

In /Users/mikekestemont/anaconda3/envs/n36/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/mikekestemont/anaconda3/envs/n36/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/mikekestemont/anaconda3/envs/n36/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /Users/mikekestemont/anaconda3/envs/n36/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases

In [2]:
# Directory that receives every figure / spreadsheet written by this notebook.
fig_dir = '../figures'
# `makedirs(..., exist_ok=True)` replaces the check-then-create pair: it is a
# no-op when the directory already exists, creates missing parent directories,
# and cannot fail because the directory appeared between the check and the call.
os.makedirs(fig_dir, exist_ok=True)

#### Load metadata

In [3]:
# Load the metadata spreadsheet that describes the corpus.
meta_df = pd.read_excel('../data/metadata_corrected.xlsx')
# Keep only the rows that are NOT flagged with 'x' in the `exclude` column.
meta_df = meta_df[meta_df['exclude'] != 'x']
# Display 10 randomly drawn rows as a quick sanity check.
meta_df.sample(10)

Unnamed: 0,id,title,author,date,provenance,date_range,genre,subgenre,exclude
266,van_ses_vaerwen,Van ses vaerwen ende twaelf outheyden,,1410.0,cdrom-mnl,1400-1420,Epiek,Didactiek,
63,florigout_fragm_l,Florigout,,1387.5,cdrom-mnl,1375-1400,Epiek,Ridder,
111,loyhier_en_malaert_fragm_a,Loyhier en Malaert,,1375.0,cdrom-mnl,1350-1400,Epiek,Karel,
86,historie_van_gaver_capeel,Historie van Gaver Capeel,,1510.0,cdrom-mnl,1500-1520,Epiek,Ridder,
168,renout_van_montalbaen_fragm_be,Renout van Montalbaen,,1350.0,cdrom-mnl,1340-1360,Epiek,Karel,
211,spiegel_der_sonden,Spiegel der sonden,,1450.0,cdrom-mnl,1440-1460,Epiek,Didactiek,
210,spiegel_der_jongers,Spiegel der jongers,Lambertus Goetman,1515.0,cdrom-mnl,1510-1520,Epiek,Didactiek,
123,madelgijs_fragm_p,Madelgijs,,1387.5,cdrom-mnl,1375-1400,Epiek,Karel,
166,reinaerts_historie,Reinaerts historie,,1470.0,cdrom-mnl,1460-1480,Epiek,Dier,
239,suverlijc_boecxken,Suverlijc boecxken,,1508.0,cdrom-mnl,1508-1508,Lyriek,,


In [4]:
def get_verse_groups(verses, size=2, intertexts=False):
    """Slide a window of `size` consecutive items over `verses`.

    One value is yielded per window position, i.e. ``len(verses) - size + 1``
    values in total (none when `verses` is shorter than `size`).

    Parameters
    ----------
    verses : sequence
        Verse strings, or (when `intertexts` is true) per-verse intertext
        labels, where ``None`` marks a verse without a label.
    size : int
        Number of consecutive verses per window.
    intertexts : bool
        Selects what is yielded for each window:

        * ``False`` -- the verses of the window joined with ``' / '``.
        * ``True`` -- a single label for the window: ``None`` if any verse
          in it is unlabelled, ``'overlap'`` if the verses carry more than
          one distinct label, otherwise the one label they all share.
    """
    n_windows = len(verses) - (size - 1)
    for start in range(n_windows):
        window = verses[start:start + size]

        if not intertexts:
            yield ' / '.join(window)
            continue

        labels = set(window)
        if None in labels:
            yield None
        elif len(labels) > 1:
            yield 'overlap'
        else:
            yield tuple(labels)[0]

In [5]:
def parse_xml(fn, rm_interpol=False):
    """Lazily parse one XML file and yield its verse lines.

    This is a generator: nothing (including the "could not load" message)
    happens until the caller starts iterating.

    Parameters
    ----------
    fn : str
        Path of the XML file to parse.
    rm_interpol : bool
        When true, every ``<interpolation>`` element is detached from the
        tree before the lines are read.

    Yields
    ------
    tuple ``(tokens, lemmas, intertext_id)`` for every ``<l>`` element:

    * ``tokens`` -- the whitespace-split ``tokens`` attribute of the line;
    * ``lemmas`` -- the text of the line's ``<lemma>`` elements, each split on
      ``'+'`` in parallel with the matching ``<pos>`` text; any part whose
      POS is exactly ``'n(prop)'`` is replaced by the literal ``'n(prop)'``;
    * ``intertext_id`` -- the ``intertext`` attribute, or ``None`` if absent.

    If the file cannot be opened (``OSError``) a message is printed and the
    generator finishes without yielding anything.
    """
    try:
        tree = lxml.etree.parse(fn)
    except OSError:
        print(f'- Could not load {fn}')
        return None

    if rm_interpol:
        for node in tree.xpath("//interpolation"):
            node.getparent().remove(node)

    for line_node in tree.iterfind('//l'):
        # A missing `intertext` attribute simply means "no label".
        intertext_id = line_node.attrib.get('intertext')
        tokens_ = line_node.attrib['tokens'].split()

        lemma_texts = [node.text for node in line_node.iterfind('.//lemma')]
        pos_texts = [node.text for node in line_node.iterfind('.//pos')]

        # Compound annotations are '+'-separated; walk lemma and POS parts
        # side by side so each part can be checked against its own POS tag.
        lemmas_ = [
            'n(prop)' if part_pos == 'n(prop)' else part_lemma
            for lemma, pos in zip(lemma_texts, pos_texts)
            for part_lemma, part_pos in zip(lemma.split('+'), pos.split('+'))
        ]

        yield tokens_, lemmas_, intertext_id

In [7]:
# Number of consecutive verses that make up one comparison unit.
GROUP_SIZE = 2

# Parallel lists, one entry per verse group across the whole corpus; they are
# later zipped into the columns of `df`.
titles, tokens, lemmas, intertexts = [], [], [], []

for title, group in tqdm(meta_df.groupby('title')):
    # Verse-level material for this title, accumulated over all of its files.
    work_tokens, work_lemmas, work_intertexts = [], [], []
    
    # A title can map to several ids (one XML file each); they are read in
    # sorted id order and concatenated.
    for id_ in sorted(group['id']):
        for tok, lem, intertext_id in parse_xml(f'../data/xml/{id_}.xml', rm_interpol=True):
            work_tokens.append(tok)
            work_lemmas.append(lem)
            work_intertexts.append(intertext_id)
    
    # Collapse each verse's token / lemma list into one space-separated string.
    verse_tokens = [' '.join(v) for v in work_tokens]
    verse_lemmas = [' '.join(v) for v in work_lemmas]

    # NOTE: the sliding window runs over the concatenation of all files of the
    # title, so a group can straddle the boundary between two files.
    verse_group_tokens = list(get_verse_groups(verse_tokens, size=GROUP_SIZE))
    verse_group_lemmas = list(get_verse_groups(verse_lemmas, size=GROUP_SIZE))
    verse_group_intertexts = list(get_verse_groups(work_intertexts, size=GROUP_SIZE, intertexts=True))

    tokens.extend(verse_group_tokens)
    lemmas.extend(verse_group_lemmas)
    intertexts.extend(verse_group_intertexts)
    # Repeat the title once per verse group so all four lists stay aligned.
    titles.extend([title] * len(verse_group_lemmas))

100%|██████████| 205/205 [00:33<00:00,  6.09it/s]


In [8]:
# One row per verse group: its title, surface tokens, lemmas and intertext label.
df = pd.DataFrame(zip(titles, tokens, lemmas, intertexts), columns=('title', 'tokens', 'lemmas', 'intertext'))

In [9]:
def tokenizer(text):
    """Split a verse-group string into lower-cased, whitespace-separated tokens.

    The ``' / '`` marker that separates the verses of a group is replaced by a
    single space first, so it never shows up as a token of its own.
    """
    without_separator = text.replace(' / ', ' ')
    normalised = without_separator.lower().strip()
    return normalised.split()

def add_rhyme_column(df):
    """Add a ``'rhyme'`` column holding the last lemma of every verse in a group.

    Each value of ``df['lemmas']`` is a verse group whose verses are separated
    by ``' / '``. For every verse the final whitespace-separated lemma is
    taken, and the per-verse results are joined with a single space.

    A verse without any lemma (an empty or whitespace-only segment) has no
    final word; it contributes nothing to the rhyme string instead of raising
    ``IndexError``. A group made up solely of such verses gets ``''``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``'lemmas'``.

    Returns
    -------
    pandas.DataFrame
        The *same* object that was passed in: the column is added in place and
        the frame is returned for convenience.
    """
    rhyme_words = []
    for lemmas in df['lemmas']:
        rhymes = []
        for verse in lemmas.split(' / '):
            words = verse.split()
            if words:  # skip verses that carry no lemma at all
                rhymes.append(words[-1])
        rhyme_words.append(' '.join(rhymes))
    df['rhyme'] = rhyme_words
    return df

In [10]:
# Attach the rhyme words (last lemma of each verse) to every verse group.
df = add_rhyme_column(df)
# Preview the first rows to check the new `rhyme` column.
df.head()

Unnamed: 0,title,tokens,lemmas,intertext,rhyme
0,AB recht ende averecht,Aensiet dese vrouwen hoe si gaen / Besiet hoe haer tuten staen,aanzien deze vrouw hoe zij gaan / bezien hoe zij de staan,,gaan staan
1,AB recht ende averecht,Besiet hoe haer tuten staen / Claer dat si hen blanketten,bezien hoe zij de staan / klaar dat zij zij n(prop),,staan n(prop)
2,AB recht ende averecht,Claer dat si hen blanketten / Die cleeder soe lanc dat si hen letten,klaar dat zij zij n(prop) / de kleed zo lang dat zij zij letten,,n(prop) letten
3,AB recht ende averecht,Die cleeder soe lanc dat si hen letten / Ende sleypen hen nae al op die eerde,de kleed zo lang dat zij zij letten / en slapen zij na al op de aarde,,letten aarde
4,AB recht ende averecht,Ende sleypen hen nae al op die eerde / Fi diere vuylder hoverde,en slapen zij na al op de aarde / fi duur vouwer hovaardij,,aarde hovaardij


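We fix the following hyperparameters: the vocabulary size of the vectorizer, the weight of the rhyme words, and the distance threshold below which a nearest neighbour counts as a hit: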

In [11]:
# Vocabulary size: passed as `max_features` to the TF-IDF vectorizer.
optim_vs = 7000
# Rhyme weight: factor applied to the rhyme-word vectors before they are added
# to the lemma vectors in `pairwise`.
optim_rw = 0.1693877551020408
# Distance threshold: a nearest-neighbour distance at or below this value is
# counted as a hit in the title-by-title comparison.
optim_th = 0.42202047865566794

In [12]:
def batch(iterable, n=1):
    """Yield consecutive row slices of `iterable`, each at most `n` rows long.

    `iterable` must expose ``.shape[0]`` and support slicing along its first
    axis. The final slice is shorter when the number of rows is not a multiple
    of `n`; nothing is yielded for an object with zero rows.
    """
    total = iterable.shape[0]
    yield from (
        iterable[start:min(start + n, total)]
        for start in range(0, total, n)
    )

def pairwise(title1, title2, rhyme_weight=.25, batch_size=5000):
    """Cosine distances between all verse groups of two titles.

    Reads the module-level frame `df` and the fitted vectorizer `vec`. Every
    verse group is represented by the vector of its lemmas plus
    `rhyme_weight` times the vector of its rhyme words.

    Parameters
    ----------
    title1, title2 : str
        Values of ``df['title']`` selecting the two texts.
    rhyme_weight : float
        Multiplier applied to the rhyme-word vectors before they are added to
        the lemma vectors.
    batch_size : int
        Number of `title1` rows handed to ``pairwise_distances`` per call;
        bounds the size of each intermediate distance block.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n1, n2)`` where ``n1`` / ``n2`` are the numbers of
        verse groups of `title1` / `title2`. When either title has no verse
        group in `df`, an empty array of that shape is returned (previously
        the function returned ``None`` or handed empty input on to the
        vectorizer / distance routine).
    """
    A = df[df['title'] == title1]
    B = df[df['title'] == title2]

    # Nothing to compare: answer with a correctly shaped empty matrix before
    # touching the vectorizer or the distance routine with empty input.
    if A.empty or B.empty:
        return np.empty((len(A), len(B)))

    AX = vec.transform(A['lemmas']) + rhyme_weight * vec.transform(A['rhyme'])
    BX = vec.transform(B['lemmas']) + rhyme_weight * vec.transform(B['rhyme'])

    # Collect the per-batch blocks and stack them once; appending to a growing
    # array inside the loop would re-copy everything computed so far each time.
    blocks = [pairwise_distances(ax, BX, metric='cosine')
              for ax in batch(AX, batch_size)]

    return np.concatenate(blocks, axis=0)

In [13]:
def retrieve_intertexts(title1, title2, distances, threshold):
    """List every verse-group pair whose distance is strictly below `threshold`.

    Reads the module-level frame `df`. Row ``i`` of `distances` is matched to
    the ``i``-th verse group of `title1` and column ``j`` to the ``j``-th verse
    group of `title2`, both in `df` order.

    Returns a DataFrame with the columns ``title1``, ``title2``, ``tokens1``,
    ``tokens2`` and ``distance``, one row per matching pair, ordered by row and
    then by column of `distances`.
    """
    left = df[df['title'] == title1]
    right = df[df['title'] == title2]

    matches = [
        (title1,
         title2,
         left.iloc[row]['tokens'],
         right.iloc[col]['tokens'],
         distances[row, col])
        for row, col in np.argwhere(distances < threshold)
    ]

    column_names = ['title1', 'title2', 'tokens1', 'tokens2', 'distance']
    return pd.DataFrame(matches, columns=column_names)

def plot_min_distance_nn(t1, t2, min_dists, threshold, rolling_size=1000):
    """Plot the rolling mean of nearest-neighbour distances against a threshold.

    Parameters
    ----------
    t1, t2 : str
        Labels used for the x axis (`t1`) and inside the y-axis label (`t2`).
    min_dists : pandas.Series
        Distances to smooth; a rolling mean over `rolling_size` values is drawn.
    threshold : float
        Height of the dashed grey horizontal reference line.
    rolling_size : int
        Window length of the rolling mean.

    Returns
    -------
    The matplotlib Axes of the newly created figure; its y axis is fixed to
    the range 0-1.
    """
    smoothed = min_dists.rolling(rolling_size).mean()

    fig, ax = plt.subplots()
    smoothed.plot(ax=ax)
    ax.axhline(threshold, ls='--', c='grey')

    ax.set_ylabel('Distance to NN in ' + t2)
    ax.set_xlabel(t1)
    ax.set_ylim(0, 1)
    return ax

In [14]:
# Unique titles whose genre is 'Epiek'. Note that the filter is on `genre`
# only; no subgenre is selected here despite the variable name.
chiv_titles = meta_df[meta_df['genre'] == 'Epiek']['title'].unique()

In [15]:
# TF-IDF vectorizer fitted on the lemmatised verse groups of the whole `df`.
# The vocabulary is capped at `optim_vs` features and terms occurring in fewer
# than 2 verse groups are dropped. `token_pattern=None` because the custom
# `tokenizer` is used instead of the default regex. The same fitted vectorizer
# is reused in `pairwise` to transform the `rhyme` column.
vec = TfidfVectorizer(max_features=optim_vs, min_df=2,
                      tokenizer=tokenizer, token_pattern=None).fit(df['lemmas'])

In [None]:
# Compare every ordered pair of distinct titles: for each verse group of t1,
# find its nearest neighbour in t2 and count how many of those nearest-neighbour
# distances are at or below `optim_th`. Both (t1, t2) and (t2, t1) are visited
# because the count is taken from t1's side.
#
# NOTE: this rebinds `intertexts`, the list the corpus-building cell fills and
# `df` is built from; re-running the `df` cell after this one needs that cell
# to be re-run first.
results, intertexts = [], []

select_titles = chiv_titles#[:5]

# Loop-invariant lookups, computed once instead of re-filtering the frames for
# every pair: the first metadata row of each title and the number of verse
# groups each title has in `df`.
meta_by_title = meta_df.drop_duplicates(subset='title').set_index('title')
n_groups_by_title = df['title'].value_counts()

for t1, t2 in tqdm(list(product(select_titles, select_titles))):
    if t1 == t2:
        continue

    # author
    a1 = meta_by_title.at[t1, 'author']
    a2 = meta_by_title.at[t2, 'author']

    # genre
    g1 = meta_by_title.at[t1, 'genre']
    g2 = meta_by_title.at[t2, 'genre']

    # subgenre
    sg1 = meta_by_title.at[t1, 'subgenre']
    sg2 = meta_by_title.at[t2, 'subgenre']

    # number of verse groups per title (0 when the title never made it into df)
    l1 = int(n_groups_by_title.get(t1, 0))
    l2 = int(n_groups_by_title.get(t2, 0))

    if l1 == 0 or l2 == 0:
        # A title whose XML could not be loaded, or that is shorter than one
        # verse group, has nothing to compare. Record the pair with 0 hits
        # rather than letting the whole run crash part-way through.
        hits = 0
    else:
        # get NNs for statistics
        distances = pairwise(t1, t2, rhyme_weight=optim_rw, batch_size=10000)
        NNs = pd.Series(distances.min(axis=1))
        hits = int(np.sum(NNs <= optim_th))

        # collect actual intertexts (disabled)
        #intertexts.append(retrieve_intertexts(t1, t2, distances=distances, threshold=optim_th))

    results.append((t1, t2, l1, l2, hits, g1, g2, sg1, sg2, a1, a2))

#intertexts = pd.concat(intertexts)
#intertexts.to_excel(os.path.join(fig_dir, 'intertexts.xlsx'), index=False)

results = pd.DataFrame(results, columns=['t1', 't2', 'l1', 'l2', 'hits', 'genre1', 'genre2',
                                         'subgenre1', 'subgenre2',
                                         'author1', 'author2'])
# Hits divided by the number of verse-group pairs; NaN where a title is empty.
results['hit_ratio'] = results['hits'] / (results['l1'] * results['l2'])
# Written into the figure directory set up at the top of the notebook.
results.to_excel(os.path.join(fig_dir, 'statistics.xlsx'), index=False)
results

  0%|          | 77/25921 [02:06<12:26:09,  1.73s/it]