In [1]:
from copy import deepcopy
import os
import math
import shutil
import re
from glob import glob
from collections import defaultdict, Counter

import numpy as np
from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)

import rispy
# Work on a private copy of rispy's default tag-to-key table, so the library's
# module-level mapping is left untouched, then (re)bind the tags this notebook uses.
mapping = deepcopy(rispy.TAG_KEY_MAPPING)
mapping.update({
    'M2': 'extra',
    'AB': 'orig_abstract',
    'CN': 'call_number',
    'CH': 'chapters',
    'AS': 'articles',
})

In [2]:
# Directory holding the CSV export of the database; the cells below read
# `publications.csv` and `publication_publications.csv` from it.
dump_dir = '../data/bntl-db-csv-dump-4March2024'

### Recensies

*Add to reviews which publication they review (under "RI" = reviewed item).*

We first load the original data table for all publications in the original dump:

In [3]:
# Read the publications table; the two timestamp columns are parsed as datetimes
# and missing ids are replaced by an empty string.
publications = (
    pd.read_csv(
        f"{dump_dir}/publications.csv",
        header=0,
        parse_dates=['creation_date', 'modification_date'],
    )
    .assign(id=lambda frame: frame['id'].fillna(''))
)
# Show a few random rows as a sanity check.
publications.sample(3)

Unnamed: 0,id,ppn,type,jaar,title_description,abstract,url,containing_publication_id,containing_publication_ppn,page_numbers,creator,creation_date,modifier,modification_date,canonical_url
294957,97752,,recensie,1993,"Schellenbach, C.J.\r\nIn: Van taal tot taal (Hilversum): 37 (1993) 3 (sep) 136-138.",,,,,,1,NaT,16.0,2008-08-12 16:26:02,
12525,161673,,recensie,1954,Recensie van de oorspronkelijke uitgave door J. J. Klant. \r\nIn: Critisch bulletin: 21 (1954) 3 (mrt) 124-128.,,,,,,1,NaT,19.0,2008-05-06 12:09:16,
99723,180945,862446910.0,artikeltijdschrift,1970,"'Pas op' : de toneelschool begint weer / Dimitri Frenkel Frank.\nIn: Avenue Amst.: (1970), afl. 8 (aug), pag. 49.",,,157888.0,830503064.0,"(1970), afl. 8 (aug), p. 49.",1,NaT,,NaT,


We are going to extract the title descriptions to include them in the RIS dump, so we must make sure that they are clean:

In [4]:
# Collapse every run of whitespace (including the \r\n breaks present in the raw dump)
# into a single space, so that title descriptions can be used as exact-match lookup keys.
publications['title_description'] = publications['title_description'].map(
    lambda description: ' '.join(description.split())
)

We extract a review's title description and check to which (source) id this review was mapped:

In [5]:
# Display the full row(s) whose title description matches exactly.
# Another description that can be looked up in the same way (selecting only the 'id' column):
#   'Kestemont, Mike. Eind goed, al goed? In: Queeste: 18 (2011) 1, 81-84.'
publications.loc[
    publications['title_description'] == 'Een waaier van emoties. Nijmegen: Vantilt. 23 p. Speciaal nummer van: Filter (Nijmegen): 25 (2018) 4 (dec) 3-26.'
]


Unnamed: 0,id,ppn,type,jaar,title_description,abstract,url,containing_publication_id,containing_publication_ppn,page_numbers,creator,creation_date,modifier,modification_date,canonical_url
2973,314016,,specialetijdschriftaflevering,2018,Een waaier van emoties. Nijmegen: Vantilt. 23 p. Speciaal nummer van: Filter (Nijmegen): 25 (2018) 4 (dec) 3-26.,"P. 3 Inleiding, door de redactie; p. 26 Noten; p. 26 Bibliografie ""Koudwatervrees"".",,,,,451,2023-12-06 09:35:47,451.0,2023-12-06 10:03:06,een_waaier_van_emoties_nijmege/n_v_t


Next, we look up the (target) ID of the publication that was reviewed:

In [6]:
# Print the id of every row whose title description equals that of the reviewed book.
print(
    publications.loc[
        publications['title_description'] == "Draux, Roland. Beatrijs' biecht: stilistisch en semiotisch onderzoek. Köln: Lambert Academic Publishing, 2009. 253 p.",
        'id',
    ]
)

181127    273488
Name: id, dtype: int64


We now create three lookup dicts:
- `title2id`: to map a title description to a publication ID;
- `title2ptypes`: to map a publication's title description to its publication type(s);
- `id2title`: to map a publication ID to a title description.

In [7]:
# Title description -> publication id. A dict holds one value per key, so when the same
# description occurs on several rows the id of the last such row is the one that is kept.
title2id = {
    description: pub_id
    for description, pub_id in zip(publications['title_description'], publications['id'])
}
# Publication id -> title description (the reverse direction).
id2title = {
    pub_id: description
    for pub_id, description in zip(publications['id'], publications['title_description'])
}

In [8]:
# Title description -> set of publication types recorded for that description
# (a set, so that a description occurring on several rows keeps all of its types).
title2ptypes = defaultdict(set)
for description, ptype in publications[['title_description', 'type']].itertuples(index=False, name=None):
    title2ptypes[description].add(ptype)

We now create a linkage dict (`reference2publication`) that maps the ID of a referring publication (e.g. a review) to the ID of the publication it refers to (e.g. the reviewed book):

In [9]:
# Link table: the two columns used below are `publication_id` (the referring publication)
# and `referenced_publication_id` (the publication it points to).
references = pd.read_csv(f'{dump_dir}/publication_publications.csv')
# Referring id -> referenced id. A dict holds one value per key, so if a publication
# appears on several rows only the referenced id of its last row is kept.
reference2publication = {
    source_id: target_id
    for source_id, target_id in zip(references['publication_id'], references['referenced_publication_id'])
}

Now we build the inverse lookup, so that books can be mapped to a list of their reviews:

In [10]:
# Inverse lookup: referenced publication id -> ids of the publications that refer to it.
# The rows of `references` are used directly instead of the `reference2publication` dict:
# that dict keeps a single target per referring publication, so inverting it would silently
# drop all but one link of any publication that refers to several others (for instance a
# review that discusses two books).
publication2references = defaultdict(list)
link_rows = references[['publication_id', 'referenced_publication_id']].drop_duplicates()
for review_id, book_id in link_rows.itertuples(index=False, name=None):
    publication2references[book_id].append(review_id)

def _linked_titles(pub_title, ptype):
    """Collect the titles of publications of type `ptype` that refer to `pub_title`.

    Parameters
    ----------
    pub_title : str
        Title description of the publication whose incoming links are wanted.
    ptype : str
        Publication type that a referring publication must have (looked up by its
        title description in `title2ptypes`) in order to be included.

    Returns
    -------
    str or None
        The matching title descriptions joined with ' | ', or None when the title
        is unknown or nothing matches.
    """
    if pub_title not in title2id:
        return None
    pub_id = title2id[pub_title]

    matches = []
    # .get() rather than indexing: indexing a defaultdict inserts an empty entry
    # for every key that is merely looked up.
    for linked_id in publication2references.get(pub_id, ()):
        if linked_id not in id2title:
            # Link to an id that is absent from the publications table: skip it.
            continue
        linked_title = id2title[linked_id]
        if ptype in title2ptypes.get(linked_title, ()):
            matches.append(linked_title)

    if matches:
        return ' | '.join(matches)
    return None


def get_reviews(pub_title):
    """Return the ' | '-joined titles of the reviews ('recensie') of `pub_title`, or None."""
    return _linked_titles(pub_title, 'recensie')


def get_chapters(pub_title):
    """Return the ' | '-joined titles of the 'artikelboek' items linked to `pub_title`, or None."""
    return _linked_titles(pub_title, 'artikelboek')


def get_articles(pub_title):
    """Return the ' | '-joined titles of the 'artikeltijdschrift' items linked to `pub_title`, or None."""
    return _linked_titles(pub_title, 'artikeltijdschrift')


In [11]:
# Worked example of following one link: review title -> review id -> id of the
# publication it refers to -> that publication's title description.
review_id = title2id["Kestemont, Mike. Eind goed, al goed? In: Queeste: 18 (2011) 1, 81-84."]
print(review_id)
# reference2publication maps the id of a referring publication to the id it refers to
book_id = reference2publication[review_id]
print(book_id)
book_title = id2title[book_id]
print(book_title)

274189
273488
Draux, Roland. Beatrijs' biecht: stilistisch en semiotisch onderzoek. Köln: Lambert Academic Publishing, 2009. 253 p.


In [12]:
# Example: print the reviews and then the chapters linked to one edited volume.
t = "Ene andre tale: tendensen in de Middelnederlandse late ridderepiek; onder redactie van An Faems en Marjolein Hogenbirk. Hilversum: Verloren, 2012. 318 p. (Middeleeuwse studies en bronnen; 131)."
for lookup in (get_reviews, get_chapters):
    print(lookup(t))

Besamusca, Bart. De (late) Middelnederlandse ridderepiek belicht. In: Queeste (Hilversum): 20 (2013) 1, 45-49. | Bouwmeester, Gerard. Nieuwe inzichten in de late ridderepiek. In: Spiegel der letteren (Leuven): 55 (2013) 4, 537-539.
Janssens, Jef. De Middelnederlandse ridderepiek in de veertiende eeuw: postmodern of voorzichtig vernieuwend? In: Ene andre tale: tendensen in de Middelnederlandse late ridderepiek; onder redactie van An Faems en Marjolein Hogenbirk. Hilversum: Verloren, 2012, p. 37-52. (Middeleeuwse studies en bronnen; 131). | Meulen, Janet F. van der. Vrouwen van Avesnes: een nieuwe Alexander in de Lage Landen. In: Ene andre tale: tendensen in de Middelnederlandse late ridderepiek; onder redactie van An Faems en Marjolein Hogenbirk. Hilversum: Verloren, 2012, p. 55-81. (Middeleeuwse studies en bronnen; 131). | Reynders, Anne. "Ghi heren, ic houde in ware wort dat ghi van Alexandre gehort hebt": de Middelnederlandse vertalingen van de Oudfranse "Florimont" en "Voeux du paon

In [13]:
llm_path = '../data/llm-dump'


def _append_extra(entry, label, text):
    """Add a ``LABEL: text`` line to ``entry['extra']``, creating the field when it is absent.

    A newly created field never starts with a newline; an existing field gets the
    new line appended after a newline separator.
    """
    line = f"{label}: {text}"
    entry['extra'] = f"{entry['extra']}\n{line}" if 'extra' in entry else line


def _referenced_title(pub_title):
    """Return the title description of the publication that `pub_title` refers to.

    Returns None when the title, its id, or the referenced id cannot be resolved.
    """
    try:
        return id2title[reference2publication[title2id[pub_title]]]
    except KeyError:
        return None


# For every "*_consolidated.ris" file in every folder under `llm_path`: enrich the entries
# with links to related publications and write the result next to the input file as
# "*_consolidated_link_<folder name>.ris".
for decade_folder in sorted(glob(f'{llm_path}/*')):
    print(':::', decade_folder, ':::')
    # basename() rather than split('/') so that Windows path separators are handled too
    decade = os.path.basename(decade_folder)

    for ris_path in sorted(glob(f'{decade_folder}/*_consolidated.ris')):
        print(ris_path)
        # Open as UTF-8 explicitly, so that decoding does not depend on the platform default.
        with open(ris_path, encoding='utf-8') as f:
            entries = rispy.load(f, encoding='utf-8', mapping=mapping)

        # The entry dicts are modified in place, so `entries` itself needs no re-assignment.
        for entry in tqdm(entries):
            if 'extra' in entry:
                pub_title = ' '.join(entry['extra'].split())
            else:
                # this means that we weren't able to structure the item via the LLM...
                pub_title = ' '.join(entry.get('title', '').split())

            ref_type = entry['type_of_reference']
            # .get(): indexing the defaultdict would add an empty set for every unknown title
            ptypes = title2ptypes.get(pub_title, ())

            # add to reviews which book they review:
            if 'recensie' in entry.get('keywords', ()):
                ref_title = _referenced_title(pub_title)
                if ref_title:
                    entry['short_title'] = ref_title

                # replace a missing title or a literal placeholder value
                if 'title' not in entry or entry['title'].strip() in ('titel', 'title'):
                    entry['title'] = '[Zonder titel]'

            # add to publications the reviews that discuss them
            reviews = get_reviews(pub_title)
            if reviews:
                # NOTE(review): reviews are stored under 'language' for WEB entries and under
                # 'call_number' otherwise; the reason is not visible here -- confirm the
                # field choice against the consumer of these RIS files.
                target_field = 'language' if ref_type == 'WEB' else 'call_number'
                entry[target_field] = reviews

            # add to books the chapters they contain
            if 'boek' in ptypes:
                chapters = get_chapters(pub_title)
                if chapters:
                    _append_extra(entry, 'HOOFDSTUKKEN', chapters)

            # add to special journal issues the articles they contain
            if 'specialetijdschriftaflevering' in ptypes:
                articles = get_articles(pub_title)
                if articles:
                    _append_extra(entry, 'ARTIKELS', articles)

            # add to chapters the book they refer to
            if ref_type == 'CHAP':
                containing_title = _referenced_title(pub_title)
                if containing_title:
                    _append_extra(entry, 'BOEK', containing_title)

            # tag journal articles whose 'extra' text mentions a special issue
            if ref_type == 'JOUR' and 'speciaal' in entry.get('extra', '').lower():
                entry.setdefault('keywords', []).append('Bijdrage speciaal tijdschriftnummer')

        # splitext() only touches the final extension, unlike str.replace('.ris', ...),
        # which would rewrite every occurrence of '.ris' anywhere in the path.
        root, ext = os.path.splitext(ris_path)
        out_ris_path = f'{root}_link_{decade}{ext}'
        with open(out_ris_path, 'w', encoding='utf-8') as bibliography_file:
            rispy.dump(entries, bibliography_file, mapping=mapping)

::: ../data/llm-dump/1940s :::
../data/llm-dump/1940s/BOOK_consolidated.ris


100%|██████████| 1442/1442 [00:00<00:00, 137256.02it/s]


../data/llm-dump/1940s/CHAP_consolidated.ris


100%|██████████| 1764/1764 [00:00<00:00, 257751.34it/s]


../data/llm-dump/1940s/JFULL_consolidated.ris


100%|██████████| 68/68 [00:00<00:00, 192581.14it/s]

../data/llm-dump/1940s/JOUR_consolidated.ris



100%|██████████| 9897/9897 [00:00<00:00, 219325.22it/s]


::: ../data/llm-dump/1950s :::
../data/llm-dump/1950s/BOOK_consolidated.ris


100%|██████████| 917/917 [00:00<00:00, 201676.72it/s]


../data/llm-dump/1950s/CHAP_consolidated.ris


100%|██████████| 1161/1161 [00:00<00:00, 284971.15it/s]


../data/llm-dump/1950s/JFULL_consolidated.ris


100%|██████████| 17/17 [00:00<00:00, 68037.37it/s]


../data/llm-dump/1950s/JOUR_consolidated.ris


100%|██████████| 6218/6218 [00:00<00:00, 368494.27it/s]


::: ../data/llm-dump/1960s :::
../data/llm-dump/1960s/BOOK_consolidated.ris


100%|██████████| 2185/2185 [00:00<00:00, 231188.78it/s]


../data/llm-dump/1960s/CHAP_consolidated.ris


100%|██████████| 4084/4084 [00:00<00:00, 299347.07it/s]


../data/llm-dump/1960s/JFULL_consolidated.ris


100%|██████████| 190/190 [00:00<00:00, 267242.71it/s]


../data/llm-dump/1960s/JOUR_consolidated.ris


100%|██████████| 19387/19387 [00:00<00:00, 400134.69it/s]


::: ../data/llm-dump/1970s :::
../data/llm-dump/1970s/BOOK_consolidated.ris


100%|██████████| 3623/3623 [00:00<00:00, 262329.54it/s]

../data/llm-dump/1970s/CHAP_consolidated.ris



100%|██████████| 6650/6650 [00:00<00:00, 321908.94it/s]


../data/llm-dump/1970s/JFULL_consolidated.ris


100%|██████████| 273/273 [00:00<00:00, 283426.98it/s]


../data/llm-dump/1970s/JOUR_consolidated.ris


100%|██████████| 25511/25511 [00:00<00:00, 383768.83it/s]


::: ../data/llm-dump/1980s :::
../data/llm-dump/1980s/ADVS_consolidated.ris


100%|██████████| 2/2 [00:00<00:00, 35098.78it/s]


../data/llm-dump/1980s/BOOK_consolidated.ris


100%|██████████| 6722/6722 [00:00<00:00, 269192.17it/s]


../data/llm-dump/1980s/CHAP_consolidated.ris


100%|██████████| 12289/12289 [00:00<00:00, 289180.39it/s]


../data/llm-dump/1980s/JFULL_consolidated.ris


100%|██████████| 619/619 [00:00<00:00, 284460.85it/s]


../data/llm-dump/1980s/JOUR_consolidated.ris


100%|██████████| 34995/34995 [00:00<00:00, 387670.14it/s]


../data/llm-dump/1980s/WEB_consolidated.ris


100%|██████████| 1/1 [00:00<00:00, 21732.15it/s]


::: ../data/llm-dump/1990s :::
../data/llm-dump/1990s/ADVS_consolidated.ris


100%|██████████| 33/33 [00:00<00:00, 136366.53it/s]


../data/llm-dump/1990s/BOOK_consolidated.ris


100%|██████████| 7992/7992 [00:00<00:00, 235191.32it/s]


../data/llm-dump/1990s/CHAP_consolidated.ris


100%|██████████| 14247/14247 [00:00<00:00, 272503.71it/s]


../data/llm-dump/1990s/EJOUR_consolidated.ris


100%|██████████| 97/97 [00:00<00:00, 287931.70it/s]


../data/llm-dump/1990s/JFULL_consolidated.ris


100%|██████████| 660/660 [00:00<00:00, 248406.37it/s]

../data/llm-dump/1990s/JOUR_consolidated.ris



100%|██████████| 44498/44498 [00:00<00:00, 381591.15it/s]


../data/llm-dump/1990s/WEB_consolidated.ris


100%|██████████| 10/10 [00:00<00:00, 44810.94it/s]


::: ../data/llm-dump/2000s :::
../data/llm-dump/2000s/ADVS_consolidated.ris


100%|██████████| 54/54 [00:00<00:00, 181775.61it/s]


../data/llm-dump/2000s/BOOK_consolidated.ris


100%|██████████| 6229/6229 [00:00<00:00, 239170.97it/s]


../data/llm-dump/2000s/CHAP_consolidated.ris


100%|██████████| 11027/11027 [00:00<00:00, 275979.58it/s]


../data/llm-dump/2000s/EJOUR_consolidated.ris


100%|██████████| 628/628 [00:00<00:00, 399275.87it/s]


../data/llm-dump/2000s/JFULL_consolidated.ris


100%|██████████| 479/479 [00:00<00:00, 207163.50it/s]


../data/llm-dump/2000s/JOUR_consolidated.ris


100%|██████████| 27745/27745 [00:00<00:00, 165396.93it/s]


../data/llm-dump/2000s/WEB_consolidated.ris


100%|██████████| 485/485 [00:00<00:00, 303889.67it/s]


::: ../data/llm-dump/2010s :::
../data/llm-dump/2010s/ADVS_consolidated.ris


100%|██████████| 9/9 [00:00<00:00, 101203.05it/s]


../data/llm-dump/2010s/BOOK_consolidated.ris


100%|██████████| 3891/3891 [00:00<00:00, 224920.92it/s]

../data/llm-dump/2010s/CHAP_consolidated.ris



100%|██████████| 6300/6300 [00:00<00:00, 249084.37it/s]


../data/llm-dump/2010s/EJOUR_consolidated.ris


100%|██████████| 569/569 [00:00<00:00, 392978.59it/s]


../data/llm-dump/2010s/JFULL_consolidated.ris


100%|██████████| 442/442 [00:00<00:00, 129461.06it/s]


../data/llm-dump/2010s/JOUR_consolidated.ris


100%|██████████| 20043/20043 [00:00<00:00, 340004.43it/s]


../data/llm-dump/2010s/WEB_consolidated.ris


100%|██████████| 85/85 [00:00<00:00, 218990.07it/s]


::: ../data/llm-dump/2020s :::
../data/llm-dump/2020s/ADVS_consolidated.ris


100%|██████████| 2/2 [00:00<00:00, 64527.75it/s]


../data/llm-dump/2020s/BOOK_consolidated.ris


100%|██████████| 825/825 [00:00<00:00, 212340.50it/s]


../data/llm-dump/2020s/CHAP_consolidated.ris


100%|██████████| 1627/1627 [00:00<00:00, 313364.22it/s]


../data/llm-dump/2020s/EJOUR_consolidated.ris


100%|██████████| 251/251 [00:00<00:00, 442898.74it/s]


../data/llm-dump/2020s/JFULL_consolidated.ris


100%|██████████| 161/161 [00:00<00:00, 137840.98it/s]

../data/llm-dump/2020s/JOUR_consolidated.ris



100%|██████████| 5238/5238 [00:00<00:00, 338131.63it/s]


../data/llm-dump/2020s/WEB_consolidated.ris


100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]


::: ../data/llm-dump/misc :::
../data/llm-dump/misc/ADVS_consolidated.ris


100%|██████████| 3/3 [00:00<00:00, 62601.55it/s]


../data/llm-dump/misc/BOOK_consolidated.ris


100%|██████████| 1182/1182 [00:00<00:00, 193742.13it/s]


../data/llm-dump/misc/CHAP_consolidated.ris


100%|██████████| 470/470 [00:00<00:00, 279066.09it/s]


../data/llm-dump/misc/EJOUR_consolidated.ris


100%|██████████| 2/2 [00:00<00:00, 48770.98it/s]


../data/llm-dump/misc/JFULL_consolidated.ris


100%|██████████| 6051/6051 [00:00<00:00, 226876.20it/s]


../data/llm-dump/misc/JOUR_consolidated.ris


100%|██████████| 3483/3483 [00:00<00:00, 210440.23it/s]


../data/llm-dump/misc/WEB_consolidated.ris


100%|██████████| 32/32 [00:00<00:00, 111476.52it/s]
