In [1]:
import json

from lxml import etree
from tqdm import tqdm

In [2]:
# Prefix -> namespace URI map handed to every find()/findall()/xpath() call
# below, so that paths can be written with short prefixes such as 'bibo:Book'.
namespaces = dict(
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    res='http://purl.org/vocab/resourcelist/schema#',
    z='http://www.zotero.org/namespaces/export#',
    ctag='http://commontag.org/ns#',
    dcterms='http://purl.org/dc/terms/',
    bibo='http://purl.org/ontology/bibo/',
    foaf='http://xmlns.com/foaf/0.1/',
    address='http://schemas.talis.com/2005/address/schema#',
)

In [3]:
with open('RDF-export.rdf', 'r', encoding='utf-8') as file:
    xml_content = file.read()

root = etree.fromstring(xml_content)

## File-wide lookup

In [4]:
# Map the rdf:nodeID of every foaf:Person to a display name of the form
# "Surname, Given name" (just "Surname" when no given name is recorded, and
# an empty string when no surname is recorded).
author_lookup = {}
persons = root.findall('.//foaf:Person', namespaces=namespaces)

for person in persons:
    node_id = person.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
    if not node_id:
        # The lookup is keyed on rdf:nodeID, so persons without one are skipped.
        continue

    # findtext() covers a missing element; `or ''` additionally covers an
    # element that exists but is empty (its .text is None), which previously
    # made the string concatenation below raise TypeError.
    surname_text = person.findtext('foaf:surname', namespaces=namespaces) or ''
    given_name_text = person.findtext('foaf:givenName', namespaces=namespaces) or ''

    person_str = surname_text
    if surname_text and given_name_text:
        person_str += ', ' + given_name_text

    author_lookup[node_id] = person_str

author_lookup

{'n3': 'ter Braak, Menno',
 'n12': 'van Looy, Jacobus',
 'n19': 'Carstens, Wannie',
 'n21': 'Ubbink, Carla',
 'n29': 'Gerritzen, D.',
 'n31': 'Bloothooft, Gerrit',
 'n32': 'Brouwer, Matthijs',
 'n33': 'Kunst, Jan Pieter',
 'n48': 'Beelen, Hans',
 'n57': 'Huizenga, Erwin',
 'n66': 'Raemdonck, Bert van',
 'n76': 'Oppenhuis de Jong, Soetje',
 'n82': 'Hogenbirk, Marjolein',
 'n84': 'Gerritsen, W. P.',
 'n89': 'Janse, Antheun',
 'n91': 'Biesheuvel, Ingrid',
 'n92': 'Anrooij, W. van',
 'n93': 'Tilmans, Karin',
 'n94': 'Ridderikhoff, Cornelia M.',
 'n95': 'Ekkart, R. E. O.',
 'n96': 'Biemans, J. A. A. M.',
 'n101': 'Vondel, Joost van den',
 'n103': 'Spies, Marijke',
 'n107': 'Brinkman, Herman',
 'n109': 'Biemans, J. A. A. M.',
 'n110': 'Kwant, Elsbeth',
 'n111': 'Vlist, E. T. van der',
 'n115': 'Jongbloet-van Houtte, Gisela',
 'n124': 'Langbroek, Erika',
 'n126': 'Roeleveld, Annelies',
 'n127': 'Biesheuvel, Ingrid',
 'n128': 'Kienhorst, Hans',
 'n134': 'Kienhorst, Hans',
 'n136': 'Schepers, K

In [5]:
# Map the rdf:nodeID of every ctag:UserTag to the text of its ctag:label.
# Tags without a node id or without a label element are left out.
user_tags = root.findall('.//ctag:UserTag', namespaces)

tag_id_and_label = (
    (
        user_tag.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID'),
        user_tag.find('ctag:label', namespaces),
    )
    for user_tag in user_tags
)
keyword_lookup = {
    tag_id: label_node.text
    for tag_id, label_node in tag_id_and_label
    if tag_id and label_node is not None
}

keyword_lookup

{'n5': 'bibliografie',
 'n6': 'Braak, Menno ter (auteur)',
 'n7': 'biografie',
 'n14': 'briefwisseling',
 'n22': 'taalkunde',
 'n23': 'Bibliografie van de Nederlandse Taal- en Literatuurwetenschap (BNTL)',
 'n34': 'naamkunde',
 'n41': 'taalverandering',
 'n42': 'taalgebruik',
 'n44': 'Zonder auteur',
 'n51': 'teksteditie',
 'n59': 'Middelnederlandse handschriften',
 'n60': 'digitale edities*',
 'n69': 'geautomatiseerde literaire teksten',
 'n77': 'Lancelotcompilatie',
 'n118': 'geschiedenis',
 'n139': 'verzamelhandschrift Wiesbaden',
 'n152': 'drukgeschiedenis',
 'n154': 'Cats, Jacob (auteur)',
 'n155': 'emblema',
 'n156': 'Venne, Adriaen Pietersz van de (auteur)',
 'n165': 'Velthem, Lodewijk van (auteur)',
 'n180': 'verzamelhandschriften',
 'n191': 'mystiek',
 'n204': 'Huygens, Constantijn (auteur)',
 'n205': 'handschriften',
 'n218': 'poëzie',
 'n225': 'poëtica',
 'n237': 'Leopold, J. H. (auteur)',
 'n254': 'negentiende-eeuwse letterkunde',
 'n260': 'varianten',
 'n261': 'Multatuli (

In [6]:
def parse_keywords(z_node):
    """Gather the keywords attached to `z_node` through ctag:tagged elements.

    A ctag:tagged element can contribute a keyword in two ways, checked in
    this order: by carrying an rdf:nodeID that is a key of `keyword_lookup`,
    and by containing an inline ctag:UserTag/ctag:label element.

    Returns a dict with the single key 'KW' holding the list of keywords
    (empty when nothing was found).
    """
    node_id_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID'
    collected = []

    for tagged in z_node.iterfind('.//ctag:tagged', namespaces=namespaces):
        reference = tagged.get(node_id_attr)
        if reference and reference in keyword_lookup:
            collected.append(keyword_lookup[reference])

        inline_label = tagged.find('ctag:UserTag/ctag:label', namespaces=namespaces)
        if inline_label is not None:
            collected.append(inline_label.text)

    return {'KW': collected}

### Journal articles

In [7]:
def parse_academic_article(article):
    """Extract RIS-style fields from a bibo:AcademicArticle element.

    Parameters:
        article: the lxml element of the bibo:AcademicArticle.

    Returns:
        dict whose keys are only present when the source element exists:
        'AU' (list of names from `author_lookup`), 'TI', 'UR', 'AB',
        'SP-EP' ("start-end"; a single page is repeated as "n-n"), 'DO',
        'ST', 'CN', 'EX', then 'PY', 'VL', 'IS' from the nested bibo:Issue
        and 'JO', 'SN', 'T3' from the nested bibo:Journal (and its Series).
        A value is None when the element exists but has no text.
    """
    bibo_info = {}

    def copy_text(parent, path, key):
        # Store the text of the first `path` match below `parent` under `key`;
        # nothing is stored when no element matches.
        node = parent.find(path, namespaces)
        if node is not None:
            bibo_info[key] = node.text

    # NOTE(review): this takes the first rdf:Seq anywhere below the article,
    # whereas the book/chapter parsers look under bibo:authorList -- confirm
    # that no other sequence (e.g. an editor list) can come first.
    authors = []
    author_list = article.find('.//rdf:Seq', namespaces=namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces=namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    copy_text(article, 'dcterms:title', 'TI')
    copy_text(article, 'bibo:uri', 'UR')
    copy_text(article, 'dcterms:abstract', 'AB')

    # An empty <bibo:pages/> element has .text == None; skip it instead of
    # failing on None.split().
    pages_node = article.find('bibo:pages', namespaces)
    if pages_node is not None and pages_node.text:
        pages = pages_node.text.split('-')
        if len(pages) == 2:
            start_page, end_page = pages
        else:
            # Anything that is not exactly "start-end" collapses to its first part.
            start_page = end_page = pages[0]
        bibo_info['SP-EP'] = '-'.join((start_page, end_page))

    copy_text(article, 'bibo:doi', 'DO')
    copy_text(article, 'bibo:shortTitle', 'ST')
    copy_text(article, 'bibo:lccn', 'CN')

    # The dcterms:language -> 'LA' mapping is currently disabled.

    copy_text(article, 'dcterms:source', 'EX')

    issue_node = article.find('.//bibo:Issue', namespaces=namespaces)
    if issue_node is not None:
        copy_text(issue_node, 'dcterms:date', 'PY')
        copy_text(issue_node, 'bibo:volume', 'VL')
        copy_text(issue_node, 'bibo:issue', 'IS')

    journal_node = article.find('.//bibo:Journal', namespaces)
    if journal_node is not None:
        copy_text(journal_node, 'dcterms:title', 'JO')
        copy_text(journal_node, 'bibo:issn', 'SN')

        theme_node = journal_node.find('.//bibo:Series', namespaces)
        if theme_node is not None:
            # The prefix map must be passed here as well: a prefixed path
            # without it makes lxml raise
            # "SyntaxError: prefix 'dcterms' not found in prefix map".
            copy_text(theme_node, 'dcterms:title', 'T3')

    return bibo_info
    
def parse_jour(z_node):
    """Build the record for a z:UserItem that describes a journal article.

    The bibo:AcademicArticle is looked for, in order: as the next sibling of
    `z_node`, as a child of the first res:resource below `z_node`, and finally
    anywhere below `z_node`. Its fields (if it is found) are combined with the
    item's keywords.

    Returns a dict with the article fields followed by 'KW'.
    """
    article_tag = '{' + namespaces['bibo'] + '}AcademicArticle'

    sibling = z_node.getnext()
    if sibling is not None and sibling.tag == article_tag:
        bibo_node = sibling
    else:
        bibo_node = None

    if bibo_node is None:
        resource_node = z_node.find('.//res:resource', namespaces=namespaces)
        if resource_node is not None:
            bibo_node = resource_node.find('bibo:AcademicArticle', namespaces=namespaces)

    if bibo_node is None:
        bibo_node = z_node.find('.//bibo:AcademicArticle', namespaces=namespaces)

    record = parse_academic_article(bibo_node) if bibo_node is not None else {}
    record.update(parse_keywords(z_node))
    return record

### Full books

In [8]:
def parse_bibo_book(book):
    """Extract RIS-style fields from a bibo:Book element.

    Parameters:
        book: the lxml element of the bibo:Book.

    Returns:
        dict whose keys are only present when the source element exists:
        'AU', 'A2' (editors), 'A3' (translators) as lists of names from
        `author_lookup`; 'EX', 'TI', 'PY', 'UR', 'AB', 'SP' (bibo:numPages),
        'SN' (all non-empty bibo:isbn13 values joined by a space), 'DO',
        'CN', 'VL', 'T2' (series title), 'PB' and 'CY'. When the series has
        a bibo:number it replaces the book's own 'VL'.
    """
    node_id_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID'
    bibo_info = {}

    def known_people(elements):
        # Names of those elements whose rdf:nodeID is a key of author_lookup.
        names = []
        for element in elements:
            node_id = element.get(node_id_attr)
            if node_id in author_lookup:
                names.append(author_lookup[node_id])
        return names

    def people_in_seq(seq_path):
        # Resolve the rdf:li entries of the first rdf:Seq matching `seq_path`.
        seq = book.find(seq_path, namespaces)
        if seq is None:
            return []
        return known_people(seq.findall('rdf:li', namespaces))

    def copy_text(parent, path, key):
        # Store the text of the first `path` match below `parent` under `key`.
        node = parent.find(path, namespaces)
        if node is not None:
            bibo_info[key] = node.text

    authors = people_in_seq('.//bibo:authorList/rdf:Seq')
    if authors:
        bibo_info['AU'] = authors

    editors = people_in_seq('.//bibo:editorList/rdf:Seq')
    if editors:
        bibo_info['A2'] = editors

    translators = known_people(book.findall('.//bibo:translator', namespaces))
    if translators:
        bibo_info['A3'] = translators

    copy_text(book, 'dcterms:source', 'EX')
    copy_text(book, 'dcterms:title', 'TI')
    copy_text(book, 'dcterms:date', 'PY')
    copy_text(book, 'bibo:uri', 'UR')
    copy_text(book, 'dcterms:abstract', 'AB')
    copy_text(book, 'bibo:numPages', 'SP')

    # findall() always returns a list (never None), so test for emptiness;
    # otherwise every book without an ISBN gets 'SN': ''. Empty elements are
    # dropped because their .text is None and would break the join.
    isbns = [node.text for node in book.findall('bibo:isbn13', namespaces) if node.text]
    if isbns:
        bibo_info['SN'] = ' '.join(isbns)

    copy_text(book, 'bibo:doi', 'DO')
    copy_text(book, 'bibo:lccn', 'CN')
    copy_text(book, 'bibo:volume', 'VL')

    series_node = book.find('.//bibo:Series', namespaces)
    if series_node is not None:
        copy_text(series_node, 'dcterms:title', 'T2')
        # The series number takes precedence over the book's own volume.
        copy_text(series_node, 'bibo:number', 'VL')

    publisher_node = book.find('{http://purl.org/dc/terms/}publisher/foaf:Organization', namespaces)
    if publisher_node is not None:
        copy_text(publisher_node, '{http://xmlns.com/foaf/0.1/}name', 'PB')
        copy_text(publisher_node, '{http://schemas.talis.com/2005/address/schema#}localityName', 'CY')

    return bibo_info
    
def parse_book(z_node):
    """Build the record for a z:UserItem that describes a full book.

    The bibo:Book is the next sibling of `z_node` when that sibling is a
    Book; otherwise it is looked for as a direct child of the item's
    res:resource child, or of `z_node` itself when there is no res:resource.

    Returns a dict with the book fields (if a Book was found) followed by 'KW'.
    """
    book_tag = '{' + namespaces['bibo'] + '}Book'
    sibling = z_node.getnext()

    if sibling is not None and sibling.tag == book_tag:
        bibo_node = sibling
    else:
        container = z_node.find('{http://purl.org/vocab/resourcelist/schema#}resource')
        if container is None:
            container = z_node
        bibo_node = container.find('{http://purl.org/ontology/bibo/}Book')

    record = parse_bibo_book(bibo_node) if bibo_node is not None else {}
    record.update(parse_keywords(z_node))
    return record

### Chapters

In [9]:
def parse_bibo_chapter(chapter):
    """Extract RIS-style fields from a bibo:BookSection element.

    Keys are only added when the source element exists. People lists ('AU',
    'A2' for editors, 'A3' for translators) hold names from `author_lookup`.
    'T3' is the series title and 'T2' the title of the containing
    bibo:EditedBook. Later sources overwrite earlier ones for the same key:
    the series number replaces the chapter's 'VL', and the edited book's date
    and ISBN replace the chapter's 'PY' and 'SN'.
    """
    node_id_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID'
    bibo_info = {}

    def known_people(elements):
        # Names of those elements whose rdf:nodeID is a key of author_lookup.
        references = (element.get(node_id_attr) for element in elements)
        return [author_lookup[ref] for ref in references if ref in author_lookup]

    def copy_text(parent, path, key):
        # Store the text of the first `path` match below `parent` under `key`.
        found = parent.find(path, namespaces)
        if found is not None:
            bibo_info[key] = found.text

    # Authors and editors are rdf:li entries of a sequence.
    for seq_path, key in (
        ('.//bibo:authorList/rdf:Seq', 'AU'),
        ('.//bibo:editorList/rdf:Seq', 'A2'),
    ):
        seq = chapter.find(seq_path, namespaces=namespaces)
        if seq is None:
            continue
        names = known_people(seq.findall('rdf:li', namespaces=namespaces))
        if names:
            bibo_info[key] = names

    # Translators are referenced directly, not through a sequence.
    translators = known_people(chapter.findall('.//bibo:translator', namespaces))
    if translators:
        bibo_info['A3'] = translators

    for path, key in (
        ('dcterms:source', 'EX'),
        ('dcterms:title', 'TI'),
        ('dcterms:date', 'PY'),
        ('bibo:uri', 'UR'),
        ('dcterms:abstract', 'AB'),
        ('bibo:lccn', 'CN'),
        ('bibo:pages', 'SP-EP'),
    ):
        copy_text(chapter, path, key)

    isbn_nodes = chapter.findall('bibo:isbn13', namespaces)
    if isbn_nodes:
        bibo_info['SN'] = ' '.join([inode.text for inode in isbn_nodes])

    copy_text(chapter, 'bibo:doi', 'DO')
    copy_text(chapter, 'bibo:volume', 'VL')

    series_node = chapter.find('.//bibo:Series', namespaces)
    if series_node is not None:
        copy_text(series_node, 'dcterms:title', 'T3')
        copy_text(series_node, 'bibo:number', 'VL')

    edited_book_node = chapter.find('.//bibo:EditedBook', namespaces)
    if edited_book_node is not None:
        copy_text(edited_book_node, 'dcterms:title', 'T2')
        copy_text(edited_book_node, 'dcterms:date', 'PY')
        copy_text(edited_book_node, 'bibo:isbn13', 'SN')

        publisher_node = edited_book_node.find('dcterms:publisher/foaf:Organization', namespaces)
        if publisher_node is not None:
            copy_text(publisher_node, 'foaf:name', 'PB')
            copy_text(publisher_node, 'address:localityName', 'CY')

    return bibo_info


def parse_chapter(z_node):
    """Build the record for a z:UserItem that describes a book chapter.

    The bibo:BookSection is the next sibling of `z_node` when that sibling is
    a BookSection; otherwise it is looked for as a direct child of the item's
    res:resource child, or of `z_node` itself when there is no res:resource.

    Returns a dict with the chapter fields (if a BookSection was found)
    followed by 'KW'.
    """
    section_tag = '{' + namespaces['bibo'] + '}BookSection'
    sibling = z_node.getnext()

    if sibling is not None and sibling.tag == section_tag:
        bibo_node = sibling
    else:
        container = z_node.find('res:resource', namespaces)
        if container is None:
            container = z_node
        bibo_node = container.find('bibo:BookSection', namespaces)

    record = parse_bibo_chapter(bibo_node) if bibo_node is not None else {}
    record.update(parse_keywords(z_node))
    return record


In [10]:
def parse_bibo_webpage(book):
    """Extract RIS-style fields from a bibo:Webpage element.

    Parameters:
        book: the lxml element of the bibo:Webpage (the parameter keeps its
            historical name).

    Returns:
        dict whose keys are only present when the source element exists:
        'AU', 'A3' (editors), 'A4' (translators) as lists of names from
        `author_lookup`; 'EX', 'TI', 'PY' (first dcterms:date anywhere below
        the element), 'UR', 'AB', 'SP' (bibo:numPages), 'SN' (all non-empty
        bibo:isbn13 values joined by a space), 'DO', 'CN', 'VL', 'T2'
        (series title), 'SV' (series number), 'PB' and 'CY'.
    """
    node_id_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID'
    bibo_info = {}

    def known_people(elements):
        # Names of those elements whose rdf:nodeID is a key of author_lookup.
        names = []
        for element in elements:
            node_id = element.get(node_id_attr)
            if node_id in author_lookup:
                names.append(author_lookup[node_id])
        return names

    def people_in_seq(seq_path):
        # Resolve the rdf:li entries of the first rdf:Seq matching `seq_path`.
        seq = book.find(seq_path, namespaces)
        if seq is None:
            return []
        return known_people(seq.findall('rdf:li', namespaces))

    def copy_text(parent, path, key):
        # Store the text of the first `path` match below `parent` under `key`.
        node = parent.find(path, namespaces)
        if node is not None:
            bibo_info[key] = node.text

    authors = people_in_seq('.//bibo:authorList/rdf:Seq')
    if authors:
        bibo_info['AU'] = authors

    editors = people_in_seq('.//bibo:editorList/rdf:Seq')
    if editors:
        bibo_info['A3'] = editors

    translators = known_people(book.findall('.//bibo:translator', namespaces))
    if translators:
        bibo_info['A4'] = translators

    copy_text(book, 'dcterms:source', 'EX')
    copy_text(book, 'dcterms:title', 'TI')
    copy_text(book, './/dcterms:date', 'PY')
    copy_text(book, 'bibo:uri', 'UR')
    copy_text(book, 'dcterms:abstract', 'AB')
    copy_text(book, 'bibo:numPages', 'SP')

    # findall() always returns a list (never None), so test for emptiness;
    # otherwise every page without an ISBN gets 'SN': ''. Empty elements are
    # dropped because their .text is None and would break the join.
    isbns = [node.text for node in book.findall('bibo:isbn13', namespaces) if node.text]
    if isbns:
        bibo_info['SN'] = ' '.join(isbns)

    copy_text(book, 'bibo:doi', 'DO')

    # NOTE(review): the language is stored under 'CN', the key the other
    # parsers fill from bibo:lccn -- confirm this mapping is intended.
    copy_text(book, 'dcterms:language', 'CN')

    copy_text(book, 'bibo:volume', 'VL')

    series_node = book.find('.//bibo:Series', namespaces)
    if series_node is not None:
        copy_text(series_node, 'dcterms:title', 'T2')
        copy_text(series_node, 'bibo:number', 'SV')

    publisher_node = book.find('{http://purl.org/dc/terms/}publisher/foaf:Organization', namespaces)
    if publisher_node is not None:
        copy_text(publisher_node, '{http://xmlns.com/foaf/0.1/}name', 'PB')
        copy_text(publisher_node, '{http://schemas.talis.com/2005/address/schema#}localityName', 'CY')

    return bibo_info
    
def parse_advs(z_node):
    """Build the record for a z:UserItem that describes a web page.

    The bibo:Webpage is the next sibling of `z_node` when that sibling is a
    Webpage; otherwise it is looked for as a direct child of the item's
    res:resource child, or of `z_node` itself when there is no res:resource.

    Returns a dict with the web page fields (if a Webpage was found)
    followed by 'KW'.
    """
    webpage_tag = '{' + namespaces['bibo'] + '}Webpage'
    sibling = z_node.getnext()

    if sibling is not None and sibling.tag == webpage_tag:
        bibo_node = sibling
    else:
        container = z_node.find('{http://purl.org/vocab/resourcelist/schema#}resource')
        if container is None:
            container = z_node
        bibo_node = container.find('{http://purl.org/ontology/bibo/}Webpage')

    record = parse_bibo_webpage(bibo_node) if bibo_node is not None else {}
    record.update(parse_keywords(z_node))
    return record

In [11]:
# Convert every z:UserItem into one dict of RIS-style fields. The record type
# ('TY') is decided from the bibo element that follows or is nested in the item.
user_items = tuple(root.xpath('//z:UserItem', namespaces=namespaces))

# Clark-notation prefix used to compare element tags against bibo types.
bibo_prefix = '{' + namespaces['bibo'] + '}'

parsed = []
for z_node in tqdm(user_items):
    info = {}

    # The rdf:about URL is the unique ID of the entry; items without it are skipped.
    user_item_url = z_node.xpath('@rdf:about', namespaces=namespaces)
    if user_item_url:
        info['ID'] = user_item_url[0]
    else:
        continue

    # Access date recorded on the item (z:accessDate).
    access_date_node = z_node.find('z:accessDate', namespaces)
    if access_date_node is not None:
        info['Y2'] = access_date_node.text

    academic_article = z_node.xpath('.//bibo:AcademicArticle', namespaces=namespaces)

    if not academic_article:
        resource_url = z_node.xpath('.//res:resource/@rdf:resource', namespaces=namespaces)
        if resource_url:
            # Pass the URL as an XPath variable instead of formatting it into
            # the expression, so a quote character in the URL cannot break
            # (or alter) the query.
            academic_article = root.xpath(
                '//bibo:AcademicArticle[@rdf:about=$about]',
                namespaces=namespaces,
                about=str(resource_url[0]),
            )

    if academic_article:
        info.update(parse_jour(z_node))
        info['TY'] = 'JOUR'

    # getnext() returns None for the last sibling; resolve the tag once so
    # the comparisons below cannot fail with AttributeError.
    next_node = z_node.getnext()
    next_tag = next_node.tag if next_node is not None else None

    # NOTE(review): this chain starts with `if`, not `elif`, so an item already
    # typed 'JOUR' is re-typed when it also matches one of the branches below
    # -- confirm that this precedence is intended.
    if next_tag == bibo_prefix + 'Book' or \
       z_node.xpath('.//bibo:Book', namespaces=namespaces):
        info.update(parse_book(z_node))
        info['TY'] = 'BOOK'

        # Books tagged as a special journal issue are exported as full journals.
        if 'KW' in info and "Speciaal tijdschriftnummer" in set(info['KW']):
            info['TY'] = 'JFULL'

    elif next_tag == bibo_prefix + 'BookSection' or \
       z_node.xpath('.//bibo:BookSection', namespaces=namespaces):
        info.update(parse_chapter(z_node))
        info['TY'] = 'CHAP'

    elif next_tag == bibo_prefix + 'Webpage' or \
       z_node.xpath('.//bibo:Webpage', namespaces=namespaces):
        info.update(parse_advs(z_node))
        info['TY'] = 'ADVS'

    # `info` always holds at least 'ID' at this point, so it is never empty.
    parsed.append(info)

100%|██████████| 1997/1997 [00:04<00:00, 460.11it/s]


In [12]:
import rispy

# Sanity check: the RIS export of the same bibliography must contain as many
# entries as were parsed from the RDF export. The file is read as UTF-8, like
# the RDF export, so the result does not depend on the platform's default
# encoding.
with open('RIS-export.ris', 'r', encoding='utf-8') as bibliography_file:
    entries = rispy.load(bibliography_file)
assert len(parsed) == len(entries), (
    f'parsed {len(parsed)} items from the RDF export, '
    f'but the RIS export holds {len(entries)} entries'
)

In [13]:
# Persist the parsed records as pretty-printed JSON.
with open('parsed.json', 'w') as f:
    json.dump(parsed, f, indent=2)