In [1]:
import json
import os
from glob import glob

from lxml import etree
from tqdm.auto import tqdm

In [2]:
namespaces = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'res': 'http://purl.org/vocab/resourcelist/schema#',
    'z': 'http://www.zotero.org/namespaces/export#',
    'ctag': 'http://commontag.org/ns#',
    'dcterms': 'http://purl.org/dc/terms/',
    'bibo': 'http://purl.org/ontology/bibo/',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'address': 'http://schemas.talis.com/2005/address/schema#'
}

Some auxiliary functions:

In [3]:
def get_author_lookup(root):
    author_lookup = {}
    persons = root.findall('.//foaf:Person', namespaces=namespaces)

    for person in persons:
        node_id = person.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if not node_id:
            continue

        given_name = person.find('foaf:givenName', namespaces)
        surname = person.find('foaf:surname', namespaces)

        person_str = ''
        if surname is not None:
            person_str += surname.text
            if given_name is not None:
                person_str += ', ' + given_name.text
        
        author_lookup[node_id] = person_str
    
    return author_lookup

def get_keyword_lookup(root):
    keyword_lookup = {}
    user_tags = root.findall('.//ctag:UserTag', namespaces)
    for user_tag in user_tags:
        node_id = user_tag.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if not node_id:
            continue
        label = user_tag.find('ctag:label', namespaces)
        if label is not None:
            keyword_lookup[node_id] = label.text.replace('(auteur)', '').strip()

    return keyword_lookup

def parse_keywords(z_node, keyword_lookup):
    keywords = []
    tagged_elements = z_node.findall('.//ctag:tagged', namespaces=namespaces)
    for tagged in tagged_elements:
        node_id = tagged.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if node_id and node_id in keyword_lookup:
            keywords.append(keyword_lookup[node_id])

        user_tag = tagged.find('ctag:UserTag/ctag:label', namespaces=namespaces)
        if user_tag is not None:
            keywords.append(user_tag.text)

    return {'KW': keywords}

Category-specific parsing functions:

In [4]:
## journal articles
def parse_academic_article(article, author_lookup):
    bibo_info = {}

    # Authors
    authors = []
    author_list = article.find('.//rdf:Seq', namespaces=namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces=namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    # Basic metadata
    for node_name, field_key in [
        ('dcterms:title', 'TI'),
        ('bibo:uri', 'UR'),
        ('dcterms:abstract', 'AB'),
        ('bibo:doi', 'DO'),
        ('bibo:shortTitle', 'ST'),
        ('dcterms:source', 'EX')
    ]:
        node = article.find(node_name, namespaces)
        if node is not None and node.text:
            bibo_info[field_key] = node.text

    # Pages
    pages_node = article.find('bibo:pages', namespaces)
    if pages_node is not None and pages_node.text:
        pages = pages_node.text.split('-')
        if len(pages) == 2:
            start_page, end_page = pages
        else:
            start_page = end_page = pages[0]
        bibo_info['SP-EP'] = '-'.join((start_page, end_page))

    # Reviews
    reviews_node = article.find('.//bibo:lccn', namespaces)
    if reviews_node is not None and reviews_node.text:
        bibo_info['CN'] = reviews_node.text

    # Issue information
    issue_node = article.find('.//bibo:Issue', namespaces=namespaces)
    if issue_node is not None:
        for node_name, field_key in [
            ('dcterms:date', 'PY'),
            ('bibo:volume', 'VL'),
            ('bibo:issue', 'IS')
        ]:
            node = issue_node.find(node_name, namespaces=namespaces)
            if node is not None and node.text:
                bibo_info[field_key] = node.text

    # Journal information
    journal_node = article.find('.//bibo:Journal', namespaces)
    if journal_node is not None:
        # Journal title
        journal_title_node = journal_node.find('dcterms:title', namespaces)
        if journal_title_node is not None and journal_title_node.text:
            bibo_info['JO'] = journal_title_node.text
        
        # ISSN
        issn_node = journal_node.find('bibo:issn', namespaces)
        if issn_node is not None and issn_node.text:
            bibo_info['SN'] = issn_node.text

        # Look for Series within the Journal node
        series_nodes = journal_node.findall('.//bibo:Series', namespaces)
        if series_nodes:
            for series_node in series_nodes:
                if series_node is not None:
                    title_node = series_node.find('dcterms:title', namespaces)
                    if title_node is not None and title_node.text:
                        bibo_info['T3'] = title_node.text
                        break

    return bibo_info

def parse_jour(z_node, author_lookup, keyword_lookup):
    info = {}
    
    # Try to find the academic article node
    bibo_node = None
    
    # First check direct child
    next_node = z_node.getnext()
    if next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}AcademicArticle':
        bibo_node = next_node
    
    # If not found, try resource reference
    if bibo_node is None:
        resource_url = z_node.xpath('.//res:resource/@rdf:resource', namespaces=namespaces)
        if resource_url:
            try:
                articles = z_node.xpath(f"//bibo:AcademicArticle", namespaces=namespaces)
                for article in articles:
                    if article.get('{'+namespaces['rdf']+'}about') == resource_url[0]:
                        bibo_node = article
                        break
            except:
                pass
    
    # If still not found, try direct search
    if bibo_node is None:
        articles = z_node.xpath('.//bibo:AcademicArticle', namespaces=namespaces)
        if articles:
            bibo_node = articles[0] if isinstance(articles, list) else articles
    
    # Only parse if we found a valid node
    if bibo_node is not None:
        info.update(parse_academic_article(bibo_node, author_lookup))
    
    # Add keywords regardless
    info.update(parse_keywords(z_node, keyword_lookup=keyword_lookup))
    return info

## books:
def parse_bibo_book(book, author_lookup):
    bibo_info = {}

    authors = []
    author_list = book.find('.//bibo:authorList/rdf:Seq', namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    editors = []
    editor_list = book.find('.//bibo:editorList/rdf:Seq', namespaces)
    if editor_list is not None:
        for li in editor_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                editors.append(author_lookup[node_id])
    if editors:
        bibo_info['A2'] = editors

    translators = []
    for translator_node in book.findall('.//bibo:translator', namespaces):
        node_id = translator_node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if node_id in author_lookup:
            translators.append(author_lookup[node_id])
    if translators:
        bibo_info['A3'] = translators

    source_node = book.find('dcterms:source', namespaces)
    if source_node is not None:
        bibo_info['EX'] = source_node.text

    edition_node = book.find('bibo:edition', namespaces)
    if edition_node is not None:
        bibo_info['ET'] = edition_node.text

    title_node = book.find('dcterms:title', namespaces)
    if title_node is not None:
        bibo_info['TI'] = title_node.text

    date_node = book.find('dcterms:date', namespaces)
    if date_node is not None:
        bibo_info['PY'] = date_node.text

    uri_node = book.find('bibo:uri', namespaces)
    if uri_node is not None:
        bibo_info['UR'] = uri_node.text
    
    abstract_node = book.find('dcterms:abstract', namespaces)
    if abstract_node is not None:
        bibo_info['AB'] = abstract_node.text
    
    pages_node = book.find('bibo:numPages', namespaces)
    if pages_node is not None:
        bibo_info['SP'] = pages_node.text
    
    isbn_nodes = book.findall('bibo:isbn13', namespaces)
    if isbn_nodes is not None:
        bibo_info['SN'] = ' '.join([inode.text for inode in isbn_nodes])
    
    doi_node = book.find('bibo:doi', namespaces)
    if doi_node is not None:
        bibo_info['DO'] = doi_node.text

    reviews_node = book.find('.//bibo:lccn', namespaces)
    if reviews_node is not None:
        bibo_info['CN'] = reviews_node.text
    
    volume_node = book.find('bibo:volume', namespaces)
    if volume_node is not None:
        bibo_info['VL'] = volume_node.text

    reviewed_node = book.find('.//bibo:shortTitle', namespaces)
    if reviewed_node is not None:
        bibo_info['ST'] = reviewed_node.text

    series_node = book.find('.//bibo:Series', namespaces)
    if series_node is not None:
        series_title_node = series_node.find('dcterms:title', namespaces)
        if series_title_node is not None:
            bibo_info['T2'] = series_title_node.text
        series_number_node = series_node.find('bibo:number', namespaces)
        if series_number_node is not None:
            bibo_info['SV'] = series_number_node.text

    publisher_node = book.find('{http://purl.org/dc/terms/}publisher/foaf:Organization', namespaces)
    if publisher_node is not None:
        publisher_name_node = publisher_node.find('{http://xmlns.com/foaf/0.1/}name', namespaces)
        locality_node = publisher_node.find('{http://schemas.talis.com/2005/address/schema#}localityName', namespaces)

        if publisher_name_node is not None:
            bibo_info['PB'] = publisher_name_node.text
        if locality_node is not None:
            bibo_info['CY'] = locality_node.text
   
    return bibo_info
    
def parse_book(z_node, author_lookup, keyword_lookup):
    info = {}

    next_node = z_node.getnext()
    if next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Book':
        bibo_node = next_node
    else:
        resource_node = z_node.find('{http://purl.org/vocab/resourcelist/schema#}resource')
        if resource_node is not None:
            bibo_node = resource_node.find('{http://purl.org/ontology/bibo/}Book')
        else:
            bibo_node = z_node.find('{http://purl.org/ontology/bibo/}Book')
    
    if bibo_node is not None:
        info.update(parse_bibo_book(bibo_node, author_lookup))
    
    info.update(parse_keywords(z_node, keyword_lookup))
    return info

## chapters:
def parse_bibo_chapter(chapter, author_lookup):
    bibo_info = {}

    authors = []
    author_list = chapter.find('.//bibo:authorList/rdf:Seq', namespaces=namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces=namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    editors = []
    editor_list = chapter.find('.//bibo:editorList/rdf:Seq', namespaces=namespaces)
    if editor_list is not None:
        for li in editor_list.findall('rdf:li', namespaces=namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                editors.append(author_lookup[node_id])
    if editors:
        bibo_info['A2'] = editors

    translators = []
    for translator_node in chapter.findall('.//bibo:translator', namespaces):
        node_id = translator_node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if node_id in author_lookup:
            translators.append(author_lookup[node_id])
    if translators:
        bibo_info['A3'] = translators

    source_node = chapter.find('dcterms:source', namespaces)
    if source_node is not None:
        bibo_info['EX'] = source_node.text

    title_node = chapter.find('dcterms:title', namespaces)
    if title_node is not None:
        bibo_info['TI'] = title_node.text

    date_node = chapter.find('dcterms:date', namespaces)
    if date_node is not None:
        bibo_info['PY'] = date_node.text

    uri_node = chapter.find('bibo:uri', namespaces)
    if uri_node is not None:
        bibo_info['UR'] = uri_node.text
    
    abstract_node = chapter.find('dcterms:abstract', namespaces)
    if abstract_node is not None:
        bibo_info['AB'] = abstract_node.text

    reviews_node = chapter.find('.//bibo:lccn', namespaces)
    if reviews_node is not None:
        bibo_info['CN'] = reviews_node.text
    
    pages_node = chapter.find('bibo:pages', namespaces)
    if pages_node is not None:
        bibo_info['SP-EP'] = pages_node.text
    
    isbn_nodes = chapter.findall('bibo:isbn13', namespaces)
    if isbn_nodes:
        bibo_info['SN'] = ' '.join([inode.text for inode in isbn_nodes])
    
    doi_node = chapter.find('bibo:doi', namespaces)
    if doi_node is not None:
        bibo_info['DO'] = doi_node.text

    volume_node = chapter.find('bibo:volume', namespaces)
    if volume_node is not None:
        bibo_info['VL'] = volume_node.text

    series_node = chapter.find('.//bibo:Series', namespaces)
    if series_node is not None:
        series_title_node = series_node.find('dcterms:title', namespaces)
        if series_title_node is not None:
            bibo_info['T3'] = series_title_node.text
        series_number_node = series_node.find('bibo:number', namespaces)
        if series_number_node is not None:
            bibo_info['SV'] = series_number_node.text

    reviewed_node = chapter.find('.//bibo:shortTitle', namespaces)
    if reviewed_node is not None:
        bibo_info['ST'] = reviewed_node.text

    edited_book_node = chapter.find('.//bibo:EditedBook', namespaces)
    if edited_book_node is not None:
        edited_book_title_node = edited_book_node.find('dcterms:title', namespaces)
        if edited_book_title_node is not None:
            bibo_info['T2'] = edited_book_title_node.text

        edited_book_year_node = edited_book_node.find('dcterms:date', namespaces)
        if edited_book_year_node is not None:
            bibo_info['PY'] = edited_book_year_node.text

        isbn_node = edited_book_node.find('bibo:isbn13', namespaces)
        if isbn_node is not None:
            bibo_info['SN'] = isbn_node.text

        publisher_node = edited_book_node.find('dcterms:publisher/foaf:Organization', namespaces)
        if publisher_node is not None:
            publisher_name_node = publisher_node.find('foaf:name', namespaces)
            locality_node = publisher_node.find('address:localityName', namespaces)

            if publisher_name_node is not None:
                bibo_info['PB'] = publisher_name_node.text
            if locality_node is not None:
                bibo_info['CY'] = locality_node.text
   
    return bibo_info


def parse_chapter(z_node, author_lookup, keyword_lookup):
    info = {}

    next_node = z_node.getnext()
    if next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}BookSection':
        bibo_node = next_node
    else:
        resource_node = z_node.find('res:resource', namespaces)
        if resource_node is not None:
            bibo_node = resource_node.find('bibo:BookSection', namespaces)
        else:
            bibo_node = z_node.find('bibo:BookSection', namespaces)
    
    if bibo_node is not None:
        info.update(parse_bibo_chapter(bibo_node, author_lookup))
    
    info.update(parse_keywords(z_node, keyword_lookup))

    return info

## web pages:
def parse_bibo_webpage(book, author_lookup):
    bibo_info = {}

    authors = []
    author_list = book.find('.//bibo:authorList/rdf:Seq', namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    editors = []
    editor_list = book.find('.//bibo:editorList/rdf:Seq', namespaces)
    if editor_list is not None:
        for li in editor_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                editors.append(author_lookup[node_id])
    if editors:
        bibo_info['A3'] = editors

    translators = []
    for translator_node in book.findall('.//bibo:translator', namespaces):
        node_id = translator_node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if node_id in author_lookup:
            translators.append(author_lookup[node_id])
    if translators:
        bibo_info['A4'] = translators

    source_node = book.find('dcterms:source', namespaces)
    if source_node is not None:
        bibo_info['EX'] = source_node.text

    title_node = book.find('dcterms:title', namespaces)
    if title_node is not None:
        bibo_info['TI'] = title_node.text

    date_node = book.find('.//dcterms:date', namespaces)
    if date_node is not None:
        bibo_info['PY'] = date_node.text

    uri_node = book.find('bibo:uri', namespaces)
    if uri_node is not None:
        bibo_info['UR'] = uri_node.text
    
    abstract_node = book.find('dcterms:abstract', namespaces)
    if abstract_node is not None:
        bibo_info['AB'] = abstract_node.text
    
    pages_node = book.find('bibo:numPages', namespaces)
    if pages_node is not None:
        bibo_info['SP'] = pages_node.text
    
    isbn_nodes = book.findall('bibo:isbn13', namespaces)
    if isbn_nodes is not None:
        bibo_info['SN'] = ' '.join([inode.text for inode in isbn_nodes])
    
    doi_node = book.find('bibo:doi', namespaces)
    if doi_node is not None:
        bibo_info['DO'] = doi_node.text
    
    reviews_node = book.find('.//dcterms:language', namespaces)
    if reviews_node is not None:
        bibo_info['CN'] = reviews_node.text

    volume_node = book.find('bibo:volume', namespaces)
    if volume_node is not None:
        bibo_info['VL'] = volume_node.text

    series_node = book.find('.//bibo:Series', namespaces)
    if series_node is not None:
        series_title_node = series_node.find('dcterms:title', namespaces)
        if series_title_node is not None:
            bibo_info['T2'] = series_title_node.text
        series_number_node = series_node.find('bibo:number', namespaces)
        if series_number_node is not None:
            bibo_info['SV'] = series_number_node.text

    reviewed_node = book.find('.//bibo:shortTitle', namespaces)
    if reviewed_node is not None:
        bibo_info['ST'] = reviewed_node.text

    publisher_node = book.find('{http://purl.org/dc/terms/}publisher/foaf:Organization', namespaces)
    if publisher_node is not None:
        publisher_name_node = publisher_node.find('{http://xmlns.com/foaf/0.1/}name', namespaces)
        locality_node = publisher_node.find('{http://schemas.talis.com/2005/address/schema#}localityName', namespaces)

        if publisher_name_node is not None:
            bibo_info['PB'] = publisher_name_node.text
        if locality_node is not None:
            bibo_info['CY'] = locality_node.text
   
    return bibo_info
    
def parse_web(z_node, author_lookup, keyword_lookup):
    info = {}

    next_node = z_node.getnext()
    if next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Webpage':
        bibo_node = next_node
    else:
        resource_node = z_node.find('{http://purl.org/vocab/resourcelist/schema#}resource')
        if resource_node is not None:
            bibo_node = resource_node.find('{http://purl.org/ontology/bibo/}Webpage')
        else:
            bibo_node = z_node.find('{http://purl.org/ontology/bibo/}Webpage')
    
    if bibo_node is not None:
        info.update(parse_bibo_webpage(bibo_node, author_lookup))
    
    info.update(parse_keywords(z_node, keyword_lookup))

    return info

## cdroms etc
def parse_bibo_film(book, author_lookup):
    bibo_info = {}

    authors = []
    author_list = book.find('.//bibo:authorList/rdf:Seq', namespaces)
    if author_list is not None:
        for li in author_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                authors.append(author_lookup[node_id])
    if authors:
        bibo_info['AU'] = authors

    editors = []
    editor_list = book.find('.//bibo:editorList/rdf:Seq', namespaces)
    if editor_list is not None:
        for li in editor_list.findall('rdf:li', namespaces):
            node_id = li.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
            if node_id in author_lookup:
                editors.append(author_lookup[node_id])
    if editors:
        bibo_info['A3'] = editors

    translators = []
    for translator_node in book.findall('.//bibo:translator', namespaces):
        node_id = translator_node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}nodeID')
        if node_id in author_lookup:
            translators.append(author_lookup[node_id])
    if translators:
        bibo_info['A4'] = translators

    source_node = book.find('dcterms:source', namespaces)
    if source_node is not None:
        bibo_info['EX'] = source_node.text

    title_node = book.find('dcterms:title', namespaces)
    if title_node is not None:
        bibo_info['TI'] = title_node.text

    date_node = book.find('.//dcterms:date', namespaces)
    if date_node is not None:
        bibo_info['PY'] = date_node.text

    uri_node = book.find('bibo:uri', namespaces)
    if uri_node is not None:
        bibo_info['UR'] = uri_node.text
    
    abstract_node = book.find('dcterms:abstract', namespaces)
    if abstract_node is not None:
        bibo_info['AB'] = abstract_node.text
    
    pages_node = book.find('bibo:numPages', namespaces)
    if pages_node is not None:
        bibo_info['SP'] = pages_node.text
    
    isbn_nodes = book.findall('bibo:isbn13', namespaces)
    if isbn_nodes is not None:
        bibo_info['SN'] = ' '.join([inode.text for inode in isbn_nodes])
    
    doi_node = book.find('bibo:doi', namespaces)
    if doi_node is not None:
        bibo_info['DO'] = doi_node.text
    
    language_node = book.find('bibo:lccn', namespaces)
    if language_node is not None:
        bibo_info['CN'] = language_node.text

    volume_node = book.find('bibo:volume', namespaces)
    if volume_node is not None:
        bibo_info['VL'] = volume_node.text

    series_node = book.find('.//bibo:Series', namespaces)
    if series_node is not None:
        series_title_node = series_node.find('dcterms:title', namespaces)
        if series_title_node is not None:
            bibo_info['T2'] = series_title_node.text
        series_number_node = series_node.find('bibo:number', namespaces)
        if series_number_node is not None:
            bibo_info['SV'] = series_number_node.text

    publisher_node = book.find('{http://purl.org/dc/terms/}publisher/foaf:Organization', namespaces)
    if publisher_node is not None:
        publisher_name_node = publisher_node.find('{http://xmlns.com/foaf/0.1/}name', namespaces)
        locality_node = publisher_node.find('{http://schemas.talis.com/2005/address/schema#}localityName', namespaces)

        if publisher_name_node is not None:
            bibo_info['PB'] = publisher_name_node.text
        if locality_node is not None:
            bibo_info['CY'] = locality_node.text
   
    return bibo_info
    
def parse_advs(z_node, author_lookup, keyword_lookup):
    info = {}

    next_node = z_node.getnext()
    if next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Film':
        bibo_node = next_node
    else:
        resource_node = z_node.find('{http://purl.org/vocab/resourcelist/schema#}resource')
        if resource_node is not None:
            bibo_node = resource_node.find('{http://purl.org/ontology/bibo/}Film')
        else:
            bibo_node = z_node.find('{http://purl.org/ontology/bibo/}Film')
    
    if bibo_node is not None:
        info.update(parse_bibo_webpage(bibo_node, author_lookup))
    
    info.update(parse_keywords(z_node, keyword_lookup))

    return info

In [5]:
for fn in tqdm(glob('../../data/BNTL_FinalExport/*.rdf'), 
               desc="Processing files", 
               position=0, 
               leave=True):
    
    parsedf = os.path.basename(fn).replace('.rdf', '.json')
    parsedf = f'parsed/{parsedf}'
    
    with open(fn, 'r', encoding='utf-8') as file:
        xml_content = file.read()
    root = etree.fromstring(xml_content)

    author_lookup = get_author_lookup(root)
    keyword_lookup = get_keyword_lookup(root)

    user_items = tuple(root.xpath('//z:UserItem', namespaces=namespaces))

    parsed = []
    for z_node in tqdm(user_items, 
                      desc=os.path.basename(fn),
                      position=1, 
                      leave=False):
        info = {}

        user_item_url = z_node.xpath('@rdf:about', namespaces=namespaces)
        if user_item_url:
            info['ID'] = user_item_url[0]
        else:
            continue
        
        access_date_node = z_node.find('z:accessDate', namespaces)
        if access_date_node is not None:
            info['Y2'] = access_date_node.text
        
        academic_article = z_node.xpath('.//bibo:AcademicArticle', namespaces=namespaces)
        
        if not academic_article:
            resource_url = z_node.xpath('.//res:resource/@rdf:resource', namespaces=namespaces)
            if resource_url:
                try:
                    all_articles = root.xpath("//bibo:AcademicArticle", namespaces=namespaces)
                    academic_article = [article for article in all_articles 
                                     if article.get('{'+namespaces['rdf']+'}about') == resource_url[0]]
                except:
                    academic_article = []

        if academic_article:
            info.update(parse_jour(z_node, author_lookup=author_lookup, keyword_lookup=keyword_lookup))
            info['TY'] = 'JOUR'
        
        next_node = z_node.getnext()
        
        if ((next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Book') or 
            z_node.xpath('.//bibo:Book', namespaces=namespaces)):
            info.update(parse_book(z_node, author_lookup=author_lookup, keyword_lookup=keyword_lookup))
            info['TY'] = 'BOOK'

            if 'KW' in info and "Speciaal tijdschriftnummer" in set(info['KW']):
                info['TY'] = 'JFULL'

        elif ((next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}BookSection') or 
              z_node.xpath('.//bibo:BookSection', namespaces=namespaces)):
            info.update(parse_chapter(z_node, author_lookup=author_lookup, keyword_lookup=keyword_lookup))
            info['TY'] = 'CHAP'

        elif ((next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Webpage') or 
              z_node.xpath('.//bibo:Webpage', namespaces=namespaces)):
            info.update(parse_web(z_node, author_lookup=author_lookup, keyword_lookup=keyword_lookup))
            info['TY'] = 'WEB'

        elif ((next_node is not None and next_node.tag == '{' + namespaces['bibo'] + '}Film') or 
              z_node.xpath('.//bibo:Film', namespaces=namespaces)):
            info.update(parse_advs(z_node, author_lookup=author_lookup, keyword_lookup=keyword_lookup))
            info['TY'] = 'ADVS'

        if info:
            if 'KW' in info and 'foutmelding (keywords)' in info['KW']:
                info['KW'].remove('foutmelding (keywords)')
            parsed.append(info)

    with open(parsedf, 'w') as f:
        f.write(json.dumps(parsed, indent=2))

Processing files:   0%|          | 0/58 [00:00<?, ?it/s]

ADVS 2000s.rdf:   0%|          | 0/54 [00:00<?, ?it/s]

Nieuwe invoer 14-3-2024 tot 9-12-2024.rdf:   0%|          | 0/14504 [00:00<?, ?it/s]

WEB 1980s.rdf:   0%|          | 0/1 [00:00<?, ?it/s]

CHAP 1960s.rdf:   0%|          | 0/4084 [00:00<?, ?it/s]

CHAP 1940s.rdf:   0%|          | 0/1764 [00:00<?, ?it/s]

EJOUR MISC.rdf:   0%|          | 0/2 [00:00<?, ?it/s]

JOUR 1990s.rdf:   0%|          | 0/44498 [00:00<?, ?it/s]

ADVS 2020s.rdf:   0%|          | 0/2 [00:00<?, ?it/s]

BOOK 1980s.rdf:   0%|          | 0/6722 [00:00<?, ?it/s]

EJOUR 2010s.rdf:   0%|          | 0/569 [00:00<?, ?it/s]

JOUR MISC.rdf:   0%|          | 0/3483 [00:00<?, ?it/s]

JOUR 2000s.rdf:   0%|          | 0/27745 [00:00<?, ?it/s]

BOOK 2010s.rdf:   0%|          | 0/3891 [00:00<?, ?it/s]

JFULL 1970s.rdf:   0%|          | 0/273 [00:00<?, ?it/s]

JFULL 1950s.rdf:   0%|          | 0/17 [00:00<?, ?it/s]

JOUR 2020s.rdf:   0%|          | 0/5238 [00:00<?, ?it/s]

WEB 2010s.rdf:   0%|          | 0/85 [00:00<?, ?it/s]

ADVS 1990s.rdf:   0%|          | 0/33 [00:00<?, ?it/s]

BOOK MISC.rdf:   0%|          | 0/1182 [00:00<?, ?it/s]

JOUR 1980s.rdf:   0%|          | 0/34995 [00:00<?, ?it/s]

CHAP 1950s.rdf:   0%|          | 0/1161 [00:00<?, ?it/s]

EJOUR 2000s.rdf:   0%|          | 0/628 [00:00<?, ?it/s]

BOOK 1990s.rdf:   0%|          | 0/7991 [00:00<?, ?it/s]

JFULL MISC.rdf:   0%|          | 0/6051 [00:00<?, ?it/s]

EJOUR 2020s.rdf:   0%|          | 0/251 [00:00<?, ?it/s]

CHAP 1970s.rdf:   0%|          | 0/6650 [00:00<?, ?it/s]

WEB 1990s.rdf:   0%|          | 0/10 [00:00<?, ?it/s]

ADVS 2010s.rdf:   0%|          | 0/9 [00:00<?, ?it/s]

JFULL 1940s.rdf:   0%|          | 0/68 [00:00<?, ?it/s]

BOOK 2020s.rdf:   0%|          | 0/825 [00:00<?, ?it/s]

ADVS 1980s.rdf:   0%|          | 0/2 [00:00<?, ?it/s]

WEB 2000s.rdf:   0%|          | 0/485 [00:00<?, ?it/s]

JOUR 2010s.rdf:   0%|          | 0/20043 [00:00<?, ?it/s]

WEB MISC.rdf:   0%|          | 0/32 [00:00<?, ?it/s]

WEB 2020s.rdf:   0%|          | 0/1 [00:00<?, ?it/s]

JFULL 1960s.rdf:   0%|          | 0/190 [00:00<?, ?it/s]

BOOK 2000s.rdf:   0%|          | 0/6229 [00:00<?, ?it/s]

EJOUR 1990s.rdf:   0%|          | 0/97 [00:00<?, ?it/s]

CHAP 2000s.rdf:   0%|          | 0/11027 [00:00<?, ?it/s]

CHAP 2020s.rdf:   0%|          | 0/1627 [00:00<?, ?it/s]

JFULL 1980s.rdf:   0%|          | 0/619 [00:00<?, ?it/s]

JOUR 1960s.rdf:   0%|          | 0/19387 [00:00<?, ?it/s]

BOOK 1970s.rdf:   0%|          | 0/3623 [00:00<?, ?it/s]

JFULL 2010s.rdf:   0%|          | 0/442 [00:00<?, ?it/s]

BOOK 1950s.rdf:   0%|          | 0/917 [00:00<?, ?it/s]

CHAP MISC.rdf:   0%|          | 0/470 [00:00<?, ?it/s]

JOUR 1940s.rdf:   0%|          | 0/9897 [00:00<?, ?it/s]

CHAP 1990s.rdf:   0%|          | 0/14247 [00:00<?, ?it/s]

JFULL 1990s.rdf:   0%|          | 0/660 [00:00<?, ?it/s]

CHAP 2010s.rdf:   0%|          | 0/6300 [00:00<?, ?it/s]

JFULL 2020s.rdf:   0%|          | 0/161 [00:00<?, ?it/s]

BOOK 1940s.rdf:   0%|          | 0/1437 [00:00<?, ?it/s]

CHAP 1980s.rdf:   0%|          | 0/12289 [00:00<?, ?it/s]

JOUR 1950s.rdf:   0%|          | 0/6218 [00:00<?, ?it/s]

JOUR 1970s.rdf:   0%|          | 0/25511 [00:00<?, ?it/s]

ADVS MISC.rdf:   0%|          | 0/3 [00:00<?, ?it/s]

JFULL 2000s.rdf:   0%|          | 0/479 [00:00<?, ?it/s]

BOOK 1960s.rdf:   0%|          | 0/2185 [00:00<?, ?it/s]