In [1]:
import re
import os
import json
import glob
from copy import deepcopy
from collections     import Counter

from tqdm import tqdm
import pandas as pd
import Levenshtein as lev
import numpy as np

from pybtex.database import parse_string
from pylatexenc.latex2text import LatexNodes2Text
conv = LatexNodes2Text()

In [2]:
import rispy
mappings = deepcopy(rispy.TAG_KEY_MAPPING)
mappings['M2'] = 'extra'
mappings['AB'] = 'orig_abstract'
mappings['SV'] = 'series_volume'

In [3]:
def extract_isbn(input_string):
    """
    Extract an ISBN from an unstructured text string.
    
    The function searches for ISBN numbers that may include a check character 'X' at the end.
    It recognizes ISBNs both with and without spaces or hyphens between segments.
    """
    # Regular expression to match "ISBN:" followed by any combination of digits, hyphens, and possibly ending with an 'X'
    pattern = r'ISBN:?\s*([\d\-]+X?)'
    
    match = re.search(pattern, input_string)
    if match:
        return match.group(1)  # Return the matched part (ISBN number)
    else:
        return None  # No ISBN found following the "ISBN:" prefix
    
import re

def extract_issn(input_string):
    """
    Extract an ISSN from an unstructured text string.
    
    The function searches for ISSN numbers, which are typically in the format '1234-5678'.
    It recognizes ISSNs both with and without spaces or hyphens between segments.
    """
    # Regular expression to match "ISSN:" followed by a valid ISSN format
    pattern = r'ISSN:?\s*(\d{4}-\d{3}[\dX])'
    
    match = re.search(pattern, input_string)
    if match:
        return match.group(1)  # Return the matched part (ISSN number)
    else:
        return None  # No ISSN found

In [4]:
extract_isbn("ISBN: 978-0-19-880393-5. P. VII-VIII Acknowledgements; p. XI-XII List of illustrations; p. XIII-XXIV Preface: what this book is (not) about; p. 1-21 Introduction: biblical philology in the sixteenth century; p. 253-280 Bibliography; p. 281-296 Index.")

'978-0-19-880393-5'

In [5]:
def map_chapter(ris, bibt):
    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'authors' in ris:
                del ris['authors']
        ris['secondary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'secondary_authors' in ris:
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['tertiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed journal title:
    if 'title' in bibt.fields:
        ris['extra'] = ris['title']
        ris['title'] = bibt.fields['title']

    if 'booktitle' in bibt.fields:
        ris['secondary_title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['start_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        ris['tertiary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['series_volume'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

Collect already available normalizations for journal titles (so that we can find out below which one we miss):

In [6]:
jtitle = pd.read_excel('../data/journal_titles_master.xlsx')
existing_jtitles = set(jtitle['normalized'])
lower2jtitles = dict(zip(jtitle['normalized'].str.lower(), jtitle['normalized']))
jtitle.head(30)

Unnamed: 0,secondary_title,normalized,count,issn
0,Ons erfdeel: kultureel tijdschrift voor Zuidvl...,Ons erfdeel,2660.0,0030-2651
1,Dietsche warande en Belfort: tijdschrift voor ...,Dietsche warande en Belfort,2461.0,0012-2645
2,De nieuwe taalgids: tweemaandelijks tijdschrif...,De nieuwe taalgids,2359.0,0028-9922
3,Bzzlletin; Stichting BZZTôH Teater. Voorburg: ...,Bzzlletin,1638.0,0165-0858
4,Poëziekrant: tweemaandelijks tijdschrift. Gent...,Poëziekrant,1573.0,2030-0638
5,Onze taal: maandblad van het Genootschap Onze ...,Onze taal,1322.0,0165-7828
6,Vlaanderen: tweemaandelijks tijdschrift voor k...,Vlaanderen,1312.0,0042-7683
7,De gids: nieuwe vaderlandsche letteroefeningen...,De gids,1249.0,0016-9730
8,Levende talen: berichten en mededelingen van d...,Levende talen,1239.0,0024-1539
9,Tijdschrift voor Nederlandse taal- en letterku...,Tijdschrift voor Nederlandse taal- en letterkunde,962.0,0040-7550


In [7]:
def map_journal(ris, bibt):
    """
    Merges the newly structured information in the bibtex returned
    by the LLM into the already available RIS entry from the dump.
    Reliably structured information (e.g. authors, year, keywords, ...)
    from the RIS entries is maximally retained.
    """
    #print(ris)
    #print(bibt)
    #print('===============================================')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace unstructured title with parsed journal title (if available):
    if 'title' in bibt.fields:
        # keep track of original title description:
        ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
        if ris['title'].endswith(','):
            ris['title'] = ris['title'][:-1]
        if ris['title'].strip().lower() in ('in', 'untitled', 'title of the article', 'title of the article (if provided)'):
            ris['title'] = ''
    
    # parse pagination information:
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['end_page'] = bibt.fields['pages']

    # collect parsed journal title (unless we had that information already, which will be more reliable)
    if 'secondary_title' not in ris and 'journal' in bibt.fields:
        journal = bibt.fields['journal']
        # sometimes place of publication of the journal is added: we remove that
        journal = journal.split('(')[0].strip()
        journal = journal.split('[')[0].strip()
        ris['journal_name'] = journal
    elif 'secondary_title' in ris and 'journal' in bibt.fields:
        ris['journal_name'] = ris['secondary_title']
        del ris['secondary_title']

    if 'journal_name' in ris:
        jn = ris['journal_name']
        if jn.startswith('"') and jn.endswith('",'):
            jn = jn[1:-2]
        if jn.count('"') == 1:
            jn = jn.replace('"', '')
        ris['journal_name'] = jn
    
    # collect information on volume and issue
    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']
    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']
    if 'number' not in bibt.fields and 'issue' in bibt.fields:
        ris['number'] = bibt.fields['issue']
    
    if 'volume' in ris and not 'number' in ris:
        ris['number'] = ris['volume']
        del ris['volume']
    
    return ris

In [8]:
def map_book(ris, bibt):
    # extract ISBN for abstract field, if available:
    if 'orig_abstract' in ris:
        abstract = ris['orig_abstract']
        isbn = extract_isbn(abstract.strip())
        if isbn:
            ris['issn'] = isbn

    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'author' in ris:
                del ris['author']
            if 'authors' in ris:
                del ris['authors']
        ris['tertiary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'tertiary_authors' in ris:
        if 'author' in ris:
            del ris['author']
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['subsidiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')
        if 'author' in ris:
            del ris['author']
        if 'first_authors' in ris:
            del ris['first_authors']

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed book title:
    if 'title' in bibt.fields and bibt.fields['title']:
        try:
            ris['extra'].append(ris['title'])
        except KeyError:
            ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
    elif 'booktitle' in bibt.fields and bibt.fields['booktitle']:
        try:
            ris['extra'].append(ris['title'])
        except KeyError:
            ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['start_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        ris['start_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'place' in bibt.fields:
        place = bibt.fields['place']
        if ris['extra'] and '[' + place + ']' in ris['extra'][0]:
            place = '[' + place + ']'
        ris['place_published'] = place

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        if ';' in bibt.fields['series']:
            series, vol = [e.strip() for e in bibt.fields['series'].split(';', maxsplit=1)]
            ris['secondary_title'] = series
            ris['note'] = vol
        else:
            ris['secondary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['note'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['note'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

In [9]:
def map_jfull(ris, bibt):
    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # extract ISSN from title field, if available:
    if 'title' in ris:
        abstract = ris['title']
        issn = extract_issn(abstract.strip())
        if issn:
            ris['issn'] = issn
    
    # replace non-distinct title with parsed book title:
    if 'title' in bibt.fields and bibt.fields['title']:
        ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
    elif 'booktitle' in bibt.fields and bibt.fields['booktitle']:
        ris['extra'] = ris['title']
        ris['title'] = bibt.fields['booktitle']

    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'author' in ris:
                del ris['author']
            if 'authors' in ris:
                del ris['authors']
        ris['tertiary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'tertiary_authors' in ris:
        if 'author' in ris:
            del ris['author']
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['subsidiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')
        if 'author' in ris:
            del ris['author']
        if 'first_authors' in ris:
            del ris['first_authors']
    
    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # pagination information:
    if 'pagetotal' in bibt.fields:
        ris['start_page'] = bibt.fields['pagetotal']
    if 'pages' in bibt.fields:
        ris['start_page'] = bibt.fields['pages']
    
    # publisher information:
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    # place of publication:
    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']
    if 'place' in bibt.fields:
        place = bibt.fields['place']
        if ris['extra'] and '[' + place + ']' in ris['extra'][0]:
            place = '[' + place + ']'
        ris['place_published'] = place
    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['note'] = bibt.fields['number']

    # Hard reset to BOOK
    ris['type_of_reference'] = 'BOOK'

    if 'keywords' in ris:
        ris['keywords'] = list(set(ris['keywords']))
        ris['keywords'].append('Speciaal tijdschriftnummer')
    
    return ris

In [10]:
map_entry = {
             'JOUR': map_journal,
             'CHAP': map_chapter,
             'BOOK': map_book,
             'JFULL': map_jfull,
             'EJOUR': map_journal,
             'ADVS': map_book,
             'WEB': map_book,
            }

In [11]:
def deduplicate_bibtex(bibt):
    """
    Deduplicate repeated fields in the bibtex returned by the LLM.
    We only keep the first appearance of a given field.
    """
    lines, fields = [], set()
    for line in bibt.strip().split('\n'):
        if line.startswith('@') or line == '}':
            lines.append(line)
        else:
            field = line.split('=')[0].strip()
            if field not in fields:
                lines.append(line)
                fields.add(field)

    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    return clean


def clean_bibtex(bibt):
    """
    Attempts to correct some common syntactic errors in the bibtex
    returned by the LLM (which cause the pybtex parser to fail).
    """
    if not bibt:
        return ''
    
    # remove erroneous markdown syntax:
    bibt = bibt.replace('```bibtex', '').replace('```', '').replace("```tex", '')
    
    # sometimes mutliple bibtexs are created: we only keep the first one
    bibt = [b for b in bibt.split('@') if b.strip()]
    bibt = '@' + bibt[0]

    lines = []
    for line in bibt.strip().split('\n'):
        l = line.strip()

        # take care of spaces in the bibtex key:
        if l.startswith('@') and ' ' in l:
            line = ''.join(line.split())
        
        # fix missing entry keys:
        if l in ('@article{,', '@article{'):
            lines.append('@article{xxx,')
            continue
        if l in ('@book{,', '@book{'):
            lines.append('@book{xxx,')
            continue
        if l in ('@incollection{,', '@incollection{'):
            lines.append('@incollection{xxx,')
            continue

        # common errors:
        if l.endswith(']'):
            line += '},'
            lines.append(line)
            continue
        line = line.replace('{ )', '{}')
        if l.endswith("',"):
            line = line[:-2] + '},'
        
        # ensure that end-of-line syntax is respected:
        if l != '}':
            if not l.endswith('},'):
                if l.endswith('}'):
                    line += ','
                elif not l.endswith('}') and not l.endswith(','):
                    line += '},'
            if l.endswith('),'):
                line = line.replace('),', ')},')
        
        # add missing curly brackets:
        if '=' in l and (not '{' in l or not '}' in l):
            k, v = [e.strip() for e in l.split('=')][:2]
            v.replace(',', '')
            line = '  ' + k + '=' + '{' + v + '},'
        
        # remove lines with empty values:
        if '= {},' in l:
            continue

        if ' &' in line:
            line = line.replace(' &', ' \&')
        
        # correct curly bracket syntax in title field:
        if l.startswith('title') and l.count('}') > 1:
            k, v = [e.strip() for e in l.split('=')][:2]
            v = v.replace('{', '').replace('}', '')
            line = '  ' + k + '=' + '{' + v + '},'

        # correct syntax:
        if '",' in l and '=' in l:
            k, v = [e.strip() for e in l.split('=')][:2]
            if v.startswith('"') and v.endswith('",'):
                v = v[1:-2]
            line = '  ' + k + '=' + '{' + v + '},'

        lines.append(line)

    # recompose the lines of the bibtex entry:
    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    # return the deduplicated version of the bibtex entry:
    return deduplicate_bibtex(clean)

  line = line.replace(' &', ' \&')


In [12]:
llm_path = '../data/llm-dump'

new_jtitles = Counter()

for decade_folder in sorted(glob.glob(f'{llm_path}/*')):
    #if '2010s' not in decade_folder:
    #        continue
    print(':::', decade_folder, ':::')

    for spreadsheet_path in sorted(glob.glob(f'{decade_folder}/*.xlsx')):
        df = pd.read_excel(spreadsheet_path, header=0, engine='openpyxl')
        #n = 5000
        #if len(df) > n:
        #    df = df.sample(n)

        if 'bibtex' not in df.columns:
            continue
    
        ptype = os.path.basename(spreadsheet_path).replace('.xlsx', '')
        print('     - ', spreadsheet_path, f'({ptype})')

        if ptype not in ('BOOK', 'JOUR', 'CHAP', 'JFULL', 'EJOUR', 'ADVS', 'WEB'):
        #if ptype != 'JFULL':
            continue
        
        # parse the RIS (stored as JSON strings in the spreadsheet)
        df['RIS'] = df['RIS'].apply(json.loads)

        # clean (and deduplicate the bibtex returned by the LLM)
        cleaned = []
        for bt in df['bibtex']:
            if isinstance(bt, str):
                cleaned.append(clean_bibtex(bt))
            else:
                cleaned.append('')
        df['bibtex-clean'] = cleaned

        # Update the available RIS entries with newly structure info,
        # returned by the LLM (and keep tracked of whether or not that is successful):
        updated_ris, status = [], []
        for ris, bibtex_str in tqdm(list(zip(df['RIS'], df['bibtex-clean']))):
            if 'abstract' in ris:
                ris['orig_abstract'] = ris['abstract']
                del ris['abstract']
            if isinstance(bibtex_str, str):
                try:
                    #print(bibtex_parse)
                    bibtex_parse = parse_string(bibtex_str, 'bibtex')
                    single_key = list(bibtex_parse.entries.keys())[0]
                    updated = map_entry[ptype](ris.copy(), bibtex_parse.entries[single_key])

                    # keep track of new journal titles which lack a normalized variant,
                    # (unless the difference is only in capitalization):
                    if ptype in 'JOUR' and 'journal_name' in updated and updated['journal_name'] not in existing_jtitles:
                        try:
                            updated['journal_name'] = lower2jtitles[updated['journal_name'].lower()]
                        except KeyError:
                            new_jtitles[updated['journal_name']] += 1
                    
                    updated['label'] = 'success'
                    if 'authors' not in updated and 'secondary_authors' not in updated and \
                        'primary_authors' not in updated and 'tertiary_authors' not in updated \
                        and 'subsidiary_authors' not in updated:
                        if 'keywords' in updated:
                            updated['keywords'].append('Zonder auteur')
                        else:
                            updated['keywords'] = ['Zonder auteur']
                    
                    updated_ris.append(updated)
                    status.append('success')
                except Exception as e:
                    print(e)
                    ris['label'] = f'failure ({str(e)})'
                    updated_ris.append(ris)
                    status.append('failure')
            else:
                ris['label'] = 'failure'
                updated_ris.append(ris)
                status.append('failure')

        # store the newly merged information as a JSON string that holds a RIS entry:
        df['consolidated'] = [json.dumps(r, indent=2, ensure_ascii=False) for r in updated_ris]
        df['status'] = status

        # re-encode the original RIS entry as a JSON string in the original column:
        df['RIS'] = [json.dumps(d, indent=2, ensure_ascii=False) for d in df['RIS']]

        # remove the cleaned bibtex string:
        del df['bibtex-clean']

        # output new spreadsheet:
        df.to_excel(spreadsheet_path, index=False, header=True)

        # Ensure that 'extra' field is correctly set as a list for each record
        for record in updated_ris:
            if 'extra' in record and not isinstance(record['extra'], str):
                record['extra'] = ' /// '.join(record['extra'])

        # output updated RIS file:
        with open(f'{decade_folder}/{ptype}_consolidated.ris', 'w') as bibliography_file:
            rispy.dump(updated_ris, bibliography_file, mapping=mappings)

        # show the failure statistics:
        print(df['status'].value_counts())

::: ../data/llm-dump/1940s :::
     -  ../data/llm-dump/1940s/BOOK.xlsx (BOOK)


  9%|▊         | 124/1442 [00:00<00:01, 1235.20it/s]

list index out of range
list index out of range
list index out of range


 18%|█▊        | 256/1442 [00:00<00:00, 1284.68it/s]

list index out of range


 27%|██▋       | 388/1442 [00:00<00:00, 1298.88it/s]

syntax error in line 10: premature end of file
list index out of range


 46%|████▋     | 670/1442 [00:00<00:00, 1365.41it/s]

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range


 56%|█████▌    | 807/1442 [00:00<00:00, 1359.29it/s]

list index out of range
list index out of range
syntax error in line 8: premature end of file
list index out of range
list index out of range
list index out of range


 65%|██████▌   | 943/1442 [00:00<00:00, 1316.71it/s]

list index out of range
list index out of range
list index out of range


 75%|███████▍  | 1077/1442 [00:00<00:00, 1320.54it/s]

list index out of range


 94%|█████████▎| 1349/1442 [00:01<00:00, 1338.43it/s]

list index out of range
list index out of range


100%|██████████| 1442/1442 [00:01<00:00, 1332.65it/s]


status
success    1418
failure      24
Name: count, dtype: int64
     -  ../data/llm-dump/1940s/CHAP.xlsx (CHAP)


  0%|          | 0/1764 [00:00<?, ?it/s]

syntax error in line 8: premature end of file


 13%|█▎        | 221/1764 [00:00<00:01, 1103.78it/s]

Too many commas in 'uitv. comité: P. De Smaele, H. Uyttersprot, F. De Tollenaere, H. Liebaers, A. Van Elslander'


100%|██████████| 1764/1764 [00:01<00:00, 1079.75it/s]


status
success    1762
failure       2
Name: count, dtype: int64
     -  ../data/llm-dump/1940s/JFULL.xlsx (JFULL)


100%|██████████| 68/68 [00:00<00:00, 802.11it/s]

status
success    68
Name: count, dtype: int64





     -  ../data/llm-dump/1940s/JOUR.xlsx (JOUR)


  0%|          | 0/9897 [00:00<?, ?it/s]

list index out of range
list index out of range


  3%|▎         | 286/9897 [00:00<00:06, 1471.35it/s]

list index out of range


  6%|▋         | 628/9897 [00:00<00:05, 1623.03it/s]

list index out of range


 13%|█▎        | 1247/9897 [00:00<00:05, 1451.82it/s]

list index out of range
list index out of range
list index out of range


 19%|█▊        | 1853/9897 [00:01<00:05, 1465.03it/s]

list index out of range


 83%|████████▎ | 8251/9897 [00:05<00:01, 1502.16it/s]

syntax error in line 9: premature end of file


100%|██████████| 9897/9897 [00:06<00:00, 1480.80it/s]


status
success    9888
failure       9
Name: count, dtype: int64
::: ../data/llm-dump/1950s :::
     -  ../data/llm-dump/1950s/BOOK.xlsx (BOOK)


 54%|█████▍    | 493/917 [00:00<00:00, 1178.31it/s]

syntax error in line 9: premature end of file


100%|██████████| 917/917 [00:00<00:00, 1196.08it/s]


status
success    916
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/CHAP.xlsx (CHAP)


100%|██████████| 1161/1161 [00:01<00:00, 1160.82it/s]


status
success    1161
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/JFULL.xlsx (JFULL)


100%|██████████| 17/17 [00:00<00:00, 578.76it/s]


status
success    17
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/JOUR.xlsx (JOUR)


 92%|█████████▏| 5744/6218 [00:03<00:00, 1509.42it/s]

syntax error in line 9: premature end of file


100%|██████████| 6218/6218 [00:03<00:00, 1566.70it/s]


status
success    6217
failure       1
Name: count, dtype: int64
::: ../data/llm-dump/1960s :::
     -  ../data/llm-dump/1960s/BOOK.xlsx (BOOK)


 29%|██▊       | 624/2185 [00:00<00:01, 1130.78it/s]

syntax error in line 10: premature end of file
syntax error in line 10: premature end of file


 62%|██████▏   | 1354/2185 [00:01<00:00, 1186.18it/s]

syntax error in line 9: premature end of file


 85%|████████▍ | 1849/2185 [00:01<00:00, 1225.93it/s]

Too many commas in 'Menno ter Braak; [samengesteld door D. A. M. Binnendijk, Gerrit Borgers, Jan Hulsker, Jurriaan Schrofer en Ellen Warmond]'


100%|██████████| 2185/2185 [00:01<00:00, 1194.95it/s]


status
success    2181
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/CHAP.xlsx (CHAP)


100%|██████████| 4084/4084 [00:03<00:00, 1134.14it/s]


status
success    4084
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/JFULL.xlsx (JFULL)


100%|██████████| 190/190 [00:00<00:00, 741.85it/s]


syntax error in line 1: '(' or '{' expected
status
success    189
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/JOUR.xlsx (JOUR)


  5%|▍         | 882/19387 [00:00<00:12, 1438.91it/s]

syntax error in line 7: premature end of file


 38%|███▊      | 7378/19387 [00:04<00:07, 1531.60it/s]

syntax error in line 9: premature end of file


 43%|████▎     | 8289/19387 [00:05<00:07, 1491.49it/s]

syntax error in line 9: premature end of file


 44%|████▍     | 8589/19387 [00:05<00:07, 1491.50it/s]

Too many commas in 'Neuseeland, Schweiz, Holland, Italien'


 52%|█████▏    | 10091/19387 [00:06<00:06, 1492.60it/s]

Too many commas in 'Apmoal, apmits, apmet, helpman'
Too many commas in 'H. A. Gomperts, Harry Mulisch, Cees Nooteboom, et al.'


 55%|█████▌    | 10721/19387 [00:07<00:06, 1417.22it/s]

Too many commas in 'Hildeboldinga, Hilbolding, Hilbolling, Hubbeling, Hummelding, Hummeling'


 59%|█████▉    | 11465/19387 [00:07<00:05, 1479.19it/s]

syntax error in line 11: premature end of file


 62%|██████▏   | 11926/19387 [00:08<00:04, 1515.03it/s]

syntax error in line 1: entry key expected


 65%|██████▍   | 12535/19387 [00:08<00:04, 1467.91it/s]

syntax error in line 7: premature end of file


 79%|███████▉  | 15323/19387 [00:10<00:02, 1524.48it/s]

syntax error in line 10: premature end of file


 81%|████████▏ | 15780/19387 [00:10<00:02, 1482.84it/s]

syntax error in line 10: premature end of file


 85%|████████▌ | 16532/19387 [00:11<00:01, 1462.67it/s]

syntax error in line 10: premature end of file


 95%|█████████▌| 18435/19387 [00:12<00:00, 1430.45it/s]

syntax error in line 10: premature end of file


100%|██████████| 19387/19387 [00:13<00:00, 1482.92it/s]


status
success    19373
failure       14
Name: count, dtype: int64
::: ../data/llm-dump/1970s :::
     -  ../data/llm-dump/1970s/BOOK.xlsx (BOOK)


 39%|███▉      | 1429/3623 [00:01<00:01, 1252.38it/s]

syntax error in line 9: premature end of file


 81%|████████  | 2919/3623 [00:02<00:00, 1212.40it/s]

syntax error in line 6: '=' expected


 91%|█████████▏| 3308/3623 [00:02<00:00, 1259.71it/s]

syntax error in line 8: '=' expected


100%|██████████| 3623/3623 [00:03<00:00, 1204.48it/s]


Too many commas in 'met medew. van Cola Debrot, Charles Eyck, Albert Helman, [et al.]'
status
success    3619
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1970s/CHAP.xlsx (CHAP)


 66%|██████▌   | 4357/6650 [00:03<00:02, 1004.13it/s]

Too many commas in '[F. Berckelaers, Geert Grub, Herman van den Reeck, et al.]'


 72%|███████▏  | 4764/6650 [00:04<00:01, 996.84it/s] 

syntax error in line 10: premature end of file


 91%|█████████ | 6050/6650 [00:05<00:00, 978.52it/s]

Too many commas in 'Herman Uyttersprot, Claude van de Berge, Leo Mets, Adriaan Magerman, Daan Boens, Jan Vercammen'
Too many commas in 'Herman Uyttersprot, Claude van de Berge, Leo Mets, Adriaan Magerman, Daan Boens, Jan Vercammen; door José de Poortere... [et al.]'
syntax error in line 9: premature end of file


100%|██████████| 6650/6650 [00:06<00:00, 1053.28it/s]


status
success    6645
failure       5
Name: count, dtype: int64
     -  ../data/llm-dump/1970s/JFULL.xlsx (JFULL)


100%|██████████| 273/273 [00:00<00:00, 643.46it/s]

syntax error in line 1: '(' or '{' expected
status
success    272
failure      1
Name: count, dtype: int64





     -  ../data/llm-dump/1970s/JOUR.xlsx (JOUR)


  6%|▋         | 1625/25511 [00:01<00:16, 1447.73it/s]

Too many commas in 'Pedagogen, psychologen, leertheoretici, psycholinguïsten, linguïsten, sociolinguïsten'


 10%|▉         | 2486/25511 [00:01<00:16, 1414.10it/s]

syntax error in line 13: premature end of file


 14%|█▍        | 3522/25511 [00:02<00:14, 1470.70it/s]

syntax error in line 6: '}' expected


 20%|██        | 5180/25511 [00:03<00:13, 1499.33it/s]

syntax error in line 8: premature end of file


 26%|██▌       | 6674/25511 [00:04<00:12, 1483.69it/s]

syntax error in line 9: premature end of file


 28%|██▊       | 7269/25511 [00:05<00:12, 1417.09it/s]

syntax error in line 12: premature end of file


 40%|███▉      | 10082/25511 [00:06<00:10, 1469.47it/s]

syntax error in line 9: premature end of file


 49%|████▊     | 12420/25511 [00:08<00:09, 1383.51it/s]

syntax error in line 12: premature end of file


 51%|█████     | 12992/25511 [00:08<00:08, 1414.12it/s]

Too many commas in 'Spijkerhard, straatarm, doodkalm, druipnat, propvol'


 66%|██████▋   | 16957/25511 [00:11<00:05, 1460.07it/s]

Too many commas in 'De steen der wijze critici. Een schaduwloopje met M. Janssens, M.J.G. de Jong, H. Bousset, A. Nuis en J. Veulemans'


 85%|████████▍ | 21631/25511 [00:14<00:02, 1420.10it/s]

syntax error in line 9: premature end of file


 91%|█████████ | 23224/25511 [00:16<00:01, 1435.55it/s]

list index out of range
syntax error in line 12: premature end of file


 94%|█████████▍| 24104/25511 [00:16<00:00, 1441.28it/s]

syntax error in line 3: '=' expected


 97%|█████████▋| 24722/25511 [00:17<00:00, 1519.65it/s]

list index out of range


100%|██████████| 25511/25511 [00:17<00:00, 1451.50it/s]

list index out of range
syntax error in line 1: '(' or '{' expected
list index out of range





status
success    25493
failure       18
Name: count, dtype: int64
::: ../data/llm-dump/1980s :::
     -  ../data/llm-dump/1980s/ADVS.xlsx (ADVS)


100%|██████████| 2/2 [00:00<00:00, 1328.15it/s]

status
success    2
Name: count, dtype: int64





     -  ../data/llm-dump/1980s/BOOK.xlsx (BOOK)


 56%|█████▌    | 3743/6722 [00:03<00:02, 1218.12it/s]

syntax error in line 1: a valid name expected


 67%|██████▋   | 4488/6722 [00:03<00:01, 1207.23it/s]

syntax error in line 7: premature end of file


 82%|████████▏ | 5485/6722 [00:04<00:01, 1231.45it/s]

syntax error in line 4: premature end of file


100%|██████████| 6722/6722 [00:05<00:00, 1182.05it/s]


syntax error in line 4: premature end of file
status
success    6718
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/CHAP.xlsx (CHAP)


  7%|▋         | 816/12289 [00:00<00:10, 1058.95it/s]

syntax error in line 12: premature end of file


  9%|▉         | 1138/12289 [00:01<00:10, 1050.26it/s]

syntax error in line 10: premature end of file


 11%|█         | 1349/12289 [00:01<00:10, 1035.55it/s]

Too many commas in 'Werkgroep taal buitenlandse werknemers, R. Bok-Bennema, Roos van Eeden, Bert Jansen... [et al.]'


 15%|█▌        | 1877/12289 [00:01<00:10, 1021.14it/s]

Too many commas in 'Leidse werkgroep moedertaaldidactiek, Hans Hulshof (eindred.), Helge Bonset, Bernard Schut, Heleen van der Straaten'
Too many commas in 'Leidse werkgroep moedertaaldidactiek, Hans Hulshof (eindred.); Helge Bonset, Bernard Schut, Heleen van der Straaten et al.'


 29%|██▊       | 3533/12289 [00:03<00:08, 1021.74it/s]

syntax error in line 12: premature end of file


 42%|████▏     | 5205/12289 [00:05<00:07, 885.65it/s] 

Too many commas in 'Germonprez, Fred. Croquison, Pierre, Nicolas'


 57%|█████▋    | 7045/12289 [00:07<00:06, 827.05it/s]

Too many commas in 'samenstellers van de catalogus: Albert Ampe, Elly Cockx-Indestege, Erik Drigsdahl, Frans Hendrickx, Jozef Andriessen, Jan Deschamps, Karel Porteman, Paul Verdeyen'


 83%|████████▎ | 10142/12289 [00:10<00:02, 904.70it/s]

Too many commas in 'Mies Bouhuys (inl.), Wilma Soederhuizen (interviews), Anita Löwenhardt (research), Marjo van Soest (red.)'


 86%|████████▌ | 10530/12289 [00:11<00:01, 912.36it/s]

syntax error in line 11: '=' expected
Too many commas in 'Creten, J. & Geerts, G. & Jaspaert, K.'


 93%|█████████▎| 11389/12289 [00:12<00:01, 864.83it/s]

syntax error in line 6: '=' expected


100%|██████████| 12289/12289 [00:13<00:00, 927.66it/s]


status
success    12277
failure       12
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/JFULL.xlsx (JFULL)


 11%|█         | 69/619 [00:00<00:00, 681.29it/s]

syntax error in line 11: '}' expected


 54%|█████▎    | 332/619 [00:00<00:00, 635.95it/s]

syntax error in line 1: '(' or '{' expected


 87%|████████▋ | 540/619 [00:00<00:00, 670.66it/s]

syntax error in line 1: '(' or '{' expected
syntax error in line 1: '(' or '{' expected


100%|██████████| 619/619 [00:00<00:00, 649.61it/s]


status
success    615
failure      4
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/JOUR.xlsx (JOUR)


  9%|▉         | 3144/34995 [00:02<00:21, 1470.17it/s]

syntax error in line 10: premature end of file


 25%|██▌       | 8802/34995 [00:06<00:19, 1314.01it/s]

list index out of range


 27%|██▋       | 9348/34995 [00:06<00:18, 1350.72it/s]

syntax error in line 11: premature end of file


 28%|██▊       | 9624/34995 [00:06<00:18, 1342.71it/s]

list index out of range


 36%|███▌      | 12625/34995 [00:09<00:16, 1392.48it/s]

syntax error in line 11: premature end of file
syntax error in line 10: '=' expected


 40%|███▉      | 13851/34995 [00:10<00:17, 1188.93it/s]

syntax error in line 9: premature end of file


 46%|████▌     | 16094/34995 [00:11<00:14, 1332.51it/s]

syntax error in line 10: premature end of file


 48%|████▊     | 16910/34995 [00:12<00:13, 1350.71it/s]

syntax error in line 10: premature end of file
syntax error in line 9: premature end of file


 53%|█████▎    | 18543/34995 [00:13<00:12, 1352.24it/s]

Too many commas in 'Chabot, De Vree, Tentije, Groot'


 55%|█████▍    | 19229/34995 [00:14<00:11, 1365.12it/s]

syntax error in line 10: premature end of file


 57%|█████▋    | 20055/34995 [00:14<00:11, 1351.41it/s]

syntax error in line 10: premature end of file


 62%|██████▏   | 21551/34995 [00:15<00:09, 1355.30it/s]

Too many commas in 'Van verpreuvelen, tot verpreulen, verprillen, en wat er zoal bij komt kijken'


 67%|██████▋   | 23380/34995 [00:17<00:08, 1328.89it/s]

syntax error in line 2: '=' expected


 76%|███████▌  | 26600/34995 [00:19<00:05, 1434.84it/s]

syntax error in line 6: '}' expected


 88%|████████▊ | 30798/34995 [00:22<00:03, 1348.55it/s]

syntax error in line 10: premature end of file
syntax error in line 10: premature end of file


100%|██████████| 34995/34995 [00:25<00:00, 1373.81it/s]


status
success    34977
failure       18
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/WEB.xlsx (WEB)


100%|██████████| 1/1 [00:00<00:00, 843.08it/s]


status
success    1
Name: count, dtype: int64
::: ../data/llm-dump/1990s :::
     -  ../data/llm-dump/1990s/ADVS.xlsx (ADVS)


100%|██████████| 33/33 [00:00<00:00, 863.76it/s]

status
success    33
Name: count, dtype: int64





     -  ../data/llm-dump/1990s/BOOK.xlsx (BOOK)


 27%|██▋       | 2171/7992 [00:01<00:04, 1198.47it/s]

syntax error in line 4: premature end of file


 41%|████      | 3283/7992 [00:02<00:03, 1248.66it/s]

Too many commas in '[onder redactie van:] W. Haeseryn, K. Romijn, G. Geerts, J. de Rooij \\& M. C. van den Toorn'
syntax error in line 1: '(' or '{' expected


 53%|█████▎    | 4272/7992 [00:03<00:03, 1205.83it/s]

syntax error in line 1: '=' expected


 77%|███████▋  | 6116/7992 [00:05<00:01, 1091.79it/s]

Too many commas in 'Luc François (eindred.) en Beatrijs Baelde, Maarten Bresseleers, Sofie Descamps, Sofie Geschier, Thijs Lambrecht, Christophe Verbruggen'


 93%|█████████▎| 7415/7992 [00:06<00:00, 1065.03it/s]

syntax error in line 4: '=' expected


100%|██████████| 7992/7992 [00:06<00:00, 1144.77it/s]


status
success    7986
failure       6
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/CHAP.xlsx (CHAP)


 21%|██        | 2936/14247 [00:03<00:11, 949.37it/s]

syntax error in line 10: premature end of file


 37%|███▋      | 5303/14247 [00:05<00:10, 871.77it/s]

Too many commas in 'K. Humbeeck, E. Bruinsma, K. Haagdorens, J. Dierinck, B. Nuyens'


 40%|████      | 5735/14247 [00:06<00:10, 832.13it/s]

syntax error in line 12: premature end of file


 47%|████▋     | 6722/14247 [00:07<00:09, 768.06it/s]

syntax error in line 12: premature end of file


 51%|█████     | 7293/14247 [00:08<00:08, 789.47it/s]

syntax error in line 8: premature end of file


 54%|█████▍    | 7719/14247 [00:08<00:07, 851.73it/s]

syntax error in line 6: '=' expected


 73%|███████▎  | 10357/14247 [00:11<00:04, 910.57it/s]

syntax error in line 9: premature end of file


 75%|███████▌  | 10727/14247 [00:12<00:03, 899.31it/s]

syntax error in line 11: premature end of file


 83%|████████▎ | 11839/14247 [00:13<00:02, 908.10it/s]

Too many commas in 'de Jong, Erik \\& Schellekens, Claudia \\& Tummers, Harry'


100%|██████████| 14247/14247 [00:16<00:00, 867.74it/s]

Too many commas in "Boon, Louis Paul; bezorgd door K. Humbeeck, E. Bruinsma, K. Haagdorens, J. Dierinck \\& B. Nuyens; m.m.v. D. de Geest, Anne Marie Musschoot \\& Y. T'Sjoen"





status
success    14237
failure       10
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/EJOUR.xlsx (EJOUR)


100%|██████████| 97/97 [00:00<00:00, 866.14it/s]


list index out of range
status
success    96
failure     1
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/JFULL.xlsx (JFULL)


 77%|███████▋  | 511/660 [00:00<00:00, 592.02it/s]

syntax error in line 8: '=' expected
syntax error in line 13: '}' expected


100%|██████████| 660/660 [00:01<00:00, 567.56it/s]


syntax error in line 11: premature end of file
status
success    657
failure      3
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/JOUR.xlsx (JOUR)


  8%|▊         | 3644/44498 [00:02<00:29, 1400.44it/s]

syntax error in line 10: premature end of file


 17%|█▋        | 7581/44498 [00:05<00:26, 1405.11it/s]

syntax error in line 12: premature end of file


 21%|██        | 9391/44498 [00:06<00:25, 1363.16it/s]

syntax error in line 8: premature end of file


 25%|██▍       | 10931/44498 [00:07<00:24, 1380.71it/s]

Too many commas in 'Klos, klos, klos, hoorde je ze naar boven komen'


 26%|██▋       | 11763/44498 [00:08<00:23, 1369.26it/s]

syntax error in line 10: '=' expected


 28%|██▊       | 12320/44498 [00:08<00:23, 1385.31it/s]

syntax error in line 10: premature end of file


 31%|███▏      | 13986/44498 [00:10<00:21, 1416.21it/s]

syntax error in line 9: premature end of file


 35%|███▌      | 15639/44498 [00:11<00:21, 1314.99it/s]

syntax error in line 10: premature end of file


 46%|████▌     | 20427/44498 [00:14<00:17, 1384.29it/s]

syntax error in line 10: premature end of file
syntax error in line 1: '(' or '{' expected


 52%|█████▏    | 23209/44498 [00:17<00:15, 1373.80it/s]

syntax error in line 10: premature end of file


 55%|█████▌    | 24561/44498 [00:17<00:12, 1548.37it/s]

syntax error in line 12: premature end of file


 57%|█████▋    | 25178/44498 [00:18<00:12, 1511.14it/s]

syntax error in line 8: premature end of file


 62%|██████▏   | 27422/44498 [00:19<00:10, 1618.07it/s]

syntax error in line 1: '(' or '{' expected


 68%|██████▊   | 30260/44498 [00:21<00:09, 1457.88it/s]

syntax error in line 4: '=' expected
syntax error in line 4: '=' expected


 72%|███████▏  | 31831/44498 [00:22<00:08, 1415.83it/s]

syntax error in line 10: premature end of file


 72%|███████▏  | 32255/44498 [00:23<00:09, 1264.40it/s]

syntax error in line 11: premature end of file


 75%|███████▍  | 33233/44498 [00:23<00:08, 1367.67it/s]

syntax error in line 9: premature end of file


 76%|███████▋  | 33940/44498 [00:24<00:07, 1402.80it/s]

syntax error in line 7: premature end of file


 78%|███████▊  | 34773/44498 [00:25<00:07, 1338.05it/s]

list index out of range
syntax error in line 10: premature end of file
syntax error in line 9: premature end of file


 83%|████████▎ | 37114/44498 [00:26<00:05, 1396.49it/s]

syntax error in line 9: '=' expected


 89%|████████▉ | 39769/44498 [00:28<00:02, 1605.07it/s]

syntax error in line 10: premature end of file


100%|██████████| 44498/44498 [00:31<00:00, 1408.75it/s]


status
success    44473
failure       25
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/WEB.xlsx (WEB)


100%|██████████| 10/10 [00:00<00:00, 1224.29it/s]


status
success    10
Name: count, dtype: int64
::: ../data/llm-dump/2000s :::
     -  ../data/llm-dump/2000s/ADVS.xlsx (ADVS)


100%|██████████| 54/54 [00:00<00:00, 1021.49it/s]

status
success    54
Name: count, dtype: int64





     -  ../data/llm-dump/2000s/BOOK.xlsx (BOOK)


 14%|█▍        | 858/6229 [00:00<00:04, 1142.04it/s]

Too many commas in "Kris Humbeeck (wetenschappelijke leiding), Britt Kennis (coördinatie), Ernst Bruinsma, Anne Marie Musschoot, Matthijs de Ridder, Yves T'Sjoen"


 32%|███▏      | 1969/6229 [00:01<00:03, 1253.09it/s]

syntax error in line 8: premature end of file


 38%|███▊      | 2351/6229 [00:02<00:03, 1128.92it/s]

Too many commas in 'Harry N. Sierman \\& Querido, Reynoud Homan \\& Wim Quist, Irma Boon \\& Paul Fentener van Vlissingen; [samenstelling, tekst en interviews: Mathieu Loman; tekstbijdragen: Judith Belinfante ... et al.; fotografie: Iman Heystek... et al.]'
Too many commas in "Kris Humbeeck (wetenschappelijke leiding); Britt Kennis (coördinatie), Ernst Bruinsma, Anne Marie Musschoot, Matthijs de Ridder, Yves T'Sjoen"


 61%|██████    | 3800/6229 [00:03<00:02, 1180.56it/s]

Too many commas in 'Jo Tollebeek (hoofdredacteur), Geert Buelens, Gita Deneckere, Chantal Kesteloot, Sophie de Schaepdrijver'


 70%|███████   | 4382/6229 [00:03<00:01, 1083.08it/s]

Too many commas in 'M. Celeste Augusto, Karolien van Eck, Carla de Albuquerque Dias, Ivana Brasileiro Reis'
syntax error in line 2: '}' expected


 82%|████████▏ | 5087/6229 [00:04<00:00, 1149.76it/s]

Too many commas in 'Suzan van Dijk (chief editor), P. Broomans, J. F. van der Meulen, W. R. D. van Oostrum'
syntax error in line 3: premature end of file


100%|██████████| 6229/6229 [00:05<00:00, 1150.48it/s]


status
success    6220
failure       9
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/CHAP.xlsx (CHAP)


  5%|▌         | 600/11027 [00:00<00:12, 861.63it/s]

syntax error in line 1: '(' or '{' expected


 26%|██▌       | 2887/11027 [00:03<00:08, 963.93it/s] 

syntax error in line 10: premature end of file


 28%|██▊       | 3079/11027 [00:03<00:08, 903.04it/s]

Too many commas in 'Taeldeman, man van de taal, schatbewaarder van de taal; Johan De Caluwe, Georges De Schutter, Magda Devos, Jacques Van Keymeulen'


 30%|██▉       | 3257/11027 [00:03<00:09, 832.55it/s]

syntax error in line 1: '=' expected


 85%|████████▍ | 9339/11027 [00:11<00:02, 740.76it/s]

Too many commas in 'Lenz, Alexandra N. \\& Gooskens, Charlotte \\& Reker, Siemon'


 88%|████████▊ | 9668/11027 [00:11<00:01, 769.37it/s]

syntax error in line 6: '=' expected


100%|██████████| 11027/11027 [00:13<00:00, 832.25it/s]


status
success    11021
failure        6
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/EJOUR.xlsx (EJOUR)


100%|██████████| 628/628 [00:00<00:00, 1294.32it/s]


status
success    628
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/JFULL.xlsx (JFULL)


 51%|█████     | 244/479 [00:00<00:00, 613.37it/s]

Too many commas in 'Jimmy Koppen, Marnix Beyen, Christel Stalpaert, Harry Van Velthoven'
Too many commas in '[redactie: Toef Jaeger, Menno Lievers, Ilja Leonard Pfeijffer, Allard Schröder]'


100%|██████████| 479/479 [00:00<00:00, 604.95it/s]

syntax error in line 4: '=' expected
syntax error in line 4: '=' expected
Too many commas in 'Floris Cavyn, Evelyne Coussens (eindredactie), Wouter Hillaert (hoofd- en eindredactie), et al.'
Too many commas in 'Georges Martyn, Gretha Donker, Sjoerd Faber, Dirk Heirbaut'





status
success    473
failure      6
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/JOUR.xlsx (JOUR)


  4%|▍         | 1148/27745 [00:00<00:18, 1466.12it/s]

syntax error in line 10: premature end of file


 37%|███▋      | 10163/27745 [00:07<00:13, 1320.52it/s]

syntax error in line 11: premature end of file


 40%|████      | 11126/27745 [00:08<00:12, 1349.81it/s]

syntax error in line 4: premature end of file


 68%|██████▊   | 18959/27745 [00:13<00:06, 1432.35it/s]

syntax error in line 3: premature end of file


 80%|███████▉  | 22137/27745 [00:16<00:04, 1349.53it/s]

syntax error in line 10: premature end of file


 83%|████████▎ | 22934/27745 [00:16<00:03, 1315.29it/s]

syntax error in line 5: premature end of file


 85%|████████▍ | 23582/27745 [00:17<00:03, 1264.15it/s]

syntax error in line 10: premature end of file


 93%|█████████▎| 25941/27745 [00:19<00:01, 1283.87it/s]

syntax error in line 10: premature end of file
list index out of range


 99%|█████████▉| 27596/27745 [00:20<00:00, 1509.56it/s]

syntax error in line 9: premature end of file


100%|██████████| 27745/27745 [00:20<00:00, 1362.56it/s]


status
success    27735
failure       10
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/WEB.xlsx (WEB)


 29%|██▉       | 140/485 [00:00<00:00, 1393.18it/s]

Too many commas in 'redactie: Dirk Caluwé (redactionele leiding), An Boumans, Stefaan Croon, Sylvianne de Schepper, Katleen Maesen en Kristien Spillebeen; met dank aan Anne Ruette, Sara van Calster, Marianne van Scherpenzeel, Veronique Verreycken en de Werkgroep Spelling Suriname'


 58%|█████▊    | 280/485 [00:00<00:00, 1370.49it/s]

Too many commas in 'redactie: Peter Boot, Herman Brinkman, Peter de Bruijn, Jan Gielkens, Joke Roelevink en Renske Siemens'


100%|██████████| 485/485 [00:00<00:00, 1356.04it/s]


status
success    483
failure      2
Name: count, dtype: int64
::: ../data/llm-dump/2010s :::
     -  ../data/llm-dump/2010s/ADVS.xlsx (ADVS)


100%|██████████| 9/9 [00:00<00:00, 1233.42it/s]


status
success    9
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/BOOK.xlsx (BOOK)


 46%|████▌     | 1779/3891 [00:01<00:01, 1186.88it/s]

Too many commas in "[samenstelling:] Johan Pas \\& Yves T'Sjoen; [met medewerking van Filip Demeyer... [et al.; teksten: Roger de Neef, Johan Pas, Yves T'Sjoen, Els van Damme]"
Too many commas in 'Kurt Feyaerts, Geert Brône, Karoline Claes, Birgitta Meex, Steven Schoonjans \\& Jelena Vranjes'


 71%|███████   | 2760/3891 [00:02<00:00, 1201.44it/s]

syntax error in line 1: '=' expected


 81%|████████  | 3142/3891 [00:02<00:00, 1241.70it/s]

Too many commas in 'Carolien Ceton (hoofdredactie), Annemie Halsema, Ineke van der Burg, Karen Vintges en Veronica Vasterling'


 90%|█████████ | 3515/3891 [00:02<00:00, 1189.34it/s]

Too many commas in 'Michaël van Houtte, Pieterjan Buggenhout, Tom de Ridder, Veronique de Tier'
Too many commas in "Kris Humbeeck (wetenschappelijke leiding); Britt Kennis (coördinatie), Ernst Bruinsma, Taana Peeters, Matthijs de Ridder, Valerie Rousseau, Tom Sintobin, Yves T'Sjoen, Liesbeth Vantorre, Sara Verbeeck"
Too many commas in "Kris Humbeeck (wetenschappelĳke leiding), Britt Kennis (coördinatie), Ernst Bruinsma, Anne Marie Musschoot, Taana Peeters, Matthijs de Ridder, Yves T'Sjoen, Liesbeth Vantorre"


100%|██████████| 3891/3891 [00:03<00:00, 1232.09it/s]


syntax error in line 3: premature end of file
status
success    3883
failure       8
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/CHAP.xlsx (CHAP)


  4%|▎         | 236/6300 [00:00<00:07, 787.32it/s]

Too many commas in 'Astrid Geudens, Dieter Baeyens, Kirsten Schraeyen, Kathleen Maetens, Jolien de Brauwer \\& Maaike Loncke'


 13%|█▎        | 850/6300 [00:01<00:06, 819.22it/s]

Too many commas in 'Herbert Van Uffelen, Dirk de Geest, Susan Mahmody, Pieter Verstraeten'


 21%|██▏       | 1347/6300 [00:01<00:06, 798.22it/s]

syntax error in line 11: premature end of file


 47%|████▋     | 2947/6300 [00:03<00:04, 691.19it/s]

Too many commas in "Michiels, Ivo; Nuyens, Bart; T'Sjoen, Yves; van Damme, Els"


 69%|██████▉   | 4334/6300 [00:05<00:02, 769.25it/s]

syntax error in line 10: premature end of file


 77%|███████▋  | 4851/6300 [00:06<00:01, 842.47it/s]

syntax error in line 6: '=' expected


 83%|████████▎ | 5206/6300 [00:06<00:01, 804.16it/s]

Too many commas in 'Carolien Ceton (hoofdredactie), Annemie Halsema, Ineke van der Burg, Karen Vintges en Veronica Vasterling'
Too many commas in 'Carolien Ceton (hoofdredactie), Annemie Halsema, Ineke van der Burg, Karen Vintges en Veronica Vasterling'
Too many commas in 'Timothy Colleman, Johan De Caluwe, Veronique De Tier, Anne-Sophie Ghyselen, Liesbet Triest, Roxane Vandenberghe & Ulrike Vogl'


 91%|█████████ | 5726/6300 [00:07<00:00, 855.42it/s]

list index out of range
Too many commas in 'Els Hendrickx, Karl Hendrickx, Willy Martin, Hans Smessaert, William Van Belle en Joop van der Horst'


 96%|█████████▌| 6059/6300 [00:07<00:00, 776.76it/s]

Too many commas in 'Herbert van Uffelen, Dirk de Geest, Marlou de Bont, Christine Hermann'


100%|██████████| 6300/6300 [00:07<00:00, 788.26it/s]


status
success    6288
failure      12
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/EJOUR.xlsx (EJOUR)


 21%|██▏       | 121/569 [00:00<00:00, 1206.61it/s]

Too many commas in 'Peter Boot, Herman Brinkman, Peter de Bruijn, Jan Gielkens, Joke Roelevink en Renske Siemens'


100%|██████████| 569/569 [00:00<00:00, 1406.92it/s]


status
success    568
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/JFULL.xlsx (JFULL)


 14%|█▍        | 63/442 [00:00<00:00, 619.60it/s]

syntax error in line 10: premature end of file


 44%|████▍     | 195/442 [00:00<00:00, 615.46it/s]

syntax error in line 4: premature end of file
syntax error in line 1: entry key expected
syntax error in line 5: premature end of file


100%|██████████| 442/442 [00:00<00:00, 617.79it/s]


syntax error in line 6: premature end of file
syntax error in line 6: premature end of file
Too many commas in 'Gábor Pusztai, Réka Bozzay, Jaap Doedens, Annyke de Jong, Márta Kántor-Faragó \\& Gert Loosen'
status
success    435
failure      7
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/JOUR.xlsx (JOUR)


  1%|▏         | 273/20043 [00:00<00:14, 1366.98it/s]

syntax error in line 7: premature end of file


  9%|▉         | 1813/20043 [00:01<00:13, 1381.89it/s]

syntax error in line 10: premature end of file


 19%|█▉        | 3898/20043 [00:02<00:11, 1382.31it/s]

syntax error in line 10: premature end of file


 27%|██▋       | 5431/20043 [00:03<00:10, 1391.04it/s]

syntax error in line 8: premature end of file
syntax error in line 9: premature end of file


 32%|███▏      | 6506/20043 [00:04<00:10, 1303.94it/s]

Too many commas in 'Peter Boot, Herman Brinkman, Peter de Bruijn, Jan Gielkens, Joke Roelevink, Renske Siemens'
syntax error in line 1: '(' or '{' expected


 36%|███▋      | 7309/20043 [00:05<00:09, 1330.33it/s]

syntax error in line 9: premature end of file


 42%|████▏     | 8390/20043 [00:06<00:08, 1354.32it/s]

syntax error in line 5: premature end of file
syntax error in line 5: premature end of file


 75%|███████▌  | 15114/20043 [00:11<00:03, 1364.33it/s]

Too many commas in "Lucy B. en C. W. van der Hoogt-prijs 2017: advies van de Commissie voor schone letteren; [Pia de Jong, Kaster Freriks, Lieke Marsman, Gerard Raat, Yves T'Sjoen]"


 85%|████████▌ | 17060/20043 [00:12<00:02, 1259.78it/s]

syntax error in line 7: '=' expected


 98%|█████████▊| 19584/20043 [00:14<00:00, 1391.18it/s]

syntax error in line 10: premature end of file


100%|██████████| 20043/20043 [00:14<00:00, 1346.35it/s]


status
success    20030
failure       13
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/WEB.xlsx (WEB)


100%|██████████| 85/85 [00:00<00:00, 1189.71it/s]


status
success    85
Name: count, dtype: int64
::: ../data/llm-dump/2020s :::
     -  ../data/llm-dump/2020s/ADVS.xlsx (ADVS)


100%|██████████| 2/2 [00:00<00:00, 2173.78it/s]


syntax error in line 6: premature end of file
status
success    1
failure    1
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/BOOK.xlsx (BOOK)


100%|██████████| 825/825 [00:00<00:00, 1359.08it/s]

Too many commas in 'Dijkhof, E. C. i.s.m. A. Berteloot, J. A. A. M. Biemans, J. W. J. Burgers, V. Van Camp, H. van Engen, J. S. Love, E. De Paermentier, A. T. Smith, en M. K. Williams'





status
success    824
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/CHAP.xlsx (CHAP)


100%|██████████| 1627/1627 [00:01<00:00, 895.66it/s]


status
success    1627
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/EJOUR.xlsx (EJOUR)


100%|██████████| 251/251 [00:00<00:00, 1481.56it/s]


status
success    251
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/JFULL.xlsx (JFULL)


100%|██████████| 161/161 [00:00<00:00, 665.56it/s]


status
success    161
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/JOUR.xlsx (JOUR)


 22%|██▏       | 1173/5238 [00:00<00:02, 1391.13it/s]

syntax error in line 3: '=' expected


100%|██████████| 5238/5238 [00:03<00:00, 1351.14it/s]


status
success    5237
failure       1
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/WEB.xlsx (WEB)


100%|██████████| 1/1 [00:00<00:00, 1373.83it/s]


status
success    1
Name: count, dtype: int64
::: ../data/llm-dump/misc :::
     -  ../data/llm-dump/misc/ADVS.xlsx (ADVS)


100%|██████████| 3/3 [00:00<00:00, 875.64it/s]


status
success    3
Name: count, dtype: int64
     -  ../data/llm-dump/misc/BOOK.xlsx (BOOK)


 81%|████████  | 957/1182 [00:00<00:00, 1352.21it/s]

syntax error in line 7: '}' expected


100%|██████████| 1182/1182 [00:00<00:00, 1350.35it/s]


syntax error in line 7: premature end of file
status
success    1180
failure       2
Name: count, dtype: int64
     -  ../data/llm-dump/misc/CHAP.xlsx (CHAP)


100%|██████████| 470/470 [00:00<00:00, 1134.90it/s]


syntax error in line 14: premature end of file
status
success    469
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/misc/EJOUR.xlsx (EJOUR)


100%|██████████| 2/2 [00:00<00:00, 932.48it/s]


status
success    2
Name: count, dtype: int64
     -  ../data/llm-dump/misc/JFULL.xlsx (JFULL)


  5%|▍         | 279/6051 [00:00<00:08, 698.34it/s]

syntax error in line 7: '}' expected


  8%|▊         | 497/6051 [00:00<00:08, 679.67it/s]

syntax error in line 6: premature end of file
syntax error in line 7: premature end of file


 10%|█         | 632/6051 [00:00<00:08, 648.90it/s]

syntax error in line 1: a valid name expected
syntax error in line 1: a valid name expected


 15%|█▌        | 921/6051 [00:01<00:07, 683.34it/s]

syntax error in line 1: '(' or '{' expected


 20%|█▉        | 1199/6051 [00:01<00:07, 675.96it/s]

syntax error in line 1: a valid name expected
syntax error in line 2: '=' expected
syntax error in line 4: '}' expected


 22%|██▏       | 1335/6051 [00:01<00:07, 664.43it/s]

syntax error in line 4: '}' expected
syntax error in line 5: '}' expected


 28%|██▊       | 1695/6051 [00:02<00:06, 674.23it/s]

syntax error in line 5: '}' expected


 30%|███       | 1829/6051 [00:02<00:06, 650.83it/s]

Too many commas in 'Ostfriesischen Landschaft in Verb. mit den Heimatvereinen, der Industrie- und Handelskammer für Ostfriesland und Papenburg, der Handwerkskammer Aurich, dem Landesverkehrsverband Ostfriesland und dem Landwirtschaftlichen Hauptverein für Ostfriesland'
syntax error in line 6: premature end of file
entry with key JaarverslagCoehoorn has a duplicate issn field


 34%|███▍      | 2043/6051 [00:03<00:05, 690.94it/s]

syntax error in line 1: a valid name expected
syntax error in line 11: '=' expected


 39%|███▊      | 2338/6051 [00:03<00:05, 716.49it/s]

syntax error in line 8: premature end of file
syntax error in line 5: '}' expected
syntax error in line 1: '(' or '{' expected


 41%|████      | 2481/6051 [00:03<00:05, 661.57it/s]

syntax error in line 4: '}' expected
syntax error in line 5: '}' expected
syntax error in line 5: '}' expected
syntax error in line 5: '}' expected


 47%|████▋     | 2822/6051 [00:04<00:04, 798.82it/s]

syntax error in line 5: '}' expected
syntax error in line 6: '=' expected


 51%|█████     | 3080/6051 [00:04<00:03, 838.37it/s]

syntax error in line 5: '}' expected
syntax error in line 1: a valid name expected


 54%|█████▎    | 3252/6051 [00:04<00:03, 845.00it/s]

syntax error in line 7: premature end of file


 57%|█████▋    | 3432/6051 [00:04<00:03, 855.37it/s]

syntax error in line 5: '}' expected


 61%|██████    | 3687/6051 [00:05<00:02, 820.24it/s]

Too many commas in 'historische Kommission f{\\"u}r Hannover, Oldenburg, Braunschweig, Schaumburg-Lippe und Bremen'
syntax error in line 1: ')' expected
Too many commas in 'Wilken Engelbrecht, Judit Gera, Marta Kantor Farago, Jelica Novakovic, Jan Pekelder, Jana Raksanyiova'


 69%|██████▉   | 4193/6051 [00:05<00:02, 822.61it/s]

syntax error in line 1: a valid name expected
Too many commas in 'Centrale Vereniging voor Openbare Bibliotheken, Centrum voor Literatuuronderzoekers, Nederlands Instituut voor Informatie, Documentatie en Registratuur, Nederlandse Vereniging van Bedrijfsarchivarissen, Nederlandse Vereniging van Bibliothecarissen'


 72%|███████▏  | 4363/6051 [00:05<00:02, 821.73it/s]

syntax error in line 1: a valid name expected
syntax error in line 4: '}' expected
syntax error in line 8: '}' expected


 79%|███████▉  | 4788/6051 [00:06<00:01, 779.22it/s]

syntax error in line 16: premature end of file
syntax error in line 1: a valid name expected


 82%|████████▏ | 4944/6051 [00:06<00:01, 715.94it/s]

syntax error in line 1: a valid name expected


 87%|████████▋ | 5260/6051 [00:07<00:01, 754.91it/s]

Too many commas in 'Bureau voor Muziekauteursrecht, BUMA, Stichting tot Exploitatie van Mechanische Reproductierechten der Auteurs, STEMRA en Stichting SEBA tot Exploitatie van Auteursrechten'
syntax error in line 5: '}' expected
syntax error in line 5: '}' expected


 89%|████████▉ | 5413/6051 [00:07<00:00, 711.55it/s]

syntax error in line 5: '}' expected
syntax error in line 12: '=' expected
syntax error in line 5: '}' expected


 96%|█████████▌| 5790/6051 [00:07<00:00, 727.56it/s]

syntax error in line 1: entry key expected


100%|██████████| 6051/6051 [00:08<00:00, 740.13it/s]


status
success    6003
failure      48
Name: count, dtype: int64
     -  ../data/llm-dump/misc/JOUR.xlsx (JOUR)


 45%|████▍     | 1554/3483 [00:01<00:01, 1380.54it/s]

syntax error in line 9: premature end of file


 94%|█████████▍| 3274/3483 [00:02<00:00, 1399.42it/s]

syntax error in line 8: premature end of file
syntax error in line 8: premature end of file


100%|██████████| 3483/3483 [00:02<00:00, 1393.87it/s]


status
success    3480
failure       3
Name: count, dtype: int64
     -  ../data/llm-dump/misc/WEB.xlsx (WEB)


100%|██████████| 32/32 [00:00<00:00, 1381.28it/s]

Too many commas in 'Wim van Anrooij, Ingrid Biesheuvel, Karina van Dalen-Oskam, Jan Noordegraaf'
status
success    31
failure     1
Name: count, dtype: int64





Extract journal titles for which we don't have a normalization yet and map them provionally to the closest available normalized title (using the Levenshtein distance):

In [13]:
#mappings = []
#for nj, cnt in new_jtitles.items():
#    distances = np.array([lev.distance(nj, oj) for oj in jtitle['normalized']])
#    mappings.append([nj, cnt] + list(jtitle.iloc[np.argmin(distances)][['normalized', 'issn']]))

#mappings = pd.DataFrame(mappings, columns=['raw title', 'count', 'normalized', 'issn'])
#mappings = mappings.sort_values('count', ascending=False)
#mappings.head(30)

We save this spreadsheet for manual correction:

In [14]:
#mappings.to_excel('../data/journal_titles_2ndBatch.xlsx', header=True, index=False)