In [1]:
import re
import os
import json
import glob
from collections import Counter

from tqdm import tqdm
import pandas as pd
import Levenshtein as lev
import numpy as np

import rispy
from pybtex.database import parse_string
from pylatexenc.latex2text import LatexNodes2Text
conv = LatexNodes2Text()

In [2]:
def extract_isbn(input_string):
    """
    Extract an ISBN from an unstructured text string.
    """
    # Regular expression to match "ISBN:" followed by any space-free combination of digits and hyphens.
    pattern = r'ISBN:?\s*([\d-]+)'
    
    match = re.search(pattern, input_string)
    if match:
        return match.group(1)  # Return the matched part (ISBN number)
    else:
        return None  # No ISBN found following the "ISBN:" prefix

In [3]:
def map_book(ris, bibt):
    # extract ISBN for abstract field, if available:
    if 'abstract' in ris:
        abstract = ris['abstract']
        isbn = extract_isbn(abstract.strip())
        if isbn:
            ris['issn'] = isbn

    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'author' in ris:
                del ris['author']
            if 'authors' in ris:
                del ris['authors']
        ris['secondary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'secondary_authors' in ris:
        if 'author' in ris:
            del ris['author']
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['tertiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')
        if 'author' in ris:
            del ris['author']
        if 'first_authors' in ris:
            del ris['first_authors']

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed journal title:
    if 'title' in bibt.fields and bibt.fields['title'].split():
        ris['notes_abstract'] = ris['title']
        ris['title'] = bibt.fields['title']
    elif 'booktitle' in bibt.fields and bibt.fields['booktitle'].split():
        ris['notes_abstract'] = ris['title']
        ris['title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        ris['end_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        if ';' in bibt.fields['series']:
            series, vol = [e.strip() for e in bibt.fields['series'].split(';', maxsplit=1)]
            ris['secondary_title'] = series
            ris['volume'] = vol
        else:
            ris['secondary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

In [4]:
def map_chapter(ris, bibt):
    # extract ISBN for abstract field, if available:
    if 'abstract' in ris:
        abstract = ris['abstract']
        isbn = extract_isbn(abstract.strip())
        if isbn:
            ris['isbn'] = isbn

    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'authors' in ris:
                del ris['authors']
        ris['secondary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'secondary_authors' in ris:
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['tertiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed journal title:
    if 'title' in bibt.fields:
        ris['notes_abstract'] = ris['title']
        ris['title'] = bibt.fields['title']

    if 'booktitle' in bibt.fields:
        ris['secondary_title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['end_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        ris['tertiary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

Collect already available normalizations for journal titles (so that we can find out below which one we miss):

In [5]:
jtitle = pd.read_excel('../data/journal_titles_master.xlsx')
existing_jtitles = set(jtitle['normalized'])
lower2jtitles = dict(zip(jtitle['normalized'].str.lower(), jtitle['normalized']))
jtitle.head(30)

Unnamed: 0,secondary_title,normalized,count,issn
0,Ons erfdeel: kultureel tijdschrift voor Zuidvl...,Ons erfdeel,2660.0,0030-2651
1,Dietsche warande en Belfort: tijdschrift voor ...,Dietsche warande en Belfort,2461.0,0012-2645
2,De nieuwe taalgids: tweemaandelijks tijdschrif...,De nieuwe taalgids,2359.0,0028-9922
3,Bzzlletin; Stichting BZZTôH Teater. Voorburg: ...,Bzzlletin,1638.0,0165-0858
4,Poëziekrant: tweemaandelijks tijdschrift. Gent...,Poëziekrant,1573.0,2030-0638
5,Onze taal: maandblad van het Genootschap Onze ...,Onze taal,1322.0,0165-7828
6,Vlaanderen: tweemaandelijks tijdschrift voor k...,Vlaanderen,1312.0,0042-7683
7,De gids: nieuwe vaderlandsche letteroefeningen...,De gids,1249.0,0016-9730
8,Levende talen: berichten en mededelingen van d...,Levende talen,1239.0,0024-1539
9,Tijdschrift voor Nederlandse taal- en letterku...,Tijdschrift voor Nederlandse taal- en letterkunde,962.0,0040-7550


In [6]:
def map_journal(ris, bibt):
    """
    Merges the newly structured information in the bibtex returned
    by the LLM into the already available RIS entry from the dump.
    Reliably structured information (e.g. authors, year, keywords, ...)
    from the RIS entries is maximally retained.
    """
    #print(ris)
    #print(bibt)
    #print('===============================================')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace unstrucuted title with parsed journal title (if available):
    if 'title' in bibt.fields:
        # keep track of original title description:
        ris['notes_abstract'] = ris['title']
        ris['title'] = bibt.fields['title']
        if ris['title'].strip().lower() in ('in', 'untitled', 'title of the article', 'title of the article (if provided)'):
            ris['title'] = ''
    
    # parse pagination information:
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['end_page'] = bibt.fields['pages']

    # collect parsed journal title (unless we had that information already, which will be more reliable)
    if 'secondary_title' not in ris and 'journal' in bibt.fields:
        journal = bibt.fields['journal']
        # sometimes place of publication of the journal is added: we remove that
        journal = journal.split('(')[0].strip()
        journal = journal.split('[')[0].strip()
        ris['journal_name'] = journal
    elif 'secondary_title' in ris and 'journal' in bibt.fields:
        ris['journal_name'] = ris['secondary_title']
        del ris['secondary_title']

    if 'journal_name' in ris:
        jn = ris['journal_name']
        if jn.startswith('"') and jn.endswith('",'):
            jn = jn[1:-2]
        if jn.count('"') == 1:
            jn = jn.replace('"', '')
        ris['journal_name'] = jn
    
    # collect information on volume and issue
    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']
    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']
    if 'number' not in bibt.fields and 'issue' in bibt.fields:
        ris['number'] = bibt.fields['issue']
    
    return ris

In [7]:
map_entry = {
             'JOUR': map_journal,
             'CHAP': map_chapter,
             'BOOK': map_book,
            }

In [8]:
def deduplicate_bibtex(bibt):
    """
    Deduplicate repeated fields in the bibtex returned by the LLM.
    We only keep the first appearance of a given field.
    """
    lines, fields = [], set()
    for line in bibt.strip().split('\n'):
        if line.startswith('@') or line == '}':
            lines.append(line)
        else:
            field = line.split('=')[0].strip()
            if field not in fields:
                lines.append(line)
                fields.add(field)

    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    return clean


def clean_bibtex(bibt):
    """
    Attempts to correct some common syntactic errors in the bibtex
    returned by the LLM (which cause the pybtex parser to fail).
    """
    if not bibt:
        return ''
    
    # remove erroneous markdown syntax:
    bibt = bibt.replace('```bibtex', '').replace('```', '').replace("```tex", '')
    
    # sometimes mutliple bibtexs are created: we only keep the first one
    bibt = [b for b in bibt.split('@') if b.strip()]
    bibt = '@' + bibt[0]

    lines = []
    for line in bibt.strip().split('\n'):
        l = line.strip()

        # take care of spaces in the bibtex key:
        if l.startswith('@') and ' ' in l:
            line = ''.join(line.split())
        
        # fix missing entry keys:
        if l in ('@article{,', '@article{'):
            lines.append('@article{xxx,')
            continue

        # common errors:
        if l.endswith(']'):
            line += '},'
            lines.append(line)
            continue
        line = line.replace('{ )', '{}')
        if l.endswith("',"):
            line = line[:-2] + '},'
        
        # ensure that end-of-line syntax is respected:
        if l != '}':
            if not l.endswith('},'):
                if l.endswith('}'):
                    line += ','
                elif not l.endswith('}') and not l.endswith(','):
                    line += '},'
            if l.endswith('),'):
                line = line.replace('),', ')},')
        
        # add missing curly brackets:
        if '=' in l and (not '{' in l or not '}' in l):
            k, v = [e.strip() for e in l.split('=')][:2]
            v.replace(',', '')
            line = '  ' + k + '=' + '{' + v + '},'
        
        # correct curly bracket syntax in title field:
        if l.startswith('title') and l.count('}') > 1:
            k, v = [e.strip() for e in l.split('=')][:2]
            v = v.replace('{', '').replace('}', '')
            line = '  ' + k + '=' + '{' + v + '},'

        lines.append(line)

    # recompose the lines of the bibtex entry:
    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    # return the deduplicated version of the bibtex entry:
    return deduplicate_bibtex(clean)

In [9]:
d = """@article{smit1951,
  author = {Van Duinkerken, Anton},
  title = {Over: Ternauwernood},
  journal = {Critisch Bulletin},
  volume = {18},
  number = {10},
  month = {October},
  year = {1951},
  pages = {462-467},
  publisher = {Utrecht [etc.]
}"""
print(clean_bibtex(d))
assert parse_string(clean_bibtex(d), 'bibtex')

@article{smit1951,
  author = {Van Duinkerken, Anton},
  title = {Over: Ternauwernood},
  journal = {Critisch Bulletin},
  volume = {18},
  number = {10},
  month = {October},
  year = {1951},
  pages = {462-467},
  publisher = {Utrecht [etc.]},
}


In [10]:
s = """@article{,
  title={Uitreiking Taaluniepenning 1991},
  author={unknown},
  journal={Publikatieblad; Nederlandse Taalunie},
  year={1992},
  volume={23},
  number={jan},
  pages={1-9}
}"""

print(clean_bibtex(s))
assert parse_string(clean_bibtex(s), 'bibtex')

@article{xxx,
  title={Uitreiking Taaluniepenning 1991},
  author={unknown},
  journal={Publikatieblad; Nederlandse Taalunie},
  year={1992},
  volume={23},
  number={jan},
  pages={1-9},
}


In [11]:
llm_path = '../data/llm-dump'

new_jtitles = Counter()

for decade_folder in sorted(glob.glob(f'{llm_path}/*')):
    #if '1990s' not in decade_folder:
    #        continue
    print(':::', decade_folder, ':::')

    for spreadsheet_path in sorted(glob.glob(f'{decade_folder}/*.xlsx')):
        df = pd.read_excel(spreadsheet_path, header=0, engine='openpyxl')
        #n = 1000
        #if len(df) > n:
        #    df = df.sample(n)

        if 'bibtex' not in df.columns:
            continue
    
        ptype = os.path.basename(spreadsheet_path).replace('.xlsx', '')
        print('     - ', spreadsheet_path, f'({ptype})')

        if ptype != 'JOUR':
            continue
        
        # parse the RIS (stored as JSON strings in the spreadsheet)
        df['RIS'] = df['RIS'].apply(json.loads)

        # clean (and deduplicate the bibtex returned by the LLM)
        cleaned = []
        for bt in df['bibtex']:
            if isinstance(bt, str):
                cleaned.append(clean_bibtex(bt))
            else:
                cleaned.append('')
        df['bibtex-clean'] = cleaned

        # Update the available RIS entries with newly structure info,
        # returned by the LLM (and keep tracked of whether or not that is successful):
        updated_ris, status = [], []
        for ris, bibtex_str in tqdm(list(zip(df['RIS'], df['bibtex-clean']))):
            if isinstance(bibtex_str, str):
                try:
                    #print(bibtex_parse)
                    bibtex_parse = parse_string(bibtex_str, 'bibtex')
                    single_key = list(bibtex_parse.entries.keys())[0]
                    updated = map_entry[ptype](ris.copy(), bibtex_parse.entries[single_key])

                    # keep track of new journal titles which lack a normalized variant,
                    # (unless the difference is only in capitalization):
                    if ptype == 'JOUR' and 'journal_name' in updated and updated['journal_name'] not in existing_jtitles:
                        try:
                            updated['journal_name'] = lower2jtitles[updated['journal_name'].lower()]
                        except KeyError:
                            new_jtitles[updated['journal_name']] += 1
                    
                    updated['label'] = 'success'
                    updated_ris.append(updated)
                    status.append('success')
                except Exception as e:
                    print(e)
                    ris['label'] = f'failure ({str(e)})'
                    updated_ris.append(ris)
                    status.append('failure')
            else:
                ris['label'] = 'failure'
                updated_ris.append(ris)
                status.append('failure')

        # store the newly merged information as a JSON string that holds a RIS entry:
        df['consolidated'] = [json.dumps(r, indent=2, ensure_ascii=False) for r in updated_ris]
        df['status'] = status

        # re-encode the original RIS entry as a JSON string in the original column:
        df['RIS'] = [json.dumps(d, indent=2, ensure_ascii=False) for d in df['RIS']]

        # remove the cleaned bibtex string:
        del df['bibtex-clean']

        # output new spreadsheet:
        df.to_excel(spreadsheet_path, index=False, header=True)

        # output updated RIS file:
        with open(f'{decade_folder}/{ptype}_consolidated.ris', 'w') as bibliography_file:
            rispy.dump(updated_ris, bibliography_file)

        # show the failure statistics:
        print(df['status'].value_counts())

::: ../data/llm-dump/1940s :::
     -  ../data/llm-dump/1940s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1940s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1940s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1940s/JOUR.xlsx (JOUR)


  0%|          | 0/9897 [00:00<?, ?it/s]

list index out of range
list index out of range


  4%|▍         | 374/9897 [00:00<00:05, 1874.55it/s]

list index out of range
list index out of range


 13%|█▎        | 1279/9897 [00:00<00:05, 1660.88it/s]

list index out of range
list index out of range
list index out of range


 18%|█▊        | 1808/9897 [00:01<00:04, 1719.02it/s]

list index out of range


100%|██████████| 9897/9897 [00:05<00:00, 1749.13it/s]


status
success    9889
failure       8
Name: count, dtype: int64
::: ../data/llm-dump/1950s :::
     -  ../data/llm-dump/1950s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1950s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1950s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1950s/JOUR.xlsx (JOUR)


 93%|█████████▎| 5753/6218 [00:03<00:00, 1724.63it/s]

syntax error in line 9: premature end of file


100%|██████████| 6218/6218 [00:03<00:00, 1795.88it/s]


status
success    6217
failure       1
Name: count, dtype: int64
::: ../data/llm-dump/1960s :::
     -  ../data/llm-dump/1960s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1960s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1960s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1960s/JOUR.xlsx (JOUR)


  5%|▍         | 906/19387 [00:00<00:10, 1690.88it/s]

syntax error in line 7: premature end of file


 38%|███▊      | 7360/19387 [00:04<00:07, 1697.24it/s]

syntax error in line 9: premature end of file


 44%|████▍     | 8558/19387 [00:05<00:06, 1612.79it/s]

Too many commas in 'Neuseeland, Schweiz, Holland, Italien'


 52%|█████▏    | 10079/19387 [00:05<00:05, 1648.93it/s]

Too many commas in 'Apmoal, apmits, apmet, helpman'
Too many commas in 'H. A. Gomperts, Harry Mulisch, Cees Nooteboom, et al.'


 56%|█████▌    | 10780/19387 [00:06<00:05, 1685.29it/s]

Too many commas in 'Hildeboldinga, Hilbolding, Hilbolling, Hubbeling, Hummelding, Hummeling'


 59%|█████▉    | 11453/19387 [00:06<00:04, 1657.90it/s]

syntax error in line 11: premature end of file


 62%|██████▏   | 11963/19387 [00:07<00:04, 1678.19it/s]

syntax error in line 1: entry key expected


 65%|██████▌   | 12646/19387 [00:07<00:04, 1680.33it/s]

syntax error in line 7: premature end of file


 79%|███████▉  | 15359/19387 [00:09<00:02, 1726.57it/s]

syntax error in line 10: premature end of file


 82%|████████▏ | 15878/19387 [00:09<00:02, 1718.26it/s]

syntax error in line 10: premature end of file


 86%|████████▌ | 16586/19387 [00:09<00:01, 1758.19it/s]

syntax error in line 10: premature end of file


 95%|█████████▌| 18494/19387 [00:10<00:00, 1644.23it/s]

syntax error in line 10: premature end of file


100%|██████████| 19387/19387 [00:11<00:00, 1685.42it/s]


status
success    19374
failure       13
Name: count, dtype: int64
::: ../data/llm-dump/1970s :::
     -  ../data/llm-dump/1970s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1970s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1970s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1970s/JOUR.xlsx (JOUR)


  6%|▋         | 1632/25511 [00:01<00:14, 1703.11it/s]

Too many commas in '"Pedagogen, psychologen, leertheoretici, psycholinguïsten, linguïsten, sociolinguïsten",'


 10%|▉         | 2500/25511 [00:01<00:13, 1728.39it/s]

syntax error in line 13: premature end of file


 14%|█▍        | 3538/25511 [00:02<00:12, 1700.49it/s]

syntax error in line 6: '}' expected


 21%|██        | 5294/25511 [00:03<00:11, 1712.85it/s]

syntax error in line 8: premature end of file


 26%|██▋       | 6709/25511 [00:03<00:10, 1766.20it/s]

syntax error in line 9: premature end of file


 29%|██▉       | 7431/25511 [00:04<00:10, 1794.11it/s]

syntax error in line 12: premature end of file
Too many commas in '"Billon, B. - Billoen, B. - Billio",'


 40%|███▉      | 10099/25511 [00:05<00:08, 1748.93it/s]

syntax error in line 9: premature end of file


 49%|████▊     | 12392/25511 [00:07<00:07, 1712.25it/s]

syntax error in line 12: premature end of file


 51%|█████     | 12919/25511 [00:07<00:07, 1733.45it/s]

Too many commas in 'Spijkerhard, straatarm, doodkalm, druipnat, propvol'
syntax error in line 3: '}' expected


 67%|██████▋   | 16968/25511 [00:09<00:04, 1752.32it/s]

Too many commas in 'De steen der wijze critici. Een schaduwloopje met M. Janssens, M.J.G. de Jong, H. Bousset, A. Nuis en J. Veulemans'


 85%|████████▌ | 21705/25511 [00:12<00:02, 1684.07it/s]

syntax error in line 9: premature end of file


 91%|█████████ | 23234/25511 [00:13<00:01, 1654.91it/s]

list index out of range
syntax error in line 12: premature end of file


 95%|█████████▍| 24234/25511 [00:14<00:00, 1646.48it/s]

syntax error in line 3: '=' expected


 97%|█████████▋| 24752/25511 [00:14<00:00, 1687.83it/s]

list index out of range


100%|██████████| 25511/25511 [00:14<00:00, 1706.57it/s]


list index out of range
syntax error in line 1: '(' or '{' expected
list index out of range
status
success    25491
failure       20
Name: count, dtype: int64
::: ../data/llm-dump/1980s :::
     -  ../data/llm-dump/1980s/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/1980s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1980s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1980s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1980s/JOUR.xlsx (JOUR)


  9%|▉         | 3198/34995 [00:01<00:18, 1737.67it/s]

syntax error in line 10: premature end of file


 25%|██▌       | 8907/34995 [00:05<00:15, 1638.62it/s]

list index out of range


 27%|██▋       | 9405/34995 [00:05<00:15, 1640.18it/s]

syntax error in line 11: premature end of file


 28%|██▊       | 9737/34995 [00:05<00:15, 1649.04it/s]

list index out of range


 36%|███▋      | 12741/34995 [00:07<00:13, 1667.89it/s]

syntax error in line 12: premature end of file
syntax error in line 10: '=' expected


 40%|███▉      | 13901/34995 [00:08<00:12, 1634.28it/s]

syntax error in line 9: premature end of file


 46%|████▌     | 16083/34995 [00:09<00:11, 1640.36it/s]

syntax error in line 10: premature end of file


 48%|████▊     | 16747/34995 [00:10<00:12, 1502.93it/s]

Too many commas in '"Haffel, haffelen, haffelkatje",'
syntax error in line 10: premature end of file
syntax error in line 9: premature end of file


 53%|█████▎    | 18554/34995 [00:11<00:10, 1619.67it/s]

Too many commas in '"Chabot, De Vree, Tentije, Groot",'


 55%|█████▍    | 19204/34995 [00:11<00:09, 1601.46it/s]

syntax error in line 10: premature end of file


 58%|█████▊    | 20183/34995 [00:12<00:09, 1624.63it/s]

syntax error in line 10: premature end of file


 62%|██████▏   | 21672/34995 [00:13<00:08, 1624.74it/s]

Too many commas in 'Van verpreuvelen, tot verpreulen, verprillen, en wat er zoal bij komt kijken'


 67%|██████▋   | 23399/34995 [00:14<00:06, 1750.39it/s]

syntax error in line 2: '=' expected


 76%|███████▋  | 26707/34995 [00:16<00:05, 1654.87it/s]

syntax error in line 6: '}' expected


 88%|████████▊ | 30858/34995 [00:18<00:02, 1597.84it/s]

syntax error in line 10: premature end of file
syntax error in line 10: premature end of file


 91%|█████████ | 31829/34995 [00:19<00:02, 1581.02it/s]

syntax error in line 3: '=' expected


100%|██████████| 34995/34995 [00:21<00:00, 1647.80it/s]


status
success    34975
failure       20
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/WEB.xlsx (WEB)
::: ../data/llm-dump/1990s :::
     -  ../data/llm-dump/1990s/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/1990s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/1990s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/1990s/EJOUR.xlsx (EJOUR)
     -  ../data/llm-dump/1990s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/1990s/JOUR.xlsx (JOUR)


  3%|▎         | 1490/44498 [00:00<00:27, 1577.51it/s]

Too many commas in '"Bots, H. & Rademaker, C. S. M.",'


  8%|▊         | 3748/44498 [00:02<00:25, 1629.99it/s]

syntax error in line 10: premature end of file


 17%|█▋        | 7703/44498 [00:04<00:22, 1633.43it/s]

syntax error in line 12: premature end of file


 21%|██        | 9325/44498 [00:05<00:21, 1601.75it/s]

syntax error in line 8: premature end of file


 25%|██▍       | 10941/44498 [00:06<00:21, 1548.15it/s]

Too many commas in 'Klos, klos, klos, hoorde je ze naar boven komen'


 27%|██▋       | 11907/44498 [00:07<00:20, 1592.47it/s]

syntax error in line 10: '=' expected


 28%|██▊       | 12391/44498 [00:07<00:20, 1588.00it/s]

syntax error in line 10: premature end of file


 31%|███▏      | 13990/44498 [00:08<00:19, 1573.25it/s]

syntax error in line 9: premature end of file


 35%|███▍      | 15537/44498 [00:09<00:21, 1373.04it/s]

syntax error in line 10: premature end of file


 46%|████▌     | 20476/44498 [00:13<00:15, 1567.35it/s]

syntax error in line 10: premature end of file
syntax error in line 1: '(' or '{' expected


 52%|█████▏    | 23225/44498 [00:14<00:13, 1572.60it/s]

syntax error in line 10: premature end of file


 55%|█████▌    | 24561/44498 [00:15<00:12, 1639.59it/s]

syntax error in line 12: premature end of file


 62%|██████▏   | 27485/44498 [00:17<00:09, 1839.66it/s]

syntax error in line 1: '(' or '{' expected


 68%|██████▊   | 30405/44498 [00:19<00:08, 1676.60it/s]

syntax error in line 4: '=' expected
syntax error in line 4: '=' expected


 73%|███████▎  | 32407/44498 [00:20<00:07, 1662.81it/s]

syntax error in line 11: premature end of file


 75%|███████▍  | 33249/44498 [00:20<00:06, 1653.65it/s]

syntax error in line 9: premature end of file


 77%|███████▋  | 34085/44498 [00:21<00:06, 1663.04it/s]

syntax error in line 7: premature end of file


 78%|███████▊  | 34902/44498 [00:21<00:06, 1557.27it/s]

list index out of range
syntax error in line 10: premature end of file
syntax error in line 9: premature end of file


 83%|████████▎ | 37044/44498 [00:23<00:04, 1607.05it/s]

syntax error in line 9: '=' expected


 90%|████████▉ | 39908/44498 [00:24<00:02, 1799.07it/s]

syntax error in line 10: premature end of file


100%|██████████| 44498/44498 [00:27<00:00, 1616.59it/s]


status
success    44474
failure       24
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/WEB.xlsx (WEB)
::: ../data/llm-dump/2000s :::
     -  ../data/llm-dump/2000s/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/2000s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/2000s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/2000s/EJOUR.xlsx (EJOUR)
     -  ../data/llm-dump/2000s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/2000s/JOUR.xlsx (JOUR)


  4%|▍         | 1049/27745 [00:00<00:17, 1522.00it/s]

syntax error in line 10: premature end of file


 37%|███▋      | 10257/27745 [00:06<00:11, 1569.34it/s]

syntax error in line 11: premature end of file


 40%|████      | 11214/27745 [00:07<00:10, 1569.55it/s]

syntax error in line 4: premature end of file


 68%|██████▊   | 18875/27745 [00:11<00:05, 1618.65it/s]

syntax error in line 3: premature end of file


 80%|███████▉  | 22064/27745 [00:13<00:03, 1587.05it/s]

syntax error in line 10: premature end of file


 82%|████████▏ | 22851/27745 [00:14<00:03, 1556.51it/s]

syntax error in line 5: premature end of file


 85%|████████▌ | 23607/27745 [00:15<00:02, 1441.77it/s]

syntax error in line 10: premature end of file


 94%|█████████▎| 26008/27745 [00:16<00:01, 1612.12it/s]

syntax error in line 10: premature end of file
list index out of range


100%|█████████▉| 27611/27745 [00:17<00:00, 1742.27it/s]

syntax error in line 9: premature end of file


100%|██████████| 27745/27745 [00:17<00:00, 1582.85it/s]


status
success    27735
failure       10
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/WEB.xlsx (WEB)
::: ../data/llm-dump/2010s :::
     -  ../data/llm-dump/2010s/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/2010s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/2010s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/2010s/EJOUR.xlsx (EJOUR)
     -  ../data/llm-dump/2010s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/2010s/JOUR.xlsx (JOUR)


  2%|▏         | 308/20043 [00:00<00:12, 1538.81it/s]

syntax error in line 7: premature end of file


 10%|▉         | 1953/20043 [00:01<00:11, 1642.87it/s]

syntax error in line 10: premature end of file


 20%|█▉        | 3930/20043 [00:02<00:10, 1557.65it/s]

syntax error in line 10: premature end of file


 27%|██▋       | 5412/20043 [00:03<00:08, 1630.29it/s]

syntax error in line 8: premature end of file
syntax error in line 9: premature end of file


 33%|███▎      | 6529/20043 [00:04<00:08, 1547.53it/s]

Too many commas in 'Peter Boot, Herman Brinkman, Peter de Bruijn, Jan Gielkens, Joke Roelevink, Renske Siemens'
syntax error in line 1: '(' or '{' expected


 42%|████▏     | 8434/20043 [00:05<00:07, 1586.49it/s]

syntax error in line 5: premature end of file
syntax error in line 5: premature end of file


 76%|███████▌  | 15158/20043 [00:09<00:03, 1579.19it/s]

Too many commas in "Lucy B. en C. W. van der Hoogt-prijs 2017: advies van de Commissie voor schone letteren; [Pia de Jong, Kaster Freriks, Lieke Marsman, Gerard Raat, Yves T'Sjoen]"
Too many commas in '"[Barber van de Pol, Carl de Strycker, Maria Vlaar]",'
syntax error in line 3: '}' expected


 85%|████████▌ | 17072/20043 [00:10<00:02, 1386.00it/s]

syntax error in line 7: '=' expected


 98%|█████████▊| 19633/20043 [00:12<00:00, 1613.69it/s]

syntax error in line 10: premature end of file


100%|██████████| 20043/20043 [00:12<00:00, 1569.87it/s]


status
success    20029
failure       14
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/WEB.xlsx (WEB)
::: ../data/llm-dump/2020s :::
     -  ../data/llm-dump/2020s/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/2020s/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/2020s/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/2020s/EJOUR.xlsx (EJOUR)
     -  ../data/llm-dump/2020s/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/2020s/JOUR.xlsx (JOUR)


 23%|██▎       | 1184/5238 [00:00<00:02, 1692.17it/s]

syntax error in line 3: '=' expected


 54%|█████▎    | 2808/5238 [00:01<00:01, 1602.73it/s]

syntax error in line 3: '}' expected


100%|██████████| 5238/5238 [00:03<00:00, 1609.28it/s]


status
success    5236
failure       2
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/WEB.xlsx (WEB)
::: ../data/llm-dump/2030s :::
     -  ../data/llm-dump/2030s/BOOK.xlsx (BOOK)
::: ../data/llm-dump/misc :::
     -  ../data/llm-dump/misc/ADVS.xlsx (ADVS)
     -  ../data/llm-dump/misc/BOOK.xlsx (BOOK)
     -  ../data/llm-dump/misc/CHAP.xlsx (CHAP)
     -  ../data/llm-dump/misc/EJOUR.xlsx (EJOUR)
     -  ../data/llm-dump/misc/JFULL.xlsx (JFULL)
     -  ../data/llm-dump/misc/JOUR.xlsx (JOUR)


 98%|█████████▊| 3419/3483 [00:02<00:00, 1659.90it/s]

syntax error in line 8: premature end of file


100%|██████████| 3483/3483 [00:02<00:00, 1673.14it/s]


status
success    3482
failure       1
Name: count, dtype: int64
     -  ../data/llm-dump/misc/WEB.xlsx (WEB)


Extract journal titles for which we don't have a normalization yet and map them provionally to the closest available normalized title (using the Levenshtein distance):

In [12]:
mappings = []
for nj, cnt in new_jtitles.items():
    distances = np.array([lev.distance(nj, oj) for oj in jtitle['normalized']])
    mappings.append([nj, cnt] + list(jtitle.iloc[np.argmin(distances)][['normalized', 'issn']]))

mappings = pd.DataFrame(mappings, columns=['raw title', 'count', 'normalized', 'issn'])
mappings = mappings.sort_values('count', ascending=False)
mappings.sample(30)

Unnamed: 0,raw title,count,normalized,issn
3031,Tydskrif vir Nederlands & Afrikaans,20,Tydskrif vir wetenskap en kuns,0372-3526
3154,Tijdschrift voor Nederlandse taal- letterk...,1,Tijdschrift voor Nederlandse taal- en letterkunde,0040-7550
289,Land van mijn hart,1,Rond Janus en Bet,
2188,Verrigtinge van die vyfde driejaarlikse neerla...,2,Vereniging van Vlaamse toneelauteurs,
737,Rivista di etnografia,1,Lust en gratie,0168-8413
1125,Jaarboek van de Heemkundevereniging Roerstreek,1,Blad van de Heemkundevereniging Den Dungen,
3634,Nederkandsch museum,1,Nederlandse post,
1879,Juffr. Ida,5,Juffrouw Ida,0927-4847
448,Leidraad voor de wereld van reklame en marketing,1,Werkblad voor Nederlandse didactiek,
361,vol. 1,1,Volk,


We save this spreadsheet for manual correction:

In [13]:
mappings.to_excel('../data/journal_titles_2ndBatch.xlsx', header=True, index=False)