In [1]:
import re
import os
import json
import glob
from copy import deepcopy
from collections     import Counter

from tqdm import tqdm
import pandas as pd
import Levenshtein as lev
import numpy as np

from pybtex.database import parse_string
from pylatexenc.latex2text import LatexNodes2Text
conv = LatexNodes2Text()

In [2]:
import rispy
mappings = deepcopy(rispy.TAG_KEY_MAPPING)
mappings['M2'] = 'extra'
mappings['M3'] = 'orig_abstract'

In [3]:
def extract_isbn(input_string):
    """
    Extract an ISBN from an unstructured text string.
    
    The function searches for ISBN numbers that may include a check character 'X' at the end.
    It recognizes ISBNs both with and without spaces or hyphens between segments.
    """
    # Regular expression to match "ISBN:" followed by any combination of digits, hyphens, and possibly ending with an 'X'
    pattern = r'ISBN:?\s*([\d\-]+X?)'
    
    match = re.search(pattern, input_string)
    if match:
        return match.group(1)  # Return the matched part (ISBN number)
    else:
        return None  # No ISBN found following the "ISBN:" prefix
    
import re

def extract_issn(input_string):
    """
    Extract an ISSN from an unstructured text string.
    
    The function searches for ISSN numbers, which are typically in the format '1234-5678'.
    It recognizes ISSNs both with and without spaces or hyphens between segments.
    """
    # Regular expression to match "ISSN:" followed by a valid ISSN format
    pattern = r'ISSN:?\s*(\d{4}-\d{3}[\dX])'
    
    match = re.search(pattern, input_string)
    if match:
        return match.group(1)  # Return the matched part (ISSN number)
    else:
        return None  # No ISSN found

In [4]:
extract_isbn("ISBN: 978-0-19-880393-5. P. VII-VIII Acknowledgements; p. XI-XII List of illustrations; p. XIII-XXIV Preface: what this book is (not) about; p. 1-21 Introduction: biblical philology in the sixteenth century; p. 253-280 Bibliography; p. 281-296 Index.")

'978-0-19-880393-5'

In [5]:
def map_chapter(ris, bibt):
    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'authors' in ris:
                del ris['authors']
        ris['secondary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'secondary_authors' in ris:
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['tertiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed journal title:
    if 'title' in bibt.fields:
        ris['extra'] = ris['title']
        ris['title'] = bibt.fields['title']

    if 'booktitle' in bibt.fields:
        ris['secondary_title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['start_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        ris['tertiary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

Collect already available normalizations for journal titles (so that we can find out below which one we miss):

In [6]:
jtitle = pd.read_excel('../data/journal_titles_master.xlsx')
existing_jtitles = set(jtitle['normalized'])
lower2jtitles = dict(zip(jtitle['normalized'].str.lower(), jtitle['normalized']))
jtitle.head(30)

Unnamed: 0,secondary_title,normalized,count,issn
0,Ons erfdeel: kultureel tijdschrift voor Zuidvl...,Ons erfdeel,2660.0,0030-2651
1,Dietsche warande en Belfort: tijdschrift voor ...,Dietsche warande en Belfort,2461.0,0012-2645
2,De nieuwe taalgids: tweemaandelijks tijdschrif...,De nieuwe taalgids,2359.0,0028-9922
3,Bzzlletin; Stichting BZZTôH Teater. Voorburg: ...,Bzzlletin,1638.0,0165-0858
4,Poëziekrant: tweemaandelijks tijdschrift. Gent...,Poëziekrant,1573.0,2030-0638
5,Onze taal: maandblad van het Genootschap Onze ...,Onze taal,1322.0,0165-7828
6,Vlaanderen: tweemaandelijks tijdschrift voor k...,Vlaanderen,1312.0,0042-7683
7,De gids: nieuwe vaderlandsche letteroefeningen...,De gids,1249.0,0016-9730
8,Levende talen: berichten en mededelingen van d...,Levende talen,1239.0,0024-1539
9,Tijdschrift voor Nederlandse taal- en letterku...,Tijdschrift voor Nederlandse taal- en letterkunde,962.0,0040-7550


In [7]:
def map_journal(ris, bibt):
    """
    Merges the newly structured information in the bibtex returned
    by the LLM into the already available RIS entry from the dump.
    Reliably structured information (e.g. authors, year, keywords, ...)
    from the RIS entries is maximally retained.
    """
    #print(ris)
    #print(bibt)
    #print('===============================================')

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace unstructured title with parsed journal title (if available):
    if 'title' in bibt.fields:
        # keep track of original title description:
        ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
        if ris['title'].endswith(','):
            ris['title'] = ris['title'][:-1]
        if ris['title'].strip().lower() in ('in', 'untitled', 'title of the article', 'title of the article (if provided)'):
            ris['title'] = ''
    
    # parse pagination information:
    if 'pagetotal' in bibt.fields:
        ris['end_page'] = bibt.fields['pagetotal']
    if 'pages' in bibt.fields:
        pages = bibt.fields['pages'].split('-')
        if len(pages) == 2:
            ris['start_page'] = pages[0]
            ris['end_page'] = pages[1]
        else:
            ris['end_page'] = bibt.fields['pages']

    # collect parsed journal title (unless we had that information already, which will be more reliable)
    if 'secondary_title' not in ris and 'journal' in bibt.fields:
        journal = bibt.fields['journal']
        # sometimes place of publication of the journal is added: we remove that
        journal = journal.split('(')[0].strip()
        journal = journal.split('[')[0].strip()
        ris['journal_name'] = journal
    elif 'secondary_title' in ris and 'journal' in bibt.fields:
        ris['journal_name'] = ris['secondary_title']
        del ris['secondary_title']

    if 'journal_name' in ris:
        jn = ris['journal_name']
        if jn.startswith('"') and jn.endswith('",'):
            jn = jn[1:-2]
        if jn.count('"') == 1:
            jn = jn.replace('"', '')
        ris['journal_name'] = jn
    
    # collect information on volume and issue
    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']
    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']
    if 'number' not in bibt.fields and 'issue' in bibt.fields:
        ris['number'] = bibt.fields['issue']
    
    if 'volume' in ris and not 'number' in ris:
        ris['number'] = ris['volume']
        del ris['volume']
    
    return ris

In [8]:
def map_book(ris, bibt):
    # extract ISBN for abstract field, if available:
    if 'orig_abstract' in ris:
        abstract = ris['orig_abstract']
        isbn = extract_isbn(abstract.strip())
        if isbn:
            ris['issn'] = isbn

    # make editors authors, if applicable:
    if 'editor' in bibt.persons:
        if 'author' in bibt.persons:
            ris['first_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['author']]
        else:
            if 'author' in ris:
                del ris['author']
            if 'authors' in ris:
                del ris['authors']
        ris['secondary_authors'] = [conv.latex_to_text(str(editor)) for editor in bibt.persons['editor']]

    if 'first_authors' in ris and 'secondary_authors' in ris:
        if 'author' in ris:
            del ris['author']
        if 'authors' in ris:
            del ris['authors']

    # add translators
    if 'translator' in bibt.fields:
        ris['tertiary_authors'] = conv.latex_to_text(bibt.fields['translator']).split(' and ')
        if 'author' in ris:
            del ris['author']
        if 'first_authors' in ris:
            del ris['first_authors']

    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # replace non-distinct title with parsed book title:
    if 'title' in bibt.fields and bibt.fields['title']:
        try:
            ris['extra'].append(ris['title'])
        except KeyError:
            ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
    elif 'booktitle' in bibt.fields and bibt.fields['booktitle']:
        try:
            ris['extra'].append(ris['title'])
        except KeyError:
            ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['booktitle']
    
    if 'pagetotal' in bibt.fields:
        ris['start_page'] = bibt.fields['pagetotal']
    
    if 'pages' in bibt.fields:
        ris['start_page'] = bibt.fields['pages']
    
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']

    if 'place' in bibt.fields:
        place = bibt.fields['place']
        if ris['extra'] and '[' + place + ']' in ris['extra'][0]:
            place = '[' + place + ']'
        ris['place_published'] = place

    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'series' in bibt.fields:
        if ';' in bibt.fields['series']:
            series, vol = [e.strip() for e in bibt.fields['series'].split(';', maxsplit=1)]
            ris['secondary_title'] = series
            ris['note'] = vol
        else:
            ris['secondary_title'] = bibt.fields['series']

    if 'volume' in bibt.fields:
        ris['note'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['note'] = bibt.fields['number']

    if 'edition' in bibt.fields:
        ris['edition'] = bibt.fields['edition']
    
    return ris

In [9]:
def map_jfull(ris, bibt):
    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # extract ISSN from title field, if available:
    if 'title' in ris:
        abstract = ris['title']
        issn = extract_issn(abstract.strip())
        if issn:
            ris['issn'] = issn
    
    # replace non-distinct title with parsed book title:
    if 'title' in bibt.fields and bibt.fields['title']:
        ris['extra'] = [ris['title']]
        ris['title'] = bibt.fields['title']
    elif 'booktitle' in bibt.fields and bibt.fields['booktitle']:
        ris['extra'] = ris['title']
        ris['title'] = bibt.fields['booktitle']

    if 'first_authors' in ris:
        ris['secondary_authors'] = ris['first_authors']
        del ris['first_authors']
    
    if 'authors' in ris:
        ris['secondary_authors'] = ris['authors']
        del ris['authors']
    
    for f in bibt.fields:
        bibt.fields[f] = conv.latex_to_text(bibt.fields[f])
    
    # pagination information:
    if 'pagetotal' in bibt.fields:
        ris['start_page'] = bibt.fields['pagetotal']
    if 'pages' in bibt.fields:
        ris['start_page'] = bibt.fields['pages']
    
    # publisher information:
    if 'publisher' in bibt.fields:
        ris['publisher'] = bibt.fields['publisher']

    # place of publication:
    if 'place' in bibt.fields:
        ris['place_published'] = bibt.fields['place']
    if 'place' in bibt.fields:
        place = bibt.fields['place']
        if ris['extra'] and '[' + place + ']' in ris['extra'][0]:
            place = '[' + place + ']'
        ris['place_published'] = place
    if 'address' in bibt.fields:
        ris['place_published'] = bibt.fields['address']
    if 'location' in bibt.fields:
        ris['place_published'] = bibt.fields['location']

    if 'volume' in bibt.fields:
        ris['volume'] = bibt.fields['volume']

    if 'number' in bibt.fields:
        ris['number'] = bibt.fields['number']

    # Reset to BOOK
    ris['type_of_reference'] = 'BOOK'
    if 'keywords' in ris:
        ris['keywords'].append('Speciaal tijdschriftnummer')
    ris['keywords'] = tuple(set(ris['keywords']))
    
    return ris

In [10]:
map_entry = {
             'JOUR': map_journal,
             'CHAP': map_chapter,
             'BOOK': map_book,
             'JFULL': map_jfull,
             'EJOUR': map_journal,
             'ADVS': map_book,
             'WEB': map_book,
            }

In [11]:
def deduplicate_bibtex(bibt):
    """
    Deduplicate repeated fields in the bibtex returned by the LLM.
    We only keep the first appearance of a given field.
    """
    lines, fields = [], set()
    for line in bibt.strip().split('\n'):
        if line.startswith('@') or line == '}':
            lines.append(line)
        else:
            field = line.split('=')[0].strip()
            if field not in fields:
                lines.append(line)
                fields.add(field)

    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    return clean


def clean_bibtex(bibt):
    """
    Attempts to correct some common syntactic errors in the bibtex
    returned by the LLM (which cause the pybtex parser to fail).
    """
    if not bibt:
        return ''
    
    # remove erroneous markdown syntax:
    bibt = bibt.replace('```bibtex', '').replace('```', '').replace("```tex", '')
    
    # sometimes mutliple bibtexs are created: we only keep the first one
    bibt = [b for b in bibt.split('@') if b.strip()]
    bibt = '@' + bibt[0]

    lines = []
    for line in bibt.strip().split('\n'):
        l = line.strip()

        # take care of spaces in the bibtex key:
        if l.startswith('@') and ' ' in l:
            line = ''.join(line.split())
        
        # fix missing entry keys:
        if l in ('@article{,', '@article{'):
            lines.append('@article{xxx,')
            continue
        if l in ('@book{,', '@book{'):
            lines.append('@book{xxx,')
            continue
        if l in ('@incollection{,', '@incollection{'):
            lines.append('@incollection{xxx,')
            continue

        # common errors:
        if l.endswith(']'):
            line += '},'
            lines.append(line)
            continue
        line = line.replace('{ )', '{}')
        if l.endswith("',"):
            line = line[:-2] + '},'
        
        # ensure that end-of-line syntax is respected:
        if l != '}':
            if not l.endswith('},'):
                if l.endswith('}'):
                    line += ','
                elif not l.endswith('}') and not l.endswith(','):
                    line += '},'
            if l.endswith('),'):
                line = line.replace('),', ')},')
        
        # add missing curly brackets:
        if '=' in l and (not '{' in l or not '}' in l):
            k, v = [e.strip() for e in l.split('=')][:2]
            v.replace(',', '')
            line = '  ' + k + '=' + '{' + v + '},'
        
        # remove lines with empty values:
        if '= {},' in l:
            continue

        if ' &' in line:
            line = line.replace(' &', ' \&')
        
        # correct curly bracket syntax in title field:
        if l.startswith('title') and l.count('}') > 1:
            k, v = [e.strip() for e in l.split('=')][:2]
            v = v.replace('{', '').replace('}', '')
            line = '  ' + k + '=' + '{' + v + '},'

        # correct syntax:
        if '",' in l and '=' in l:
            k, v = [e.strip() for e in l.split('=')][:2]
            if v.startswith('"') and v.endswith('",'):
                v = v[1:-2]
            line = '  ' + k + '=' + '{' + v + '},'

        lines.append(line)

    # recompose the lines of the bibtex entry:
    clean = '\n'.join([l for l in lines if l])
    if not clean.strip().endswith('}'):
        clean += '\n}\n'
    
    # return the deduplicated version of the bibtex entry:
    return deduplicate_bibtex(clean)

  line = line.replace(' &', ' \&')


In [12]:
llm_path = '../data/llm-dump'

new_jtitles = Counter()

for decade_folder in sorted(glob.glob(f'{llm_path}/*')):
    #if '1990s' not in decade_folder:
    #        continue
    print(':::', decade_folder, ':::')

    for spreadsheet_path in sorted(glob.glob(f'{decade_folder}/*.xlsx')):
        df = pd.read_excel(spreadsheet_path, header=0, engine='openpyxl')
        #n = 5000
        #if len(df) > n:
        #    df = df.sample(n)

        if 'bibtex' not in df.columns:
            continue
    
        ptype = os.path.basename(spreadsheet_path).replace('.xlsx', '')
        print('     - ', spreadsheet_path, f'({ptype})')

        if ptype not in ('BOOK', 'JOUR', 'CHAP', 'JFULL', 'EJOUR', 'ADVS', 'WEB'):
        #if ptype != 'BOOK':
            continue
        
        # parse the RIS (stored as JSON strings in the spreadsheet)
        df['RIS'] = df['RIS'].apply(json.loads)

        # clean (and deduplicate the bibtex returned by the LLM)
        cleaned = []
        for bt in df['bibtex']:
            if isinstance(bt, str):
                cleaned.append(clean_bibtex(bt))
            else:
                cleaned.append('')
        df['bibtex-clean'] = cleaned

        # Update the available RIS entries with newly structure info,
        # returned by the LLM (and keep tracked of whether or not that is successful):
        updated_ris, status = [], []
        for ris, bibtex_str in tqdm(list(zip(df['RIS'], df['bibtex-clean']))):
            if 'abstract' in ris:
                ris['orig_abstract'] = ris['abstract']
                del ris['abstract']
            if isinstance(bibtex_str, str):
                try:
                    #print(bibtex_parse)
                    bibtex_parse = parse_string(bibtex_str, 'bibtex')
                    single_key = list(bibtex_parse.entries.keys())[0]
                    updated = map_entry[ptype](ris.copy(), bibtex_parse.entries[single_key])

                    # keep track of new journal titles which lack a normalized variant,
                    # (unless the difference is only in capitalization):
                    if ptype in 'JOUR' and 'journal_name' in updated and updated['journal_name'] not in existing_jtitles:
                        try:
                            updated['journal_name'] = lower2jtitles[updated['journal_name'].lower()]
                        except KeyError:
                            new_jtitles[updated['journal_name']] += 1
                    
                    updated['label'] = 'success'
                    updated_ris.append(updated)
                    status.append('success')
                except Exception as e:
                    #print(e)
                    ris['label'] = f'failure ({str(e)})'
                    updated_ris.append(ris)
                    status.append('failure')
            else:
                ris['label'] = 'failure'
                updated_ris.append(ris)
                status.append('failure')

        # store the newly merged information as a JSON string that holds a RIS entry:
        df['consolidated'] = [json.dumps(r, indent=2, ensure_ascii=False) for r in updated_ris]
        df['status'] = status

        # re-encode the original RIS entry as a JSON string in the original column:
        df['RIS'] = [json.dumps(d, indent=2, ensure_ascii=False) for d in df['RIS']]

        # remove the cleaned bibtex string:
        del df['bibtex-clean']

        # output new spreadsheet:
        df.to_excel(spreadsheet_path, index=False, header=True)

        # Ensure that 'extra' field is correctly set as a list for each record
        for record in updated_ris:
            if 'extra' in record and not isinstance(record['extra'], str):
                record['extra'] = ' /// '.join(record['extra'])

        # output updated RIS file:
        with open(f'{decade_folder}/{ptype}_consolidated.ris', 'w') as bibliography_file:
            rispy.dump(updated_ris, bibliography_file, mapping=mappings)

        # show the failure statistics:
        print(df['status'].value_counts())

::: ../data/llm-dump/1940s :::
     -  ../data/llm-dump/1940s/BOOK.xlsx (BOOK)


100%|██████████| 1442/1442 [00:01<00:00, 1368.76it/s]


status
success    1418
failure      24
Name: count, dtype: int64
     -  ../data/llm-dump/1940s/CHAP.xlsx (CHAP)


100%|██████████| 1764/1764 [00:01<00:00, 1078.88it/s]


status
success    1762
failure       2
Name: count, dtype: int64
     -  ../data/llm-dump/1940s/JFULL.xlsx (JFULL)


100%|██████████| 68/68 [00:00<00:00, 656.68it/s]

status
success    56
failure    12
Name: count, dtype: int64





     -  ../data/llm-dump/1940s/JOUR.xlsx (JOUR)


100%|██████████| 9897/9897 [00:06<00:00, 1470.73it/s]


status
success    9888
failure       9
Name: count, dtype: int64
::: ../data/llm-dump/1950s :::
     -  ../data/llm-dump/1950s/BOOK.xlsx (BOOK)


100%|██████████| 917/917 [00:00<00:00, 1188.57it/s]


status
success    916
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/CHAP.xlsx (CHAP)


100%|██████████| 1161/1161 [00:00<00:00, 1178.13it/s]


status
success    1161
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/JFULL.xlsx (JFULL)


100%|██████████| 17/17 [00:00<00:00, 350.57it/s]


status
success    16
failure     1
Name: count, dtype: int64
     -  ../data/llm-dump/1950s/JOUR.xlsx (JOUR)


100%|██████████| 6218/6218 [00:04<00:00, 1500.12it/s]


status
success    6217
failure       1
Name: count, dtype: int64
::: ../data/llm-dump/1960s :::
     -  ../data/llm-dump/1960s/BOOK.xlsx (BOOK)


100%|██████████| 2185/2185 [00:02<00:00, 1082.11it/s]


status
success    2181
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/CHAP.xlsx (CHAP)


100%|██████████| 4084/4084 [00:03<00:00, 1088.50it/s]


status
success    4084
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/JFULL.xlsx (JFULL)


100%|██████████| 190/190 [00:00<00:00, 747.72it/s]


status
success    174
failure     16
Name: count, dtype: int64
     -  ../data/llm-dump/1960s/JOUR.xlsx (JOUR)


100%|██████████| 19387/19387 [00:13<00:00, 1474.20it/s]


status
success    19373
failure       14
Name: count, dtype: int64
::: ../data/llm-dump/1970s :::
     -  ../data/llm-dump/1970s/BOOK.xlsx (BOOK)


100%|██████████| 3623/3623 [00:02<00:00, 1248.33it/s]


status
success    3619
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1970s/CHAP.xlsx (CHAP)


100%|██████████| 6650/6650 [00:06<00:00, 1053.06it/s]


status
success    6645
failure       5
Name: count, dtype: int64
     -  ../data/llm-dump/1970s/JFULL.xlsx (JFULL)


100%|██████████| 273/273 [00:00<00:00, 702.23it/s]


status
success    258
failure     15
Name: count, dtype: int64
     -  ../data/llm-dump/1970s/JOUR.xlsx (JOUR)


100%|██████████| 25511/25511 [00:17<00:00, 1463.01it/s]


status
success    25493
failure       18
Name: count, dtype: int64
::: ../data/llm-dump/1980s :::
     -  ../data/llm-dump/1980s/ADVS.xlsx (ADVS)


100%|██████████| 2/2 [00:00<00:00, 1195.47it/s]

status
success    2
Name: count, dtype: int64





     -  ../data/llm-dump/1980s/BOOK.xlsx (BOOK)


100%|██████████| 6722/6722 [00:05<00:00, 1191.93it/s]


status
success    6718
failure       4
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/CHAP.xlsx (CHAP)


100%|██████████| 12289/12289 [00:12<00:00, 958.12it/s] 


status
success    12277
failure       12
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/JFULL.xlsx (JFULL)


100%|██████████| 619/619 [00:00<00:00, 680.42it/s]


status
success    592
failure     27
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/JOUR.xlsx (JOUR)


100%|██████████| 34995/34995 [00:24<00:00, 1441.22it/s]


status
success    34977
failure       18
Name: count, dtype: int64
     -  ../data/llm-dump/1980s/WEB.xlsx (WEB)


100%|██████████| 1/1 [00:00<00:00, 784.86it/s]


status
success    1
Name: count, dtype: int64
::: ../data/llm-dump/1990s :::
     -  ../data/llm-dump/1990s/ADVS.xlsx (ADVS)


100%|██████████| 33/33 [00:00<00:00, 917.40it/s]

status
success    33
Name: count, dtype: int64





     -  ../data/llm-dump/1990s/BOOK.xlsx (BOOK)


100%|██████████| 7992/7992 [00:06<00:00, 1167.61it/s]


status
success    7986
failure       6
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/CHAP.xlsx (CHAP)


100%|██████████| 14247/14247 [00:16<00:00, 879.47it/s]


status
success    14237
failure       10
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/EJOUR.xlsx (EJOUR)


100%|██████████| 97/97 [00:00<00:00, 885.26it/s]


status
success    96
failure     1
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/JFULL.xlsx (JFULL)


100%|██████████| 660/660 [00:01<00:00, 616.93it/s]


status
success    636
failure     24
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/JOUR.xlsx (JOUR)


100%|██████████| 44498/44498 [00:31<00:00, 1433.76it/s]


status
success    44473
failure       25
Name: count, dtype: int64
     -  ../data/llm-dump/1990s/WEB.xlsx (WEB)


100%|██████████| 10/10 [00:00<00:00, 1243.01it/s]


status
success    10
Name: count, dtype: int64
::: ../data/llm-dump/2000s :::
     -  ../data/llm-dump/2000s/ADVS.xlsx (ADVS)


100%|██████████| 54/54 [00:00<00:00, 1051.85it/s]

status
success    54
Name: count, dtype: int64





     -  ../data/llm-dump/2000s/BOOK.xlsx (BOOK)


100%|██████████| 6229/6229 [00:05<00:00, 1224.96it/s]


status
success    6220
failure       9
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/CHAP.xlsx (CHAP)


100%|██████████| 11027/11027 [00:12<00:00, 850.86it/s]


status
success    11021
failure        6
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/EJOUR.xlsx (EJOUR)


100%|██████████| 628/628 [00:00<00:00, 1441.43it/s]


status
success    628
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/JFULL.xlsx (JFULL)


100%|██████████| 479/479 [00:00<00:00, 661.74it/s]


status
success    461
failure     18
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/JOUR.xlsx (JOUR)


100%|██████████| 27745/27745 [00:19<00:00, 1392.98it/s]


status
success    27735
failure       10
Name: count, dtype: int64
     -  ../data/llm-dump/2000s/WEB.xlsx (WEB)


100%|██████████| 485/485 [00:00<00:00, 1506.61it/s]


status
success    483
failure      2
Name: count, dtype: int64
::: ../data/llm-dump/2010s :::
     -  ../data/llm-dump/2010s/ADVS.xlsx (ADVS)


100%|██████████| 9/9 [00:00<00:00, 1125.89it/s]


status
success    9
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/BOOK.xlsx (BOOK)


100%|██████████| 3891/3891 [00:02<00:00, 1299.68it/s]


status
success    3883
failure       8
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/CHAP.xlsx (CHAP)


100%|██████████| 6300/6300 [00:07<00:00, 809.62it/s]


status
success    6288
failure      12
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/EJOUR.xlsx (EJOUR)


100%|██████████| 569/569 [00:00<00:00, 1516.79it/s]


status
success    568
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/JFULL.xlsx (JFULL)


100%|██████████| 442/442 [00:00<00:00, 687.71it/s]


status
success    416
failure     26
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/JOUR.xlsx (JOUR)


100%|██████████| 20043/20043 [00:14<00:00, 1352.83it/s]


status
success    20030
failure       13
Name: count, dtype: int64
     -  ../data/llm-dump/2010s/WEB.xlsx (WEB)


100%|██████████| 85/85 [00:00<00:00, 735.64it/s]


status
success    85
Name: count, dtype: int64
::: ../data/llm-dump/2020s :::
     -  ../data/llm-dump/2020s/ADVS.xlsx (ADVS)


100%|██████████| 2/2 [00:00<00:00, 2205.21it/s]

status
success    1
failure    1
Name: count, dtype: int64





     -  ../data/llm-dump/2020s/BOOK.xlsx (BOOK)


100%|██████████| 825/825 [00:00<00:00, 1363.85it/s]


status
success    824
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/CHAP.xlsx (CHAP)


100%|██████████| 1627/1627 [00:01<00:00, 880.50it/s]


status
success    1627
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/EJOUR.xlsx (EJOUR)


100%|██████████| 251/251 [00:00<00:00, 1493.64it/s]


status
success    251
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/JFULL.xlsx (JFULL)


100%|██████████| 161/161 [00:00<00:00, 706.05it/s]


status
success    150
failure     11
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/JOUR.xlsx (JOUR)


100%|██████████| 5238/5238 [00:03<00:00, 1373.78it/s]


status
success    5237
failure       1
Name: count, dtype: int64
     -  ../data/llm-dump/2020s/WEB.xlsx (WEB)


100%|██████████| 1/1 [00:00<00:00, 1290.56it/s]


status
success    1
Name: count, dtype: int64
::: ../data/llm-dump/misc :::
     -  ../data/llm-dump/misc/ADVS.xlsx (ADVS)


100%|██████████| 3/3 [00:00<00:00, 846.48it/s]


status
success    3
Name: count, dtype: int64
     -  ../data/llm-dump/misc/BOOK.xlsx (BOOK)


100%|██████████| 1182/1182 [00:00<00:00, 1345.81it/s]


status
success    1180
failure       2
Name: count, dtype: int64
     -  ../data/llm-dump/misc/CHAP.xlsx (CHAP)


100%|██████████| 470/470 [00:00<00:00, 1169.01it/s]


status
success    469
failure      1
Name: count, dtype: int64
     -  ../data/llm-dump/misc/EJOUR.xlsx (EJOUR)


100%|██████████| 2/2 [00:00<00:00, 1210.65it/s]


status
success    2
Name: count, dtype: int64
     -  ../data/llm-dump/misc/JFULL.xlsx (JFULL)


100%|██████████| 6051/6051 [00:07<00:00, 791.12it/s]


status
failure    4163
success    1888
Name: count, dtype: int64
     -  ../data/llm-dump/misc/JOUR.xlsx (JOUR)


100%|██████████| 3483/3483 [00:02<00:00, 1418.13it/s]


status
success    3480
failure       3
Name: count, dtype: int64
     -  ../data/llm-dump/misc/WEB.xlsx (WEB)


100%|██████████| 32/32 [00:00<00:00, 785.80it/s]

status
success    31
failure     1
Name: count, dtype: int64





Extract journal titles for which we don't have a normalization yet and map them provionally to the closest available normalized title (using the Levenshtein distance):

In [13]:
#mappings = []
#for nj, cnt in new_jtitles.items():
#    distances = np.array([lev.distance(nj, oj) for oj in jtitle['normalized']])
#    mappings.append([nj, cnt] + list(jtitle.iloc[np.argmin(distances)][['normalized', 'issn']]))

#mappings = pd.DataFrame(mappings, columns=['raw title', 'count', 'normalized', 'issn'])
#mappings = mappings.sort_values('count', ascending=False)
#mappings.head(30)

We save this spreadsheet for manual correction:

In [14]:
#mappings.to_excel('../data/journal_titles_2ndBatch.xlsx', header=True, index=False)