diff --git a/bibolamazi/core/version.py b/bibolamazi/core/version.py
index ca95a98..f3e432a 100644
--- a/bibolamazi/core/version.py
+++ b/bibolamazi/core/version.py
@@ -24,13 +24,13 @@
 
 # This is the current BIBOLAMAZI version.
 # Bump the number here for different version numbers.
-version_str = "4.5"
+version_str = "4.6b0"
 """
 The version string. This is increased upon each release.
 """
 
-copyright_year = "2021"
+copyright_year = "2023"
 """
 Year of copyright.
 """
 
diff --git a/bibolamazi/filters/url.py b/bibolamazi/filters/url.py
index 0ee3588..3dc5488 100644
--- a/bibolamazi/filters/url.py
+++ b/bibolamazi/filters/url.py
@@ -54,7 +54,8 @@ class UrlNormalizeFilter(BibFilter):
     helptext = HELP_TEXT
 
     def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, StripArxivUrl=True,
-                 UrlFromDoi=False, UrlFromArxiv=False, KeepFirstUrlOnly=False, StripForTypes=None,
+                 UrlFromDoi=False, UrlFromArxiv=False, RemoveArxivDoi=True,
+                 KeepFirstUrlOnly=False, StripForTypes=None,
                  AddAsHowPublished=False, HowPublishedText='available at {urlstr}'):
         r"""
         UrlNormalizeFilter constructor.
@@ -80,6 +81,9 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
           that links to the arXiv page, i.e. `https://arxiv.org/abs/<arxivid>`
           [default: False]
 
+        - RemoveArxivDoi(bool): Remove DOIs of the form
+          `https://doi.org/10.48550/arXiv.<arxivid>`. [default: True]
+
         - KeepFirstUrlOnly(bool): If the entry has several URLs, then after
           applying all the other stripping rules, keep only the first remaining
           URL, if any. [default: False]
@@ -94,7 +98,6 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
          list of URLs concatenated with a comma, '{url}' to insert the first
          url and the key 'urls' is passed the raw Python list as argument.
-
         """
         super().__init__()
 
@@ -104,6 +107,7 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
         self.striparxivurl = getbool(StripArxivUrl)
         self.urlfromdoi = getbool(UrlFromDoi)
         self.urlfromarxiv = getbool(UrlFromArxiv)
+        self.removearxivdoi = getbool(RemoveArxivDoi)
         self.keepfirsturlonly = getbool(KeepFirstUrlOnly)
         self.stripfortypes = None
         if (StripForTypes is not None):
@@ -184,6 +188,22 @@ def filter_bibentry(self, entry):
 
         #logger.longdebug("%s: urls is now %r", entry.key, urls)
 
+        if self.removearxivdoi:
+            if 'doi' in entry.fields:
+                dois = re.split(r'[ \t\n,]+', entry.fields['doi'])
+                new_dois = [
+                    doi
+                    for doi in dois
+                    if arxivutil.rx_arxiv_own_doi.match(doi) is None
+                ]
+                newdoifield = " ".join(new_dois)
+                if newdoifield == "":
+                    del entry.fields['doi']
+                else:
+                    entry.fields['doi'] = newdoifield
+
+        #logger.longdebug("%s: urls is now %r", entry.key, urls)
+
         if (self.keepfirsturlonly):
             if (urls):
                 urls[1:] = []
diff --git a/bibolamazi/filters/util/arxivutil.py b/bibolamazi/filters/util/arxivutil.py
index b96d1e4..b8536eb 100644
--- a/bibolamazi/filters/util/arxivutil.py
+++ b/bibolamazi/filters/util/arxivutil.py
@@ -101,12 +101,16 @@ def _mk_braced_pair_rx(mid):
 
 _rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<month>\d{2})(?:\.\d{4,}|\d{3})')
 
+
+rx_arxiv_own_doi = re.compile(r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$', re.IGNORECASE)
+
+
 #
 # A list of fields which are inspected for arXiv information. This is useful for
 # cache invalidation in various instances.
 #
 arxivinfo_from_bibtex_fields = [
-    'journal', 'doi', 'eprint', 'arxivid', 'url',
+    'journal', 'doi', 'eprint', 'arxivid', 'arxiv', 'url',
     'note', 'annote', 'primaryclass', 'archiveprefix',
 ]
 
@@ -186,20 +190,41 @@ def extract_pure_id(x, primaryclass=None):
         return m.group('purearxivid')
 
-    if ('doi' in fields and fields['doi']):
-        d['doi'] = fields['doi']
+    if 'doi' in fields:
+        dois = re.split(r'[ \t\n,]+', fields['doi'])
+        for doi in dois:
+            if doi.strip() == "":
+                continue
+            m = re.match(rx_arxiv_own_doi, doi)
+            if m is not None:
+                # this is arXiv's own DOI -- get the arXiv ID from it
+                d['arxivid'] = m.group('arxivid')
+            else:
+                # this is a journal DOI -- keep only the first one
+                if d.get('doi', None) is None:
+                    d['doi'] = doi
 
-    if ('eprint' in fields):
-        # this gives the arxiv ID
-        try:
-            d['arxivid'] = extract_pure_id(fields['eprint'], primaryclass=fields.get('primaryclass', None))
-            m = re.match(r'^([-\w.]+)/', d['arxivid'])
-            if (m):
-                d['primaryclass'] = m.group(1)
-        except IndexError as e:
-            logger.longdebug("Indexerror: invalid arXiv ID [%r/]%r: %s",
-                             fields.get('primaryclass',None), fields['eprint'], e)
-            logger.warning("Entry `%s' has invalid arXiv ID %r", entry.key, fields['eprint'])
+    if d['arxivid'] is None:
+        for eprintfield in ('arxivid', 'arxiv', 'eprint'):
+            if eprintfield not in fields:
+                continue
+            # this field might reveal the arxiv ID
+            arxivid = None
+            try:
+                arxivid = extract_pure_id(fields[eprintfield],
+                                          primaryclass=fields.get('primaryclass', None))
+            except IndexError as e:
+                logger.longdebug("IndexError: invalid arXiv ID in field ‘%s’ [%r/]%r: %s",
+                                 eprintfield, fields.get('primaryclass',None), fields[eprintfield], e)
+                # could be because, e.g., the Zotero exporter used the PubMed ID here.
+                logger.debug("Entry `%s' has invalid arXiv ID %r in eprint field ‘%s’",
+                             entry.key, fields[eprintfield], eprintfield)
+                continue
+            if arxivid is not None:
+                d['arxivid'] = arxivid
+                m = re.match(r'^([-\w.]+)/', arxivid)
+                if (m):
+                    d['primaryclass'] = m.group(1)
+                break
 
     if ('primaryclass' in fields):
         d['primaryclass'] = fields['primaryclass']
 
@@ -207,7 +232,7 @@ def extract_pure_id(x, primaryclass=None):
     if ('archiveprefix' in fields):
         d['archiveprefix'] = fields['archiveprefix']
 
-    logger.longdebug("processed doi,eprint,primaryclass,archiveprefix fields -> d = %r", d)
+    logger.longdebug("processed doi,eprint,arxiv,arxivid,primaryclass,archiveprefix fields -> d = %r", d)
 
 
 def processNoteField(notefield, d, isurl=False):
@@ -651,12 +676,17 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                         entrydic[k]['primaryclass'][:len(primaryclass)] !=
                         primaryclass[:len(entrydic[k]['primaryclass'])]):
                     #
-                    summary_info_mismatch.append(
-                        (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
-                    )
+                    # ### Ignore mismatches in primaryclass: e.g., Zotero's exporter
+                    # ### exports all archive classes as a comma-separated list,
+                    # ### which would be terrible for us to parse and check...
+                    #
+                    # summary_info_mismatch.append(
+                    #     (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
+                    # )
                     # logger.warning("Conflicting primaryclass values for entry %s (%s): "
                     #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                     #                k, aid, entrydic[k]['primaryclass'], primaryclass)
+                    pass
                 else:
                     entrydic[k]['primaryclass'] = primaryclass
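
Note (not part of the diff): arXiv registers its own DOIs under the prefix 10.48550, so a bibtex `doi` field can now mix a journal DOI with an arXiv-issued one. The diff above both strips arXiv-issued DOIs from the `doi` field (the new RemoveArxivDoi option in url.py) and recovers the arXiv ID from them (arxivutil.py). The following is a minimal, self-contained sketch of how the new rx_arxiv_own_doi pattern is expected to behave; the sample DOI values are made up for illustration.

import re

# Same pattern as the one added to arxivutil.py above.
rx_arxiv_own_doi = re.compile(
    r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$',
    re.IGNORECASE)

# A 'doi' field holding both a journal DOI and an arXiv-issued DOI
# (both values are hypothetical).
doi_field = "10.1103/PhysRevLett.127.110501, 10.48550/arXiv.2104.00001"
dois = [doi for doi in re.split(r'[ \t\n,]+', doi_field) if doi.strip()]

# What url.py's RemoveArxivDoi pass keeps in the 'doi' field:
kept = [doi for doi in dois if rx_arxiv_own_doi.match(doi) is None]
print(" ".join(kept))                # -> 10.1103/PhysRevLett.127.110501

# What arxivutil.py recovers as the arXiv ID from arXiv's own DOI:
for doi in dois:
    m = rx_arxiv_own_doi.match(doi)
    if m is not None:
        print(m.group('arxivid'))    # -> 2104.00001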