url fix now removes pure-arXiv DOI
phfaist committed Mar 28, 2023
1 parent cec1140 commit 8a24560
Showing 3 changed files with 72 additions and 22 deletions.
4 changes: 2 additions & 2 deletions bibolamazi/core/version.py
@@ -24,13 +24,13 @@
 # This is the current BIBOLAMAZI version.
 # Bump the number here for different version numbers.
 
-version_str = "4.5"
+version_str = "4.6b0"
 """
 The version string. This is increased upon each release.
 """
 
 
-copyright_year = "2021"
+copyright_year = "2023"
 """
 Year of copyright.
 """
24 changes: 22 additions & 2 deletions bibolamazi/filters/url.py
@@ -54,7 +54,8 @@ class UrlNormalizeFilter(BibFilter):
     helptext = HELP_TEXT
 
     def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, StripArxivUrl=True,
-                 UrlFromDoi=False, UrlFromArxiv=False, KeepFirstUrlOnly=False, StripForTypes=None,
+                 UrlFromDoi=False, UrlFromArxiv=False, RemoveArxivDoi=True,
+                 KeepFirstUrlOnly=False, StripForTypes=None,
                  AddAsHowPublished=False, HowPublishedText='available at {urlstr}'):
        r"""
        UrlNormalizeFilter constructor.
@@ -80,6 +81,9 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
           that links to the arXiv page, i.e. `https://arxiv.org/abs/<ARXIV-ID>`
           [default: False]
 
+        - RemoveArxivDoi(bool): Remove DOIs of the form
+          `https://doi.org/10.48550/arXiv.<ARXIV-ID>`.
+
         - KeepFirstUrlOnly(bool): If the entry has several URLs, then after applying all
           the other stripping rules, keep only the first remaining URL, if any.
           [default: False]
@@ -94,7 +98,6 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
           list of URLs concatenated with a comma, '{url}' to insert the
           first url and the key 'urls' is passed the raw Python list as
           argument.
         """
         super().__init__()
 
@@ -104,6 +107,7 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
         self.striparxivurl = getbool(StripArxivUrl)
         self.urlfromdoi = getbool(UrlFromDoi)
         self.urlfromarxiv = getbool(UrlFromArxiv)
+        self.removearxivdoi = getbool(RemoveArxivDoi)
         self.keepfirsturlonly = getbool(KeepFirstUrlOnly)
         self.stripfortypes = None
         if (StripForTypes is not None):
@@ -184,6 +188,22 @@ def filter_bibentry(self, entry):
 
         #logger.longdebug("%s: urls is now %r", entry.key, urls)
 
+        if self.removearxivdoi:
+            if 'doi' in entry.fields:
+                dois = re.split(r'[ \t\n,]+', entry.fields['doi'])
+                new_dois = [
+                    doi
+                    for doi in dois
+                    if arxivutil.rx_arxiv_own_doi.match(doi) is None
+                ]
+                newdoifield = " ".join(new_dois)
+                if newdoifield == "":
+                    del entry.fields['doi']
+                else:
+                    entry.fields['doi'] = newdoifield
+
+        #logger.longdebug("%s: urls is now %r", entry.key, urls)
+
         if (self.keepfirsturlonly):
             if (urls):
                 urls[1:] = []
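A minimal standalone sketch (not code from this commit) of the rule the new `RemoveArxivDoi` option applies to an entry's doi field; in a bibolamazi config the option would presumably be toggled with the filter's usual boolean switch syntax, e.g. `filter: url -dRemoveArxivDoi`. The helper name and the DOI values below are made up for illustration:

    import re

    # Same pattern as the new arxivutil.rx_arxiv_own_doi below.
    rx_arxiv_own_doi = re.compile(
        r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$',
        re.IGNORECASE)

    def remove_arxiv_dois(doifield):
        # Split the bibtex doi field on whitespace/commas, drop arXiv-issued DOIs.
        dois = re.split(r'[ \t\n,]+', doifield)
        kept = [d for d in dois if d and rx_arxiv_own_doi.match(d) is None]
        # An empty result means the filter deletes the doi field altogether.
        return " ".join(kept)

    print(remove_arxiv_dois("10.48550/arXiv.2212.00736"))
    # -> "" (pure-arXiv DOI removed)
    print(remove_arxiv_dois("10.22331/q-2021-04-15-432, 10.48550/arXiv.2212.00736"))
    # -> "10.22331/q-2021-04-15-432" (journal DOI kept)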
66 changes: 48 additions & 18 deletions bibolamazi/filters/util/arxivutil.py
@@ -101,12 +101,16 @@ def _mk_braced_pair_rx(mid):
 
 _rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')
 
 
+rx_arxiv_own_doi = re.compile(r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$', re.IGNORECASE)
+
+
 #
 # A list of fields which are inspected for arXiv information. This is useful for
 # cache invalidation in various instances.
 #
 arxivinfo_from_bibtex_fields = [
-    'journal', 'doi', 'eprint', 'arxivid', 'url',
+    'journal', 'doi', 'eprint', 'arxivid', 'arxiv', 'url',
     'note', 'annote', 'primaryclass',
     'archiveprefix', ]
@@ -186,28 +190,49 @@ def extract_pure_id(x, primaryclass=None):
             return m.group('purearxivid')
 
 
-        if ('doi' in fields and fields['doi']):
-            d['doi'] = fields['doi']
+        if 'doi' in fields:
+            dois = re.split(r'[ \t\n,]+', fields['doi'])
+            for doi in dois:
+                if doi.strip() == "":
+                    continue
+                m = re.match(rx_arxiv_own_doi, doi)
+                if m is not None:
+                    # get arXiv ID
+                    d['arxivid'] = m.group('arxivid')
+                else:
+                    # this is a journal DOI -- keep only the first one
+                    if d.get('doi', None) is None:
+                        d['doi'] = doi
 
-        if ('eprint' in fields):
-            # this gives the arxiv ID
-            try:
-                d['arxivid'] = extract_pure_id(fields['eprint'], primaryclass=fields.get('primaryclass', None))
-                m = re.match(r'^([-\w.]+)/', d['arxivid'])
-                if (m):
-                    d['primaryclass'] = m.group(1)
-            except IndexError as e:
-                logger.longdebug("Indexerror: invalid arXiv ID [%r/]%r: %s",
-                                 fields.get('primaryclass',None), fields['eprint'], e)
-                logger.warning("Entry `%s' has invalid arXiv ID %r", entry.key, fields['eprint'])
+        if d['arxivid'] is None:
+            for eprintfield in ('arxivid', 'arxiv', 'eprint'):
+                if eprintfield not in fields:
+                    continue
+                # this field might reveal the arxiv ID
+                arxivid = None
+                try:
+                    arxivid = extract_pure_id(fields[eprintfield], primaryclass=fields.get('primaryclass', None))
+                except IndexError as e:
+                    logger.longdebug("Indexerror: invalid arXiv ID in field ‘%s’ [%r/]%r: %s",
+                                     eprintfield, fields.get('primaryclass',None), fields[eprintfield], e)
+                    # could be because, e.g., Zotero exporter used the PubMed ID here.
+                    logger.debug("Entry `%s' has invalid arXiv ID %r in eprint field ‘%s’", entry.key, fields[eprintfield],
+                                 eprintfield)
+                    continue
+                if arxivid is not None:
+                    d['arxivid'] = arxivid
+                    m = re.match(r'^([-\w.]+)/', arxivid)
+                    if (m):
+                        d['primaryclass'] = m.group(1)
+                    break

         if ('primaryclass' in fields):
             d['primaryclass'] = fields['primaryclass']
 
         if ('archiveprefix' in fields):
             d['archiveprefix'] = fields['archiveprefix']
 
-        logger.longdebug("processed doi,eprint,primaryclass,archiveprefix fields -> d = %r", d)
+        logger.longdebug("processed doi,eprint,arxiv,arxivid,primaryclass,archiveprefix fields -> d = %r", d)
 
     def processNoteField(notefield, d, isurl=False):
 
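The rewritten extraction gives an arXiv ID found in the entry's own DOI priority, then falls back to the 'arxivid', 'arxiv', and 'eprint' fields in that order. A hedged sketch of just that fall-back precedence, with a plain dict standing in for the entry's fields (the helper name and values are made up):

    def find_arxivid_field(fields):
        # fall-back order used when no arXiv-issued DOI already gave the ID
        for f in ('arxivid', 'arxiv', 'eprint'):
            if f in fields:
                return fields[f]
        return None

    print(find_arxivid_field({'eprint': '2212.00736'}))
    # -> '2212.00736'
    print(find_arxivid_field({'arxiv': 'quant-ph/0512258', 'eprint': 'PMID-12345'}))
    # -> 'quant-ph/0512258' ('arxiv' wins over 'eprint')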
@@ -651,12 +676,17 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                     entrydic[k]['primaryclass'][:len(primaryclass)] !=
                     primaryclass[:len(entrydic[k]['primaryclass'])]):
                 #
-                summary_info_mismatch.append(
-                    (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
-                )
-                logger.warning("Conflicting primaryclass values for entry %s (%s): "
-                               "%s (given in bibtex) != %s (retrieved from the arxiv)",
-                               k, aid, entrydic[k]['primaryclass'], primaryclass)
+                # ### Ignore mismatches in primaryclass, e.g. Zotero's exporter
+                # ### exports all archive classes as a comma-separated list
+                # ### which would be terrible for us to parse and check...
+                #
+                # summary_info_mismatch.append(
+                #     (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
+                # )
+                # logger.warning("Conflicting primaryclass values for entry %s (%s): "
+                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
+                #                k, aid, entrydic[k]['primaryclass'], primaryclass)
+                pass
             else:
                 entrydic[k]['primaryclass'] = primaryclass
 
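The surrounding condition only flags a primaryclass mismatch when neither stored value is a prefix of the other, which is why e.g. 'quant' and 'quant-ph' count as consistent. A small illustration (the helper name and values are made up):

    def primaryclass_mismatch(a, b):
        # mismatch iff neither string is a prefix of the other
        return a[:len(b)] != b[:len(a)]

    print(primaryclass_mismatch('quant-ph', 'quant'))    # -> False (consistent)
    print(primaryclass_mismatch('quant-ph', 'hep-th'))   # -> True  (conflict)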