url fix now removes pure-arXiv DOI
phfaist committed Mar 28, 2023
1 parent cec1140 commit 8a24560
Showing 3 changed files with 72 additions and 22 deletions.
4 changes: 2 additions & 2 deletions bibolamazi/core/version.py
@@ -24,13 +24,13 @@
 # This is the current BIBOLAMAZI version.
 # Bump the number here for different version numbers.
 
-version_str = "4.5"
+version_str = "4.6b0"
 """
 The version string. This is increased upon each release.
 """
 
 
-copyright_year = "2021"
+copyright_year = "2023"
 """
 Year of copyright.
 """
24 changes: 22 additions & 2 deletions bibolamazi/filters/url.py
@@ -54,7 +54,8 @@ class UrlNormalizeFilter(BibFilter):
     helptext = HELP_TEXT
 
     def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, StripArxivUrl=True,
-                 UrlFromDoi=False, UrlFromArxiv=False, KeepFirstUrlOnly=False, StripForTypes=None,
+                 UrlFromDoi=False, UrlFromArxiv=False, RemoveArxivDoi=True,
+                 KeepFirstUrlOnly=False, StripForTypes=None,
                  AddAsHowPublished=False, HowPublishedText='available at {urlstr}'):
        r"""
        UrlNormalizeFilter constructor.
@@ -80,6 +81,9 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
           that links to the arXiv page, i.e. `https://arxiv.org/abs/<ARXIV-ID>`
           [default: False]
 
+        - RemoveArxivDoi(bool): Remove DOIs of the form
+          `https://doi.org/10.48550/arXiv.<ARXIV-ID>`.
+
         - KeepFirstUrlOnly(bool): If the entry has several URLs, then after applying all
           the other stripping rules, keep only the first remaining URL, if any.
           [default: False]
@@ -94,7 +98,6 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
           list of URLs concatenated with a comma, '{url}' to insert the
           first url and the key 'urls' is passed the raw Python list as
           argument.
         """
         super().__init__()
 
@@ -104,6 +107,7 @@ def __init__(self, Strip=False, StripAllIfDoiOrArxiv=False, StripDoiUrl=True, St
         self.striparxivurl = getbool(StripArxivUrl)
         self.urlfromdoi = getbool(UrlFromDoi)
         self.urlfromarxiv = getbool(UrlFromArxiv)
+        self.removearxivdoi = getbool(RemoveArxivDoi)
         self.keepfirsturlonly = getbool(KeepFirstUrlOnly)
         self.stripfortypes = None
         if (StripForTypes is not None):
@@ -184,6 +188,22 @@ def filter_bibentry(self, entry):
 
         #logger.longdebug("%s: urls is now %r", entry.key, urls)
 
+        if self.removearxivdoi:
+            if 'doi' in entry.fields:
+                dois = re.split(r'[ \t\n,]+', entry.fields['doi'])
+                new_dois = [
+                    doi
+                    for doi in dois
+                    if arxivutil.rx_arxiv_own_doi.match(doi) is None
+                ]
+                newdoifield = " ".join(new_dois)
+                if newdoifield == "":
+                    del entry.fields['doi']
+                else:
+                    entry.fields['doi'] = newdoifield
+
+        #logger.longdebug("%s: urls is now %r", entry.key, urls)
+
         if (self.keepfirsturlonly):
             if (urls):
                 urls[1:] = []
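A minimal standalone sketch (not code from this commit) of the rule the new `RemoveArxivDoi` option applies to an entry's doi field; in a bibolamazi config the option would presumably be toggled with the filter's usual boolean switch syntax, e.g. `filter: url -dRemoveArxivDoi`. The helper name and the DOI values below are made up for illustration:

    import re

    # Same pattern as the new arxivutil.rx_arxiv_own_doi below.
    rx_arxiv_own_doi = re.compile(
        r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$',
        re.IGNORECASE)

    def remove_arxiv_dois(doifield):
        # Split the bibtex doi field on whitespace/commas, drop arXiv-issued DOIs.
        dois = re.split(r'[ \t\n,]+', doifield)
        kept = [d for d in dois if d and rx_arxiv_own_doi.match(d) is None]
        # An empty result means the filter deletes the doi field altogether.
        return " ".join(kept)

    print(remove_arxiv_dois("10.48550/arXiv.2212.00736"))
    # -> "" (pure-arXiv DOI removed)
    print(remove_arxiv_dois("10.22331/q-2021-04-15-432, 10.48550/arXiv.2212.00736"))
    # -> "10.22331/q-2021-04-15-432" (journal DOI kept)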
66 changes: 48 additions & 18 deletions bibolamazi/filters/util/arxivutil.py
@@ -101,12 +101,16 @@ def _mk_braced_pair_rx(mid):
 
 _rx_aid_year = re.compile(r'(?P<year>\d{2})(?P<mon>\d{2})(?:\.\d{4,}|\d{3})')
 
 
+rx_arxiv_own_doi = re.compile(r'^((https?://)?(dx\.)?(doi\.org/))?10\.48550/arXiv\.(?P<arxivid>.*)$', re.IGNORECASE)
+
+
 #
 # A list of fields which are inspected for arXiv information. This is useful for
 # cache invalidation in various instances.
 #
 arxivinfo_from_bibtex_fields = [
-    'journal', 'doi', 'eprint', 'arxivid', 'url',
+    'journal', 'doi', 'eprint', 'arxivid', 'arxiv', 'url',
     'note', 'annote', 'primaryclass',
     'archiveprefix', ]
@@ -186,28 +190,49 @@ def extract_pure_id(x, primaryclass=None):
             return m.group('purearxivid')
 
 
-        if ('doi' in fields and fields['doi']):
-            d['doi'] = fields['doi']
+        if 'doi' in fields:
+            dois = re.split(r'[ \t\n,]+', fields['doi'])
+            for doi in dois:
+                if doi.strip() == "":
+                    continue
+                m = re.match(rx_arxiv_own_doi, doi)
+                if m is not None:
+                    # get arXiv ID
+                    d['arxivid'] = m.group('arxivid')
+                else:
+                    # this is a journal DOI -- keep only the first one
+                    if d.get('doi', None) is None:
+                        d['doi'] = doi
 
-        if ('eprint' in fields):
-            # this gives the arxiv ID
-            try:
-                d['arxivid'] = extract_pure_id(fields['eprint'], primaryclass=fields.get('primaryclass', None))
-                m = re.match(r'^([-\w.]+)/', d['arxivid'])
-                if (m):
-                    d['primaryclass'] = m.group(1)
-            except IndexError as e:
-                logger.longdebug("Indexerror: invalid arXiv ID [%r/]%r: %s",
-                                 fields.get('primaryclass',None), fields['eprint'], e)
-                logger.warning("Entry `%s' has invalid arXiv ID %r", entry.key, fields['eprint'])
+        if d['arxivid'] is None:
+            for eprintfield in ('arxivid', 'arxiv', 'eprint'):
+                if eprintfield not in fields:
+                    continue
+                # this field might reveal the arxiv ID
+                arxivid = None
+                try:
+                    arxivid = extract_pure_id(fields[eprintfield], primaryclass=fields.get('primaryclass', None))
+                except IndexError as e:
+                    logger.longdebug("Indexerror: invalid arXiv ID in field ‘%s’ [%r/]%r: %s",
+                                     eprintfield, fields.get('primaryclass',None), fields[eprintfield], e)
+                    # could be because, e.g., Zotero exporter used the PubMed ID here.
+                    logger.debug("Entry `%s' has invalid arXiv ID %r in eprint field ‘%s’", entry.key, fields[eprintfield],
+                                 eprintfield)
+                    continue
+                if arxivid is not None:
+                    d['arxivid'] = arxivid
+                    m = re.match(r'^([-\w.]+)/', arxivid)
+                    if (m):
+                        d['primaryclass'] = m.group(1)
+                    break

         if ('primaryclass' in fields):
             d['primaryclass'] = fields['primaryclass']
 
         if ('archiveprefix' in fields):
             d['archiveprefix'] = fields['archiveprefix']
 
-        logger.longdebug("processed doi,eprint,primaryclass,archiveprefix fields -> d = %r", d)
+        logger.longdebug("processed doi,eprint,arxiv,arxivid,primaryclass,archiveprefix fields -> d = %r", d)
 
     def processNoteField(notefield, d, isurl=False):
 
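The rewritten extraction gives an arXiv ID found in the entry's own DOI priority, then falls back to the 'arxivid', 'arxiv', and 'eprint' fields in that order. A hedged sketch of just that fall-back precedence, with a plain dict standing in for the entry's fields (the helper name and values are made up):

    def find_arxivid_field(fields):
        # fall-back order used when no arXiv-issued DOI already gave the ID
        for f in ('arxivid', 'arxiv', 'eprint'):
            if f in fields:
                return fields[f]
        return None

    print(find_arxivid_field({'eprint': '2212.00736'}))
    # -> '2212.00736'
    print(find_arxivid_field({'arxiv': 'quant-ph/0512258', 'eprint': 'PMID-12345'}))
    # -> 'quant-ph/0512258' ('arxiv' wins over 'eprint')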
@@ -651,12 +676,17 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                     entrydic[k]['primaryclass'][:len(primaryclass)] !=
                     primaryclass[:len(entrydic[k]['primaryclass'])]):
                 #
-                summary_info_mismatch.append(
-                    (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
-                )
-                logger.warning("Conflicting primaryclass values for entry %s (%s): "
-                               "%s (given in bibtex) != %s (retrieved from the arxiv)",
-                               k, aid, entrydic[k]['primaryclass'], primaryclass)
+                # ### Ignore mismatches in primaryclass, e.g. Zotero's exporter
+                # ### exports all archive classes as a comma-separated list
+                # ### which would be terrible for us to parse and check...
+                #
+                # summary_info_mismatch.append(
+                #     (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
+                # )
+                # logger.warning("Conflicting primaryclass values for entry %s (%s): "
+                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
+                #                k, aid, entrydic[k]['primaryclass'], primaryclass)
+                pass
             else:
                 entrydic[k]['primaryclass'] = primaryclass
 
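The surrounding condition only flags a primaryclass mismatch when neither stored value is a prefix of the other, which is why e.g. 'quant' and 'quant-ph' count as consistent. A small illustration (the helper name and values are made up):

    def primaryclass_mismatch(a, b):
        # mismatch iff neither string is a prefix of the other
        return a[:len(b)] != b[:len(a)]

    print(primaryclass_mismatch('quant-ph', 'quant'))    # -> False (consistent)
    print(primaryclass_mismatch('quant-ph', 'hep-th'))   # -> True  (conflict)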