Commit 8ea3cbd

improvements to log messages; postrun() filter method

phfaist committed Nov 1, 2020
1 parent bff4b0a commit 8ea3cbd
Showing 6 changed files with 195 additions and 57 deletions.
6 changes: 4 additions & 2 deletions .travis.yml
@@ -9,12 +9,14 @@ env:
- BIBOLAMAZI_TESTS_SKIP_GITHUB_ACCESS=1

python:
- "3.4" # windows builds still use 3.4 (yes, I know...)
- "3.6"
- "3.7"
- "3.8"
- "3.9"

install:
# On Python 3.4, we need to install pyyaml==3.13 because newer PyYaml dropped py3.4 support
- if [ "`python -c 'import sys; print(\"%d.%d\"%sys.version_info[0:2])'`" == "3.4" ]; then pip install pyyaml==3.13; fi

- pip install -r pip_requirements.txt

# command to run tests
21 changes: 21 additions & 0 deletions bibolamazi/core/bibfilter/__init__.py
@@ -170,6 +170,27 @@ def prerun(self, bibolamazifile):
        The default implementation does nothing.
        """
        return

    def postrun(self, bibolamazifile):
        """
        This function gets called immediately after the filter is run, before
        any further filters are executed.

        It is not very useful if the :py:meth:`action()` is
        :py:const:`BibFilter.BIB_FILTER_BIBOLAMAZIFILE`, but it can prove
        useful for filters with action
        :py:const:`BibFilter.BIB_FILTER_SINGLE_ENTRY`, if any sort of global
        post-processing task should be done immediately after the actual
        filtering of the data.

        You can use this function, e.g., to produce an aggregated warning or
        report message.

        This method is not called if the filter raised an exception, whether
        internal or not.

        The default implementation does nothing.
        """
        return


    def filter_bibentry(self, x):
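For context, here is how a filter might use the new postrun() hook. This is a minimal sketch, not code from this commit; the MyWarnFilter name and the collected-warnings logic are invented for illustration:

import logging

from bibolamazi.core.bibfilter import BibFilter

logger = logging.getLogger(__name__)

class MyWarnFilter(BibFilter):

    def action(self):
        return BibFilter.BIB_FILTER_SINGLE_ENTRY

    def prerun(self, bibolamazifile):
        # reset the list of flagged entries before the entries are filtered
        self.flagged_keys = []

    def filter_bibentry(self, entry):
        if 'note' not in entry.fields:
            self.flagged_keys.append(entry.key)

    def postrun(self, bibolamazifile):
        # emit one aggregated message instead of one warning per entry
        if self.flagged_keys:
            logger.warning("Entries without a note field: %s",
                           ", ".join(self.flagged_keys))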
122 changes: 84 additions & 38 deletions bibolamazi/core/bibolamazifile.py
@@ -547,7 +547,8 @@ def fullFilterPath(self):
        See also filterPath().
        """
        return PrependOrderedDict(list(self._filterpath.items()) + list(factory.filterpath.items()))
        return PrependOrderedDict(list(self._filterpath.items())
                                  + list(factory.filterpath.items()))

    def filters(self):
        """
@@ -579,7 +580,8 @@ def bibliographydata(self):
        .. deprecated:: 2.0
           Use `bibliographyData()` instead!
        """
        butils.warn_deprecated("BibolamaziFile", "bibliographydata()", "bibliographyData()", __name__)
        butils.warn_deprecated("BibolamaziFile", "bibliographydata()",
                               "bibliographyData()", __name__)
        return self.bibliographyData()

    def cacheFileName(self):
@@ -1026,8 +1028,13 @@ def _load_contents(self):
        self._bibliographydata = None

        if (not len(self._source_lists)):
            logger.warning("File `%s': No source files specified. You need source files to provide bib entries!",
                           self._fname)
            logger.warning(
                "File ‘%s’: No source files specified. You need source files to provide "
                "bibliographic entries!",
                self._fname
            )

        logger.info('{:+^80s}'.format(' collecting sources '))

        # now, populate all bibliographydata.
        num_conflicting_keys = 0
@@ -1040,6 +1047,7 @@ def _load_contents(self):
        if num_conflicting_keys:
            logger.info(CONFLICT_KEY_INFO)

        logger.info('{:+^80s}\n'.format(''))

        # Now, try to load the cache
        # --------------------------
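As an aside, the '{:+^80s}' format spec used for these new log banners centers its argument in an 80-column line padded with '+' characters, e.g.:

print('{:+^80s}'.format(' collecting sources '))
# ++++++++++++++++++++++++++++++ collecting sources ++++++++++++++++++++++++++++++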
@@ -1119,7 +1127,7 @@ def _populate_from_src(self, src):
            # ignore source, will have to try next in list
            return (False,0)

        logger.info("Found Source: %s", src)
        logger.info(" %s", src)

        try:
            # parse bibtex
@@ -1149,7 +1157,8 @@ def _populate_from_src(self, src):
                key = oldkey + u".conflictkey." + str(n)

            entry.key = key
            logger.debug("Key conflict in source file %s: renamed %s -> %s", src, oldkey, key)
            logger.debug("Key conflict in source file %s: renamed %s -> %s",
                         src, oldkey, key)

            self._bibliographydata.add_entry(key, entry)
@@ -1239,24 +1248,43 @@ def runFilter(self, filter_instance):
        # entries etc.), or it can act on a single entry.
        #

        filtername = ''

        class _WrapFilterAction:
            def __init__(self, bibolamazifile, logger, filtername, filter_instance):
                super().__init__()
                self.bibolamazifile = bibolamazifile
                self.logger = logger
                self.filtername = filtername
                self.filter_instance = filter_instance

            def __enter__(self):
                self.logger.info('{:-^80s}'.format(' filter ‘{}’ '.format(self.filtername)))
                msg = self.filter_instance.getRunningMessage()
                if msg != self.filtername:
                    self.logger.info(msg)

                self.filter_instance.prerun(self.bibolamazifile)

            def __exit__(self, exc_type, exc_value, exc_traceback):
                if exc_type is None:
                    # success
                    self.filter_instance.postrun(self.bibolamazifile)
                    self.logger.info('{:-^79s}\n'.format(' filter ✅ '))

        try:
            filtername = filter_instance.name()
            action = filter_instance.action()

            logger.info("=== Filter: %s", filter_instance.getRunningMessage())

            filter_instance.prerun(self)

            #
            # pass the whole bibolamazifile to the filter. the filter can actually do
            # whatever it wants with it (!!)
            #
            if (action == BibFilter.BIB_FILTER_BIBOLAMAZIFILE):
                filter_instance.filter_bibolamazifile(self)

                logger.debug('filter %s filtered the full bibolamazifile.',
                with _WrapFilterAction(self, logger, filtername, filter_instance):

                    filter_instance.filter_bibolamazifile(self)

                logger.debug('filter ‘%s’ processed the full bibolamazifile.',
                             filter_instance.name())
                return

Expand All @@ -1266,12 +1294,13 @@ def runFilter(self, filter_instance):
            #
            if (action == BibFilter.BIB_FILTER_SINGLE_ENTRY):

                bibdata = self.bibliographyData()
                with _WrapFilterAction(self, logger, filtername, filter_instance):

                for (k, entry) in bibdata.entries.items():
                    filter_instance.filter_bibentry(entry)
                    bibdata = self.bibliographyData()
                    for (k, entry) in bibdata.entries.items():
                        filter_instance.filter_bibentry(entry)

                logger.debug('filter %s filtered each of the the bibentries one by one.',
                logger.debug('filter %s processed all the bibliographic entries.',
                             filter_instance.name())
                return
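Note that __exit__() above implicitly returns None, so any exception raised by the filter propagates to runFilter()'s except clause, and postrun() is only invoked on a clean exit. A self-contained sketch of that behavior (demo names only, not repository code):

class _Demo:
    def __enter__(self):
        print("prerun() would run here")

    def __exit__(self, exc_type, exc_value, exc_traceback):
        if exc_type is None:
            print("postrun() would run here")
        # implicit None return: any exception propagates to the caller

with _Demo():
    pass  # clean body: postrun() runs

try:
    with _Demo():
        raise RuntimeError("filter failed")  # postrun() is skipped
except RuntimeError:
    print("the exception reached the caller's except clause")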

Expand Down Expand Up @@ -1321,8 +1350,6 @@ def saveRawToFile(self, fname=None, cachefname=None):
logger.info("Saved file '%s'", fname)




def saveToFile(self, fname=None, cachefname=None):
"""
Save the current bibolamazi file object to disk.
@@ -1354,16 +1381,9 @@ def saveToFile(self, fname=None, cachefname=None):
          always silently overwritten (so be careful). The same
          applies to the cache file.
        """
        if fname is None:
            fname = self._fname

        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
            cachefname = self.cacheFileName()
        elif isinstance(cachefname, bool) and not cachefname:
            cachefname = ''
        else:
            pass # cachefname has a specific file name

        fname, cachefname = self._get_fname_and_cachefname(fname, cachefname)

        with codecs.open(fname, 'w', BIBOLAMAZI_FILE_ENCODING) as f:
            f.write(self._header)
            f.write(self._config)
@@ -1385,26 +1405,52 @@ def saveToFile(self, fname=None, cachefname=None):
            #
            # Write to bibtex output
            #
            #w = outputbibtex.Writer()
            #w.write_stream(self._bibliographydata, f)
            #
            #f.write(self._bibliographydata.to_string('bibtex'))
            #
            w = outputbibtex.Writer()
            f.write(w.to_string(self._bibliographydata))

        logger.info("Updated output file '%s'", fname)
        logger.info("✨ Updated output file '%s'", fname)

        self.saveCache(cachefname=cachefname)

    def _get_fname_and_cachefname(self, fname, cachefname):
        if fname is None:
            fname = self._fname

        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
            cachefname = self.cacheFileName()
        elif isinstance(cachefname, bool) and not cachefname:
            cachefname = ''
        else:
            pass # cachefname has a specific file name

        return fname, cachefname

    def saveCache(self, cachefname=None):
        r"""
        Save the cache.  If `cachefname` is None, the cache file name is
        deduced from the current file name (see :py:meth:`fname()`).

        The argument `cachefname` is parsed exactly as in the method
        :py:meth:`saveToFile()`.

        .. note:: If you call :py:meth:`saveToFile()`, then the cache is
                  automatically saved and a separate call to `saveCache()`
                  is not necessary.

        Warning: This method will silently overwrite any existing file of
        the same name.
        """

        _, cachefname = self._get_fname_and_cachefname(None, cachefname)

        # if we have cache to save, save it
        if (cachefname and self._user_cache and self._user_cache.hasCache()):
            try:
                with open(cachefname, 'wb') as f:
                    logger.debug("Writing cache to file %s", cachefname)
                    logger.debug("Writing cache to file ‘%s’", cachefname)
                    self._user_cache.saveCache(f)
            except IOError as e:
                logger.debug("Couldn't save cache to file '%s'.", cachefname)
                logger.debug("Error saving cache to file ‘%s’: %s", cachefname, e)





45 changes: 43 additions & 2 deletions bibolamazi/filters/arxiv.py
@@ -329,6 +329,9 @@ def __init__(self,

self.warn_journal_ref = butils.getbool(warn_journal_ref)

self.summary_info_mismatch = None
self.summary_published = None

if self.unpublished_mode == MODE_EPRINT and 'journal' in self.strip_unpublished_fields:
if self.arxiv_journal_name and self.arxiv_journal_name != _default_arxiv_journal_name:
raise ValueError(
@@ -364,6 +367,39 @@ def prerun(self, bibolamazifile):
logger.debug("arxiv prerun(): re-validating arxiv info cache")
bibolamazifile.cacheAccessor(arxivutil.ArxivInfoCacheAccessor).revalidate(bibolamazifile)

# initialize "summary" messages
#
# summary_info_mismatch --- entries for which info fetched from
# arXiv.org don't match the provided bibtex entry
self.summary_info_mismatch = []
# summary_published --- entries that refer to the arXiv but
# which are in fact published in some other venue
self.summary_published = []


def postrun(self, bibolamazifile):
# Save the cache to disk immediately. This action is useful in case
# there is an exception in a later filter, which would cause the cache
# not to be saved in a future run.
bibolamazifile.saveCache()

# present any "summaries":

# -- for published entries
logger.warning(
"The following arXiv-only bibliographic {entryname} have been published in "
"other {venuename}:\n"
.format(entryname='entry' if len(self.summary_published) == 1 else 'entries',
venuename='venue' if len(self.summary_published) == 1 else 'venues')
+ "\n".join(
"- {key} ({arxivid}) → {doiurl}".format(
key='‘'+key+'’',
arxivid=arxivinfo['arxivid'],
doiurl='https://doi.org/'+arxivinfo['doi'],
)
for key, arxivinfo in self.summary_published
)
)

def filter_bibentry(self, entry):
#
@@ -399,8 +435,13 @@ def filter_bibentry(self, entry):
        if (self.warn_journal_ref and not we_are_published and arxivinfo['doi']):
            # we think we are not published but we actually are, as reported by
            # the arXiv.org API.  This could be because the authors published
            # their paper in the meantime.
            logger.warning("arxiv: Entry `%s' refers to arXiv version of published entry with DOI %r",
                           entry.key, arxivinfo['doi'])
            #logger.warning(
            #    "arxiv: Entry `%s' refers to arXiv version of published entry with DOI %r",
            #    entry.key, arxivinfo['doi']
            #)
            self.summary_published.append(
                (entry.key, arxivinfo,)
            )

        logger.longdebug("arXiv: entry %s: published=%r, mode=%r", entry.key, we_are_published, mode)
39 changes: 33 additions & 6 deletions bibolamazi/filters/util/arxivutil.py
@@ -593,6 +593,8 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
        # with info from the arXiv API.
        needs_to_be_completed = []

        summary_info_mismatch = []

        #
        # Do a first scan through all the bibdata entries, and detect the API
        # information using only what we have (to figure out the arxiv
@@ -649,16 +651,22 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                    entrydic[k]['primaryclass'][:len(primaryclass)] !=
                    primaryclass[:len(entrydic[k]['primaryclass'])]):
                    #
                    logger.warning("Conflicting primaryclass values for entry %s (%s): "
                                   "%s (given in bibtex) != %s (retrieved from the arxiv)",
                                   k, aid, entrydic[k]['primaryclass'], primaryclass)
                    summary_info_mismatch.append(
                        (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
                    )
                    # logger.warning("Conflicting primaryclass values for entry %s (%s): "
                    #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                    #                k, aid, entrydic[k]['primaryclass'], primaryclass)
                else:
                    entrydic[k]['primaryclass'] = primaryclass

            if (doi and entrydic[k]['doi'] and entrydic[k]['doi'].lower() != doi.lower()):
                logger.warning("Conflicting doi values for entry %s (%s): "
                               "%s (given in bibtex) != %s (retrieved from the arxiv)",
                               k, aid, entrydic[k]['doi'], doi)
                summary_info_mismatch.append(
                    (k, aid, 'doi', entrydic[k]['doi'], doi)
                )
                # logger.warning("Conflicting doi values for entry %s (%s): "
                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                #                k, aid, entrydic[k]['doi'], doi)
            else:
                entrydic[k]['doi'] = doi

Expand All @@ -669,6 +677,25 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
logger.warning("Failed to fetch information from the arXiv for %d entries: %s",
len(fail_aids), joined)

# warning for info mismatch
if summary_info_mismatch:
logger.warning(
"Mismatch: info in bibtex ≠ info from arxiv.org\n" +
"\n".join(
"- ‘{key}’ ({arxivid}) [{field}]:\n"
" {value_from_bibtex} ≠ {value_from_arxivorg}"
.format(key=key,
arxivid=arxivid,
field=field,
value_from_bibtex='“'+value_from_bibtex+'”',
value_from_arxivorg='“'+value_from_arxivorg+'”',)
for key, arxivid, field, value_from_bibtex, value_from_arxivorg
in summary_info_mismatch
)
)
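The tuples collected in summary_info_mismatch have a fixed five-field shape, which the warning above unpacks. A quick sketch with invented values:

record = ('Doe2018Entangle', '1802.04567', 'doi',
          '10.1000/old.value', '10.1103/PhysRevLett.120.123456')
key, arxivid, field, value_from_bibtex, value_from_arxivorg = record
print("- ‘{}’ ({}) [{}]:\n  “{}” ≠ “{}”".format(
    key, arxivid, field, value_from_bibtex, value_from_arxivorg))
# - ‘Doe2018Entangle’ (1802.04567) [doi]:
#   “10.1000/old.value” ≠ “10.1103/PhysRevLett.120.123456”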




    def getArXivInfo(self, entrykey):
        """
