diff --git a/.travis.yml b/.travis.yml
index bc4709f..6b2efdd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,12 +9,14 @@ env:
   - BIBOLAMAZI_TESTS_SKIP_GITHUB_ACCESS=1
 
 python:
+  - "3.4"  # windows builds still use 3.4 (yes, I know...)
   - "3.6"
-  - "3.7"
-  - "3.8"
   - "3.9"
 
 install:
+  # On Python 3.4, we need to install pyyaml==3.13 because newer PyYaml dropped py3.4 support
+  - if [ "`python -c 'import sys; print(\"%d.%d\"%sys.version_info[0:2])'`" == "3.4" ]; then pip install pyyaml==3.13; fi
+
   - pip install -r pip_requirements.txt
 
 # command to run tests
diff --git a/bibolamazi/core/bibfilter/__init__.py b/bibolamazi/core/bibfilter/__init__.py
index 3001ce8..b1319e0 100644
--- a/bibolamazi/core/bibfilter/__init__.py
+++ b/bibolamazi/core/bibfilter/__init__.py
@@ -170,6 +170,27 @@ def prerun(self, bibolamazifile):
         The default implementation does nothing.
         """
         return
+
+    def postrun(self, bibolamazifile):
+        """
+        This function gets called immediately after the filter is run, before any
+        further filters are executed.
+
+        It is not very useful if the :py:meth:`action()` is
+        :py:const:`BibFilter.BIB_FILTER_BIBOLAMAZIFILE`, but it can prove useful
+        for filters with action :py:const:`BibFilter.BIB_FILTER_SINGLE_ENTRY`,
+        if any sort of global post-processing task should be done immediately
+        after the actual filtering of the data.
+
+        You can use this function, e.g., to produce an aggregated warning or
+        report message.
+
+        This method is not called if the filter raised an exception, whether
+        internal or not.
+
+        The default implementation does nothing.
+        """
+        return
 
 
     def filter_bibentry(self, x):
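The new postrun() hook pairs with prerun(). Below is a rough sketch (not part of the patch) of how a single-entry filter might use it to aggregate per-entry findings into one message; the filter class, the missing-DOI check and the logger setup are made up for illustration, and the entry object is assumed to expose pybtex-style key/fields attributes, as in the filters further down.

import logging

from bibolamazi.core.bibfilter import BibFilter

logger = logging.getLogger(__name__)


class ReportMissingDoiFilter(BibFilter):
    # hypothetical filter, shown only to illustrate the postrun() hook

    def action(self):
        return BibFilter.BIB_FILTER_SINGLE_ENTRY

    def prerun(self, bibolamazifile):
        # reset the summary before any entry is processed
        self.entries_missing_doi = []

    def filter_bibentry(self, entry):
        # record the problem instead of emitting one warning per entry
        if not entry.fields.get('doi', ''):
            self.entries_missing_doi.append(entry.key)

    def postrun(self, bibolamazifile):
        # runs once after all entries were filtered, and only if no exception
        # was raised in the meantime
        if self.entries_missing_doi:
            logger.warning("Entries without a DOI: %s",
                           ", ".join(self.entries_missing_doi))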
diff --git a/bibolamazi/core/bibolamazifile.py b/bibolamazi/core/bibolamazifile.py
index ca22a54..e4107f9 100644
--- a/bibolamazi/core/bibolamazifile.py
+++ b/bibolamazi/core/bibolamazifile.py
@@ -547,7 +547,8 @@ def fullFilterPath(self):
 
         See also filterPath().
         """
-        return PrependOrderedDict(list(self._filterpath.items()) + list(factory.filterpath.items()))
+        return PrependOrderedDict(list(self._filterpath.items()) +
+                                  list(factory.filterpath.items()))
 
     def filters(self):
         """
@@ -579,7 +580,8 @@ def bibliographydata(self):
         .. deprecated:: 2.0
            Use `bibliographyData()` instead!
         """
-        butils.warn_deprecated("BibolamaziFile", "bibliographydata()", "bibliographyData()", __name__)
+        butils.warn_deprecated("BibolamaziFile", "bibliographydata()",
+                               "bibliographyData()", __name__)
         return self.bibliographyData()
 
     def cacheFileName(self):
@@ -1026,8 +1028,13 @@ def _load_contents(self):
         self._bibliographydata = None
 
         if (not len(self._source_lists)):
-            logger.warning("File `%s': No source files specified. You need source files to provide bib entries!",
-                           self._fname)
+            logger.warning(
+                "File ‘%s’: No source files specified. You need source files to provide "
+                "bibliographic entries!",
+                self._fname
+            )
+
+        logger.info('{:+^80s}'.format(' collecting sources '))
 
         # now, populate all bibliographydata.
         num_conflicting_keys = 0
@@ -1040,6 +1047,7 @@ def _load_contents(self):
 
         if num_conflicting_keys:
             logger.info(CONFLICT_KEY_INFO)
+        logger.info('{:+^80s}\n'.format(''))
 
         # Now, try to load the cache
         # --------------------------
@@ -1119,7 +1127,7 @@ def _populate_from_src(self, src):
             # ignore source, will have to try next in list
             return (False,0)
 
-        logger.info("Found Source: %s", src)
+        logger.info("→ %s", src)
 
         try:
             # parse bibtex
@@ -1149,7 +1157,8 @@ def _populate_from_src(self, src):
                         key = oldkey + u".conflictkey." + str(n)
                     entry.key = key
-                    logger.debug("Key conflict in source file %s: renamed %s -> %s", src, oldkey, key)
+                    logger.debug("Key conflict in source file %s: renamed %s -> %s",
+                                 src, oldkey, key)
 
                 self._bibliographydata.add_entry(key, entry)
 
@@ -1239,24 +1248,43 @@ def runFilter(self, filter_instance):
         #   entries etc.), or it can act on a single entry.
         #
 
-        filtername = ''
+        class _WrapFilterAction:
+            def __init__(self, bibolamazifile, logger, filtername, filter_instance):
+                super().__init__()
+                self.bibolamazifile = bibolamazifile
+                self.logger = logger
+                self.filtername = filtername
+                self.filter_instance = filter_instance
+
+            def __enter__(self):
+                self.logger.info('{:-^80s}'.format(' filter ‘{}’ '.format(self.filtername)))
+                msg = self.filter_instance.getRunningMessage()
+                if msg != self.filtername:
+                    self.logger.info(msg)
+
+                self.filter_instance.prerun(self.bibolamazifile)
+
+            def __exit__(self, exc_type, exc_value, exc_traceback):
+                if exc_type is None:
+                    # success
+                    self.filter_instance.postrun(self.bibolamazifile)
+                    self.logger.info('{:-^79s}\n'.format(' filter ✅ '))
 
         try:
             filtername = filter_instance.name()
             action = filter_instance.action()
 
-            logger.info("=== Filter: %s", filter_instance.getRunningMessage())
-
-            filter_instance.prerun(self)
-
             #
             # pass the whole bibolamazifile to the filter. the filter can actually do
             # whatever it wants with it (!!)
             #
 
             if (action == BibFilter.BIB_FILTER_BIBOLAMAZIFILE):
-                filter_instance.filter_bibolamazifile(self)
-                logger.debug('filter %s filtered the full bibolamazifile.',
+                with _WrapFilterAction(self, logger, filtername, filter_instance):
+
+                    filter_instance.filter_bibolamazifile(self)
+
+                logger.debug('filter ‘%s’ processed the full bibolamazifile.',
                              filter_instance.name())
 
                 return
@@ -1266,12 +1294,13 @@ def runFilter(self, filter_instance):
             #
 
             if (action == BibFilter.BIB_FILTER_SINGLE_ENTRY):
-                bibdata = self.bibliographyData()
+                with _WrapFilterAction(self, logger, filtername, filter_instance):
 
-                for (k, entry) in bibdata.entries.items():
-                    filter_instance.filter_bibentry(entry)
+                    bibdata = self.bibliographyData()
+                    for (k, entry) in bibdata.entries.items():
+                        filter_instance.filter_bibentry(entry)
 
-                logger.debug('filter %s filtered each of the the bibentries one by one.',
+                logger.debug('filter %s processed all the bibliographic entries.',
                              filter_instance.name())
 
                 return
@@ -1321,8 +1350,6 @@ def saveRawToFile(self, fname=None, cachefname=None):
 
         logger.info("Saved file '%s'", fname)
 
-
-
     def saveToFile(self, fname=None, cachefname=None):
         """
         Save the current bibolamazi file object to disk.
@@ -1354,16 +1381,9 @@ def saveToFile(self, fname=None, cachefname=None):
           always silently overwritten (so be careful). The same applies to the
           cache file.
         """
-        if fname is None:
-            fname = self._fname
-        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
-            cachefname = self.cacheFileName()
-        elif isinstance(cachefname, bool) and not cachefname:
-            cachefname = ''
-        else:
-            pass # cachefname has a specific file name
-
+        fname, cachefname = self._get_fname_and_cachefname(fname, cachefname)
+
         with codecs.open(fname, 'w', BIBOLAMAZI_FILE_ENCODING) as f:
             f.write(self._header)
             f.write(self._config)
@@ -1385,26 +1405,52 @@ def saveToFile(self, fname=None, cachefname=None):
             #
             # Write to bibtex output
             #
-            #w = outputbibtex.Writer()
-            #w.write_stream(self._bibliographydata, f)
-            #
-            #f.write(self._bibliographydata.to_string('bibtex'))
-            #
 
             w = outputbibtex.Writer()
             f.write(w.to_string(self._bibliographydata))
 
-        logger.info("Updated output file '%s'", fname)
+        logger.info("✨ Updated output file '%s'", fname)
+
+        self.saveCache(cachefname=cachefname)
+
+    def _get_fname_and_cachefname(self, fname, cachefname):
+        if fname is None:
+            fname = self._fname
+
+        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
+            cachefname = self.cacheFileName()
+        elif isinstance(cachefname, bool) and not cachefname:
+            cachefname = ''
+        else:
+            pass # cachefname has a specific file name
+
+        return fname, cachefname
+
+    def saveCache(self, cachefname=None):
+        r"""
+        Save the cache. If `cachefname` is None, the cache file name is deduced
+        from the current file name (see :py:meth:`fname()`).
+
+        The argument `cachefname` is parsed exactly as in the method
+        :py:meth:`saveToFile()`.
+
+        .. note:: If you call :py:meth:`saveToFile()`, then the cache is
+                  automatically saved and a separate call to `saveCache()` is
+                  not necessary.
+
+        Warning: This method will silently overwrite any existing file of the
+        same name.
+        """
+
+        _, cachefname = self._get_fname_and_cachefname(None, cachefname)
 
-        # if we have cache to save, save it
         if (cachefname and self._user_cache and self._user_cache.hasCache()):
             try:
                 with open(cachefname, 'wb') as f:
-                    logger.debug("Writing cache to file %s", cachefname)
+                    logger.debug("Writing cache to file ‘%s’", cachefname)
                     self._user_cache.saveCache(f)
             except IOError as e:
-                logger.debug("Couldn't save cache to file '%s'.", cachefname)
+                logger.debug("Error saving cache to file ‘%s’: %s", cachefname, e)
 
-
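With the file-name/cache-name handling factored out into _get_fname_and_cachefname(), the cache can now be written on its own. A minimal usage sketch, assuming bf is an already-loaded BibolamaziFile instance (the variable name and the explicit cache file name are made up):

# bf is assumed to be an already-loaded BibolamaziFile instance
bf.saveToFile()      # write the bibolamazi file; the cache is then saved automatically
bf.saveCache()       # write only the cache, to the file name deduced by cacheFileName()
bf.saveCache(cachefname='refs.bibolamazicache')   # write only the cache, to an explicit file

This split is what lets the arxiv filter below persist its cache from postrun() before any later filter gets a chance to fail.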
""" - if fname is None: - fname = self._fname - if cachefname is None or (isinstance(cachefname, bool) and cachefname): - cachefname = self.cacheFileName() - elif isinstance(cachefname, bool) and not cachefname: - cachefname = '' - else: - pass # cachefname has a specific file name - + fname, cachefname = self._get_fname_and_cachefname(fname, cachefname) + with codecs.open(fname, 'w', BIBOLAMAZI_FILE_ENCODING) as f: f.write(self._header) f.write(self._config) @@ -1385,26 +1405,52 @@ def saveToFile(self, fname=None, cachefname=None): # # Write to bibtex output # - #w = outputbibtex.Writer() - #w.write_stream(self._bibliographydata, f) - # - #f.write(self._bibliographydata.to_string('bibtex')) - # w = outputbibtex.Writer() f.write(w.to_string(self._bibliographydata)) - logger.info("Updated output file '%s'", fname) + logger.info("✨ Updated output file '%s'", fname) + + self.saveCache(cachefname=cachefname) + + def _get_fname_and_cachefname(self, fname, cachefname): + if fname is None: + fname = self._fname + + if cachefname is None or (isinstance(cachefname, bool) and cachefname): + cachefname = self.cacheFileName() + elif isinstance(cachefname, bool) and not cachefname: + cachefname = '' + else: + pass # cachefname has a specific file name + + return fname, cachefname + + def saveCache(self, cachefname=None): + r""" + Save the cache. If `cachefname` is None, the cache file name is deduced + from the current file name (see :py:meth:`fname()`). + + The argument `cachefname` is parsed exactly as in the method + :py:meth:`saveToFile()`. + + .. note:: If you call :py:meth:`saveToFile()`, then the cache is + automatically saved and a separate call to `saveCache()` is + not necessary. + + Warning: This method will silently overwrite any existing file of the + same name. + """ + + _, cachefname = self._get_fname_and_cachefname(None, cachefname) - # if we have cache to save, save it if (cachefname and self._user_cache and self._user_cache.hasCache()): try: with open(cachefname, 'wb') as f: - logger.debug("Writing cache to file %s", cachefname) + logger.debug("Writing cache to file ‘%s’", cachefname) self._user_cache.saveCache(f) except IOError as e: - logger.debug("Couldn't save cache to file '%s'.", cachefname) + logger.debug("Error saving cache to file ‘%s’: %s", cachefname, e) - diff --git a/bibolamazi/filters/arxiv.py b/bibolamazi/filters/arxiv.py index 8c54722..608fb63 100644 --- a/bibolamazi/filters/arxiv.py +++ b/bibolamazi/filters/arxiv.py @@ -329,6 +329,9 @@ def __init__(self, self.warn_journal_ref = butils.getbool(warn_journal_ref) + self.summary_info_mismatch = None + self.summary_published = None + if self.unpublished_mode == MODE_EPRINT and 'journal' in self.strip_unpublished_fields: if self.arxiv_journal_name and self.arxiv_journal_name != _default_arxiv_journal_name: raise ValueError( @@ -364,6 +367,39 @@ def prerun(self, bibolamazifile): logger.debug("arxiv prerun(): re-validating arxiv info cache") bibolamazifile.cacheAccessor(arxivutil.ArxivInfoCacheAccessor).revalidate(bibolamazifile) + # initialize "summary" messages + # + # summary_info_mismatch --- entries for which info fetched from + # arXiv.org don't match the provided bibtex entry + self.summary_info_mismatch = [] + # summary_published --- entries that refer to the arXiv but + # which are in fact published in some other venue + self.summary_published = [] + + + def postrun(self, bibolamazifile): + # Save the cache to disk immediately. 
diff --git a/bibolamazi/filters/util/arxivutil.py b/bibolamazi/filters/util/arxivutil.py
index 54046f2..b96d1e4 100644
--- a/bibolamazi/filters/util/arxivutil.py
+++ b/bibolamazi/filters/util/arxivutil.py
@@ -593,6 +593,8 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
         #   with info from the arXiv API.
         needs_to_be_completed = []
 
+        summary_info_mismatch = []
+
         #
         # Do a first scan through all the bibdata entries, and detect the API
         # information using only what we have (to figure out the arxiv
@@ -649,16 +651,22 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                     entrydic[k]['primaryclass'][:len(primaryclass)] !=
                     primaryclass[:len(entrydic[k]['primaryclass'])]):
                     #
-                    logger.warning("Conflicting primaryclass values for entry %s (%s): "
-                                   "%s (given in bibtex) != %s (retrieved from the arxiv)",
-                                   k, aid, entrydic[k]['primaryclass'], primaryclass)
+                    summary_info_mismatch.append(
+                        (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
+                    )
+                    # logger.warning("Conflicting primaryclass values for entry %s (%s): "
+                    #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
+                    #                k, aid, entrydic[k]['primaryclass'], primaryclass)
                 else:
                     entrydic[k]['primaryclass'] = primaryclass
 
                 if (doi and entrydic[k]['doi'] and entrydic[k]['doi'].lower() != doi.lower()):
-                    logger.warning("Conflicting doi values for entry %s (%s): "
-                                   "%s (given in bibtex) != %s (retrieved from the arxiv)",
-                                   k, aid, entrydic[k]['doi'], doi)
+                    summary_info_mismatch.append(
+                        (k, aid, 'doi', entrydic[k]['doi'], doi)
+                    )
+                    # logger.warning("Conflicting doi values for entry %s (%s): "
+                    #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
+                    #                k, aid, entrydic[k]['doi'], doi)
                 else:
                     entrydic[k]['doi'] = doi
 
@@ -669,6 +677,25 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
             logger.warning("Failed to fetch information from the arXiv for %d entries: %s",
                            len(fail_aids), joined)
 
+        # warning for info mismatch
+        if summary_info_mismatch:
+            logger.warning(
+                "Mismatch: info in bibtex ≠ info from arxiv.org\n"
+                + "\n".join(
+                    "- ‘{key}’ ({arxivid}) [{field}]:\n"
+                    "  {value_from_bibtex} ≠ {value_from_arxivorg}"
+                    .format(key=key,
+                            arxivid=arxivid,
+                            field=field,
+                            value_from_bibtex='“'+value_from_bibtex+'”',
+                            value_from_arxivorg='“'+value_from_arxivorg+'”',)
+                    for key, arxivid, field, value_from_bibtex, value_from_arxivorg
+                    in summary_info_mismatch
+                )
+            )
+
+
+
     def getArXivInfo(self, entrykey):
         """
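As a side note, here is a small illustration (not from the patch) of the mutual-prefix test that guards the primaryclass mismatch above: a conflict is only recorded when neither value is a prefix of the other, so a bare archive name in the bibtex entry is not reported against a more specific category returned by the arXiv. The helper name below is made up.

def primaryclass_conflicts(from_bibtex, from_arxiv):
    # roughly mirrors the condition used in complete_cache(): record a
    # mismatch only when neither value is a prefix of the other
    return from_bibtex[:len(from_arxiv)] != from_arxiv[:len(from_bibtex)]

assert not primaryclass_conflicts("cond-mat", "cond-mat.str-el")   # compatible, not reported
assert primaryclass_conflicts("quant-ph", "cond-mat.str-el")       # recorded as a mismatch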
diff --git a/doc/bibolamazi.core.bibfilter.rst b/doc/bibolamazi.core.bibfilter.rst
index 174dfdc..130fa51 100644
--- a/doc/bibolamazi.core.bibfilter.rst
+++ b/doc/bibolamazi.core.bibfilter.rst
@@ -1,6 +1,16 @@
 :mod:`bibolamazi.core.bibfilter` package
 ========================================
 
+
+Module :mod:`bibolamazi.core.bibfilter`
+---------------------------------------
+
+.. automodule:: bibolamazi.core.bibfilter
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
 bibolamazi.core.bibfilter.argtypes module
 -----------------------------------------
 
@@ -16,12 +26,3 @@ bibolamazi.core.bibfilter.factory module
     :members:
     :undoc-members:
     :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: bibolamazi.core.bibfilter
-    :members:
-    :undoc-members:
-    :show-inheritance: