Commit 8ea3cbd

improvements to log messages; postrun() filter method

phfaist committed Nov 1, 2020
1 parent bff4b0a commit 8ea3cbd
Showing 6 changed files with 195 additions and 57 deletions.
6 changes: 4 additions & 2 deletions .travis.yml
@@ -9,12 +9,14 @@ env:
- BIBOLAMAZI_TESTS_SKIP_GITHUB_ACCESS=1

python:
- "3.4" # windows builds still use 3.4 (yes, I know...)
- "3.6"
- "3.7"
- "3.8"
- "3.9"

install:
# On Python 3.4, we need to install pyyaml==3.13 because newer PyYaml dropped py3.4 support
- if [ "`python -c 'import sys; print(\"%d.%d\"%sys.version_info[0:2])'`" == "3.4" ]; then pip install pyyaml==3.13; fi

- pip install -r pip_requirements.txt

# command to run tests
21 changes: 21 additions & 0 deletions bibolamazi/core/bibfilter/__init__.py
@@ -170,6 +170,27 @@ def prerun(self, bibolamazifile):
        The default implementation does nothing.
        """
        return

    def postrun(self, bibolamazifile):
        """
        This function gets called immediately after the filter is run, before
        any further filters are executed.

        It is not very useful if the :py:meth:`action()` is
        :py:const:`BibFilter.BIB_FILTER_BIBOLAMAZIFILE`, but it can prove
        useful for filters with action
        :py:const:`BibFilter.BIB_FILTER_SINGLE_ENTRY`, if any sort of global
        post-processing task should be done immediately after the actual
        filtering of the data.

        You can use this function, e.g., to produce an aggregated warning or
        report message.

        This method is not called if the filter raised an exception, whether
        internal or not.

        The default implementation does nothing.
        """
        return


    def filter_bibentry(self, x):
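For context, here is how a filter might use the new postrun() hook. This is a minimal sketch, not code from this commit; the MyWarnFilter name and the collected-warnings logic are invented for illustration:

import logging

from bibolamazi.core.bibfilter import BibFilter

logger = logging.getLogger(__name__)

class MyWarnFilter(BibFilter):

    def action(self):
        return BibFilter.BIB_FILTER_SINGLE_ENTRY

    def prerun(self, bibolamazifile):
        # reset the list of flagged entries before the entries are filtered
        self.flagged_keys = []

    def filter_bibentry(self, entry):
        if 'note' not in entry.fields:
            self.flagged_keys.append(entry.key)

    def postrun(self, bibolamazifile):
        # emit one aggregated message instead of one warning per entry
        if self.flagged_keys:
            logger.warning("Entries without a note field: %s",
                           ", ".join(self.flagged_keys))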
122 changes: 84 additions & 38 deletions bibolamazi/core/bibolamazifile.py
@@ -547,7 +547,8 @@ def fullFilterPath(self):
        See also filterPath().
        """
        return PrependOrderedDict(list(self._filterpath.items()) + list(factory.filterpath.items()))
        return PrependOrderedDict(list(self._filterpath.items())
                                  + list(factory.filterpath.items()))

    def filters(self):
        """
@@ -579,7 +580,8 @@ def bibliographydata(self):
        .. deprecated:: 2.0
           Use `bibliographyData()` instead!
        """
        butils.warn_deprecated("BibolamaziFile", "bibliographydata()", "bibliographyData()", __name__)
        butils.warn_deprecated("BibolamaziFile", "bibliographydata()",
                               "bibliographyData()", __name__)
        return self.bibliographyData()

    def cacheFileName(self):
@@ -1026,8 +1028,13 @@ def _load_contents(self):
        self._bibliographydata = None

        if (not len(self._source_lists)):
            logger.warning("File `%s': No source files specified. You need source files to provide bib entries!",
                           self._fname)
            logger.warning(
                "File ‘%s’: No source files specified. You need source files to provide "
                "bibliographic entries!",
                self._fname
            )

        logger.info('{:+^80s}'.format(' collecting sources '))

        # now, populate all bibliographydata.
        num_conflicting_keys = 0
@@ -1040,6 +1047,7 @@ def _load_contents(self):
        if num_conflicting_keys:
            logger.info(CONFLICT_KEY_INFO)

        logger.info('{:+^80s}\n'.format(''))

        # Now, try to load the cache
        # --------------------------
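As an aside, the '{:+^80s}' format spec used for these new log banners centers its argument in an 80-column line padded with '+' characters, e.g.:

print('{:+^80s}'.format(' collecting sources '))
# ++++++++++++++++++++++++++++++ collecting sources ++++++++++++++++++++++++++++++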
@@ -1119,7 +1127,7 @@ def _populate_from_src(self, src):
            # ignore source, will have to try next in list
            return (False,0)

        logger.info("Found Source: %s", src)
        logger.info(" %s", src)

        try:
            # parse bibtex
@@ -1149,7 +1157,8 @@ def _populate_from_src(self, src):
                key = oldkey + u".conflictkey." + str(n)

            entry.key = key
            logger.debug("Key conflict in source file %s: renamed %s -> %s", src, oldkey, key)
            logger.debug("Key conflict in source file %s: renamed %s -> %s",
                         src, oldkey, key)

            self._bibliographydata.add_entry(key, entry)
@@ -1239,24 +1248,43 @@ def runFilter(self, filter_instance):
        # entries etc.), or it can act on a single entry.
        #

        filtername = ''

        class _WrapFilterAction:
            def __init__(self, bibolamazifile, logger, filtername, filter_instance):
                super().__init__()
                self.bibolamazifile = bibolamazifile
                self.logger = logger
                self.filtername = filtername
                self.filter_instance = filter_instance

            def __enter__(self):
                self.logger.info('{:-^80s}'.format(' filter ‘{}’ '.format(self.filtername)))
                msg = self.filter_instance.getRunningMessage()
                if msg != self.filtername:
                    self.logger.info(msg)

                self.filter_instance.prerun(self.bibolamazifile)

            def __exit__(self, exc_type, exc_value, exc_traceback):
                if exc_type is None:
                    # success
                    self.filter_instance.postrun(self.bibolamazifile)
                    self.logger.info('{:-^79s}\n'.format(' filter ✅ '))

        try:
            filtername = filter_instance.name()
            action = filter_instance.action()

            logger.info("=== Filter: %s", filter_instance.getRunningMessage())

            filter_instance.prerun(self)

            #
            # pass the whole bibolamazifile to the filter. the filter can actually do
            # whatever it wants with it (!!)
            #
            if (action == BibFilter.BIB_FILTER_BIBOLAMAZIFILE):
                filter_instance.filter_bibolamazifile(self)

                logger.debug('filter %s filtered the full bibolamazifile.',
                with _WrapFilterAction(self, logger, filtername, filter_instance):

                    filter_instance.filter_bibolamazifile(self)

                logger.debug('filter ‘%s’ processed the full bibolamazifile.',
                             filter_instance.name())
                return

Expand All @@ -1266,12 +1294,13 @@ def runFilter(self, filter_instance):
            #
            if (action == BibFilter.BIB_FILTER_SINGLE_ENTRY):

                bibdata = self.bibliographyData()
                with _WrapFilterAction(self, logger, filtername, filter_instance):

                for (k, entry) in bibdata.entries.items():
                    filter_instance.filter_bibentry(entry)
                    bibdata = self.bibliographyData()
                    for (k, entry) in bibdata.entries.items():
                        filter_instance.filter_bibentry(entry)

                logger.debug('filter %s filtered each of the the bibentries one by one.',
                logger.debug('filter %s processed all the bibliographic entries.',
                             filter_instance.name())
                return
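Note that __exit__() above implicitly returns None, so any exception raised by the filter propagates to runFilter()'s except clause, and postrun() is only invoked on a clean exit. A self-contained sketch of that behavior (demo names only, not repository code):

class _Demo:
    def __enter__(self):
        print("prerun() would run here")

    def __exit__(self, exc_type, exc_value, exc_traceback):
        if exc_type is None:
            print("postrun() would run here")
        # implicit None return: any exception propagates to the caller

with _Demo():
    pass  # clean body: postrun() runs

try:
    with _Demo():
        raise RuntimeError("filter failed")  # postrun() is skipped
except RuntimeError:
    print("the exception reached the caller's except clause")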

Expand Down Expand Up @@ -1321,8 +1350,6 @@ def saveRawToFile(self, fname=None, cachefname=None):
logger.info("Saved file '%s'", fname)




def saveToFile(self, fname=None, cachefname=None):
"""
Save the current bibolamazi file object to disk.
@@ -1354,16 +1381,9 @@ def saveToFile(self, fname=None, cachefname=None):
          always silently overwritten (so be careful). The same
          applies to the cache file.
        """
        if fname is None:
            fname = self._fname

        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
            cachefname = self.cacheFileName()
        elif isinstance(cachefname, bool) and not cachefname:
            cachefname = ''
        else:
            pass # cachefname has a specific file name

        fname, cachefname = self._get_fname_and_cachefname(fname, cachefname)

        with codecs.open(fname, 'w', BIBOLAMAZI_FILE_ENCODING) as f:
            f.write(self._header)
            f.write(self._config)
@@ -1385,26 +1405,52 @@ def saveToFile(self, fname=None, cachefname=None):
            #
            # Write to bibtex output
            #
            #w = outputbibtex.Writer()
            #w.write_stream(self._bibliographydata, f)
            #
            #f.write(self._bibliographydata.to_string('bibtex'))
            #
            w = outputbibtex.Writer()
            f.write(w.to_string(self._bibliographydata))

        logger.info("Updated output file '%s'", fname)
        logger.info("✨ Updated output file '%s'", fname)

        self.saveCache(cachefname=cachefname)

    def _get_fname_and_cachefname(self, fname, cachefname):
        if fname is None:
            fname = self._fname

        if cachefname is None or (isinstance(cachefname, bool) and cachefname):
            cachefname = self.cacheFileName()
        elif isinstance(cachefname, bool) and not cachefname:
            cachefname = ''
        else:
            pass # cachefname has a specific file name

        return fname, cachefname

    def saveCache(self, cachefname=None):
        r"""
        Save the cache.  If `cachefname` is None, the cache file name is
        deduced from the current file name (see :py:meth:`fname()`).

        The argument `cachefname` is parsed exactly as in the method
        :py:meth:`saveToFile()`.

        .. note:: If you call :py:meth:`saveToFile()`, then the cache is
                  automatically saved and a separate call to `saveCache()`
                  is not necessary.

        Warning: This method will silently overwrite any existing file of
        the same name.
        """

        _, cachefname = self._get_fname_and_cachefname(None, cachefname)

        # if we have cache to save, save it
        if (cachefname and self._user_cache and self._user_cache.hasCache()):
            try:
                with open(cachefname, 'wb') as f:
                    logger.debug("Writing cache to file %s", cachefname)
                    logger.debug("Writing cache to file ‘%s’", cachefname)
                    self._user_cache.saveCache(f)
            except IOError as e:
                logger.debug("Couldn't save cache to file '%s'.", cachefname)
                logger.debug("Error saving cache to file ‘%s’: %s", cachefname, e)





45 changes: 43 additions & 2 deletions bibolamazi/filters/arxiv.py
@@ -329,6 +329,9 @@ def __init__(self,

self.warn_journal_ref = butils.getbool(warn_journal_ref)

self.summary_info_mismatch = None
self.summary_published = None

if self.unpublished_mode == MODE_EPRINT and 'journal' in self.strip_unpublished_fields:
if self.arxiv_journal_name and self.arxiv_journal_name != _default_arxiv_journal_name:
raise ValueError(
@@ -364,6 +367,39 @@ def prerun(self, bibolamazifile):
logger.debug("arxiv prerun(): re-validating arxiv info cache")
bibolamazifile.cacheAccessor(arxivutil.ArxivInfoCacheAccessor).revalidate(bibolamazifile)

# initialize "summary" messages
#
# summary_info_mismatch --- entries for which info fetched from
# arXiv.org don't match the provided bibtex entry
self.summary_info_mismatch = []
# summary_published --- entries that refer to the arXiv but
# which are in fact published in some other venue
self.summary_published = []


def postrun(self, bibolamazifile):
# Save the cache to disk immediately. This action is useful in case
# there is an exception in a later filter, which would cause the cache
# not to be saved in a future run.
bibolamazifile.saveCache()

# present any "summaries":

# -- for published entries
logger.warning(
"The following arXiv-only bibliographic {entryname} have been published in "
"other {venuename}:\n"
.format(entryname='entry' if len(self.summary_published) == 1 else 'entries',
venuename='venue' if len(self.summary_published) == 1 else 'venues')
+ "\n".join(
"- {key} ({arxivid}) → {doiurl}".format(
key='‘'+key+'’',
arxivid=arxivinfo['arxivid'],
doiurl='https://doi.org/'+arxivinfo['doi'],
)
for key, arxivinfo in self.summary_published
)
)

def filter_bibentry(self, entry):
#
@@ -399,8 +435,13 @@ def filter_bibentry(self, entry):
        if (self.warn_journal_ref and not we_are_published and arxivinfo['doi']):
            # we think we are not published but we actually are, as reported by
            # the arXiv.org API.  This could be because the authors published
            # their paper in the meantime.
            logger.warning("arxiv: Entry `%s' refers to arXiv version of published entry with DOI %r",
                           entry.key, arxivinfo['doi'])
            #logger.warning(
            #    "arxiv: Entry `%s' refers to arXiv version of published entry with DOI %r",
            #    entry.key, arxivinfo['doi']
            #)
            self.summary_published.append(
                (entry.key, arxivinfo,)
            )

        logger.longdebug("arXiv: entry %s: published=%r, mode=%r", entry.key, we_are_published, mode)
39 changes: 33 additions & 6 deletions bibolamazi/filters/util/arxivutil.py
@@ -593,6 +593,8 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
        # with info from the arXiv API.
        needs_to_be_completed = []

        summary_info_mismatch = []

        #
        # Do a first scan through all the bibdata entries, and detect the API
        # information using only what we have (to figure out the arxiv
@@ -649,16 +651,22 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
                    entrydic[k]['primaryclass'][:len(primaryclass)] !=
                    primaryclass[:len(entrydic[k]['primaryclass'])]):
                    #
                    logger.warning("Conflicting primaryclass values for entry %s (%s): "
                                   "%s (given in bibtex) != %s (retrieved from the arxiv)",
                                   k, aid, entrydic[k]['primaryclass'], primaryclass)
                    summary_info_mismatch.append(
                        (k, aid, 'primaryclass', entrydic[k]['primaryclass'], primaryclass)
                    )
                    # logger.warning("Conflicting primaryclass values for entry %s (%s): "
                    #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                    #                k, aid, entrydic[k]['primaryclass'], primaryclass)
                else:
                    entrydic[k]['primaryclass'] = primaryclass

            if (doi and entrydic[k]['doi'] and entrydic[k]['doi'].lower() != doi.lower()):
                logger.warning("Conflicting doi values for entry %s (%s): "
                               "%s (given in bibtex) != %s (retrieved from the arxiv)",
                               k, aid, entrydic[k]['doi'], doi)
                summary_info_mismatch.append(
                    (k, aid, 'doi', entrydic[k]['doi'], doi)
                )
                # logger.warning("Conflicting doi values for entry %s (%s): "
                #                "%s (given in bibtex) != %s (retrieved from the arxiv)",
                #                k, aid, entrydic[k]['doi'], doi)
            else:
                entrydic[k]['doi'] = doi

Expand All @@ -669,6 +677,25 @@ def complete_cache(self, bibdata, arxiv_api_accessor):
logger.warning("Failed to fetch information from the arXiv for %d entries: %s",
len(fail_aids), joined)

# warning for info mismatch
if summary_info_mismatch:
logger.warning(
"Mismatch: info in bibtex ≠ info from arxiv.org\n" +
"\n".join(
"- ‘{key}’ ({arxivid}) [{field}]:\n"
" {value_from_bibtex} ≠ {value_from_arxivorg}"
.format(key=key,
arxivid=arxivid,
field=field,
value_from_bibtex='“'+value_from_bibtex+'”',
value_from_arxivorg='“'+value_from_arxivorg+'”',)
for key, arxivid, field, value_from_bibtex, value_from_arxivorg
in summary_info_mismatch
)
)
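The tuples collected in summary_info_mismatch have a fixed five-field shape, which the warning above unpacks. A quick sketch with invented values:

record = ('Doe2018Entangle', '1802.04567', 'doi',
          '10.1000/old.value', '10.1103/PhysRevLett.120.123456')
key, arxivid, field, value_from_bibtex, value_from_arxivorg = record
print("- ‘{}’ ({}) [{}]:\n  “{}” ≠ “{}”".format(
    key, arxivid, field, value_from_bibtex, value_from_arxivorg))
# - ‘Doe2018Entangle’ (1802.04567) [doi]:
#   “10.1000/old.value” ≠ “10.1103/PhysRevLett.120.123456”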




    def getArXivInfo(self, entrykey):
        """
