Support baselines in console reporter
Johannes Bechberger committed Jul 24, 2019
1 parent 351b7bb commit 6449a35
Showing 6 changed files with 172 additions and 50 deletions.
34 changes: 32 additions & 2 deletions doc/temci_report.rst
@@ -167,6 +167,16 @@ command line options of the same name (prefixed with ``console_``):

.. code:: yaml
# Matches the baseline block
baseline: ''
# Position of the baseline comparison:
# 'each': after each block
# 'after': after each cluster
# 'both': after each block and after each cluster
# 'instead': instead of the non-baselined report
baseline_position: each
# 'auto': report clusters (runs with the same description)
# and singles (clusters with a single entry, combined) separately
# 'single': report all clusters together as one
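The four positions can be read as a small dispatch. A rough, runnable sketch of that ordering, with hypothetical printer names standing in for the reporter methods shown further down in ``temci/report/report.py``:

.. code:: python

    def report_cluster(blocks, baselines, position: str):
        # hypothetical printers standing in for the real reporter methods
        stats = lambda b: print("stats for", b)
        compare = lambda b, base: print(" ", b, "vs baseline", base)

        if position != "instead":                      # plain per-block statistics
            for block in blocks:
                stats(block)
                if position in ("each", "both"):       # comparison right after the block
                    for base in baselines:
                        if base != block:
                            compare(block, base)
        if position in ("after", "both", "instead"):   # comparisons once the cluster is done
            for block in blocks:
                for base in baselines:
                    if base != block:
                        compare(block, base)

    report_cluster(["sleep 0.5", "sleep 1"], ["sleep 0.5"], "each")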
@@ -189,15 +199,35 @@ Output for a simple benchmark (with ``--properties utime``):
Report for single runs
sleep 0.5 ( 2 single benchmarks)
utime mean = 1.(211)m, deviation = 33.27828%
sleep 1 ( 2 single benchmarks)
utime mean = 1.(172)m, deviation = 29.91891%
Equal program blocks
sleep 0.5 ⟷ sleep 1
utime confidence = 95%, speed up = 3.26%
Or using ``sleep 0.5`` as a baseline (``--console_baseline "sleep 0.5"``):

.. code:: sh
Report for single runs
sleep 0.5 ( 5 single benchmarks)
utime mean = (1).(661)m, deviation = 18.91399%
sleep 1 ( 5 single benchmarks)
utime mean = (1).(138)m, deviation = 37.83985%
sleep 1 ( 5) with baseline sleep 0.5 ( 5)
utime mean = (68).(554)%, confidence = 9%, dev = 37.83985%, 18.91399%
geometric mean of relative mean = 68.554%
Uncertain program blocks
sleep 0.5 ⟷ sleep 1
utime confidence = 9%, speed up = 31.45%
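The numbers in the baseline lines are, roughly: the block's mean divided by the baseline's mean (printed as a percentage), the tester's p-value printed as the confidence, the relative deviations of both blocks, and finally the geometric mean over the per-property relative means. A minimal sketch of this arithmetic with made-up measurements, using a plain t-test where the reporter uses the configured tester:

.. code:: python

    import numpy as np
    from scipy import stats

    block = [1.10, 1.20, 1.05, 1.25, 1.09]     # assumed 'sleep 1' utimes
    baseline = [1.60, 1.75, 1.55, 1.70, 1.71]  # assumed 'sleep 0.5' utimes

    rel_mean = np.mean(block) / np.mean(baseline)    # the "mean = ...%" column
    p_val = stats.ttest_ind(block, baseline).pvalue  # the "confidence" column
    rel_devs = (np.std(block) / np.mean(block),      # the two "dev" columns
                np.std(baseline) / np.mean(baseline))
    gmean = stats.gmean([rel_mean])                  # one entry per shared property
    print("{:.3%}".format(gmean))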
The sample ``run_output.yaml`` was created via ``temci short exec 'sleep 0.5' 'sleep 1' --runs 5 --runner rusage``:

.. code:: yaml
141 changes: 101 additions & 40 deletions temci/report/report.py
@@ -14,10 +14,13 @@

import itertools

+from scipy import stats

from temci.report.stats import TestedPairsAndSingles, BaseStatObject, TestedPair, TestedPairProperty, StatMessage, \
    StatMessageType, Single, SingleProperty, SinglesProperty
from temci.report.testers import TesterRegistry, Tester
from temci.report.rundata import RunDataStatsHelper, RunData, ExcludedInvalidData
+from temci.run.run_driver import filter_runs
from temci.utils.sudo_utils import chown
from temci.utils.typecheck import *
from temci.utils.registry import AbstractRegistry, register
@@ -113,7 +116,11 @@ def report(self):
"'single': report all clusters together as one, "
"'cluster': report all clusters separately, "
"'both': append the output of 'cluster' to the output of 'single'"),
"report_errors": Bool() // Default(True) // Description("Report on the failing blocks")
"report_errors": Bool() // Default(True) // Description("Report on the failing blocks"),
"baseline": Str() // Default("") // Description("Matches the baseline block"),
"baseline_position": ExactEither("each", "after", "both", "instead") // Default("each")
// Description("Position of the baseline comparison: each: after each block, after: after each "
"cluster, both: after each and after cluster, instead: instead of the non baselined")
}))
class ConsoleReporter(AbstractReporter):
    """
@@ -132,22 +139,26 @@ def report(self, with_tester_results: bool = True, to_string: bool = False) -> t

        with_tester_results = with_tester_results and self.misc["with_tester_results"]

+       baselines = filter_runs(self.stats_helper.runs, self.misc["baseline"]) if self.misc["baseline"] else []
+
        def string_printer(line: str, **args):
            output[0] += str(line) + "\n"

        with click.open_file(self.misc["out"], mode='w') as f:
            print_func = string_printer if to_string else lambda x: print(x, file=f)
            if self.misc["mode"] == "auto":
                single, clusters = self.stats_helper.get_description_clusters_and_single()
-               self._report_cluster("single runs", single, print_func, with_tester_results)
-               self._report_clusters(clusters, print_func, with_tester_results)
+               self._report_cluster("single runs", single, print_func, with_tester_results, baselines)
+               self._report_clusters(clusters, print_func, with_tester_results, baselines)
            if self.misc["mode"] in ["both", "single"]:
                self._report_cluster("all runs",
                                     self.stats_helper.runs,
                                     print_func,
-                                    with_tester_results)
+                                    with_tester_results,
+                                    baselines)
            if self.misc["mode"] in ["both", "cluster"]:
-               self._report_clusters(self.stats_helper.get_description_clusters(), print_func, with_tester_results)
+               self._report_clusters(self.stats_helper.get_description_clusters(), print_func, with_tester_results,
+                                     baselines)
            print_func("")
            if self.misc["report_errors"] and len(self.stats_helper.errorneous_runs) > 0:
                self._report_errors(self.stats_helper.errorneous_runs, print_func)
@@ -156,65 +167,112 @@ def string_printer(line: str, **args):
            return output[0]

    def _report_clusters(self, clusters: t.Dict[str, t.List[RunData]], print_func: t.Callable[[str], None],
-                        with_tester_results: bool):
+                        with_tester_results: bool, baselines: t.List[RunData]):
        for n, c in clusters.items():
            self._report_cluster(n,
                                 c,
                                 print_func,
-                                with_tester_results)
+                                with_tester_results,
+                                baselines)

-   def _report_cluster(self, description: str, items: t.List[RunData], print_func: t.Callable[[str], None],
-                       with_tester_results: bool):
-       if not items:
+   def _report_cluster(self, description: str, blocks: t.List[RunData], print_func: t.Callable[[str], None],
+                       with_tester_results: bool, baselines: t.List[RunData]):
+       if not blocks:
            return
        print_func("Report for {}".format(description))
-       descr_size = max(len(prop) for block in items for prop in block.properties)
-       for block in items:
-           assert isinstance(block, RunData)
-           print_func("{descr:<20} ({num:>5} single benchmarks)"
-                      .format(descr=block.description(), num=len(block.data[block.properties[0]])))
-           for prop in sorted(block.properties):
-               mean = np.mean(block[prop])
-               std = np.std(block[prop])
-               mean_str = str(FNumber(mean, abs_deviation=std))
-               dev = "{:>5.5%}".format(std / mean) if mean != 0 else "{:>5.5}".format(std)
-               print_func("\t {{prop:<{}}} mean = {{mean:>15s}}, deviation = {{dev}}".format(descr_size)
-                          .format(
-                              prop=prop, mean=mean_str,
-                              dev=dev))
-           print_func("")
+       descr_size = max(len(prop) for block in blocks for prop in block.properties)
+       self._report_blocks(blocks, print_func, baselines, descr_size)
        if with_tester_results:
-           stats_helper = RunDataStatsHelper(items, self.stats_helper.tester,
+           stats_helper = RunDataStatsHelper(blocks, self.stats_helper.tester,
                                              property_descriptions=self.stats_helper.property_descriptions)
            self._report_list("Equal program blocks",
-                             stats_helper.get_evaluation(with_equal=True,
+                             stats_helper.get_evaluation(blocks=blocks, with_equal=True,
                                                          with_uncertain=False,
                                                          with_unequal=False),
                              print_func, descr_size)
            self._report_list("Unequal program blocks",
-                             stats_helper.get_evaluation(with_equal=False,
+                             stats_helper.get_evaluation(blocks=blocks, with_equal=False,
                                                          with_uncertain=False,
                                                          with_unequal=True),
                              print_func, descr_size)
            self._report_list("Uncertain program blocks",
-                             stats_helper.get_evaluation(with_equal=False,
+                             stats_helper.get_evaluation(blocks=blocks, with_equal=False,
                                                          with_uncertain=True,
                                                          with_unequal=False),
                              print_func, descr_size)

+   def _report_blocks(self, blocks: t.List[RunData], print_func: t.Callable[[str], None],
+                      baselines: t.List[RunData], descr_size: int):
+       if self.misc["baseline_position"] != "instead":
+           for block in blocks:
+               assert isinstance(block, RunData)
+               self._report_block(block, print_func, baselines, descr_size)
+       if self.misc["baseline_position"] in ["after", "both", "instead"]:
+           # compare every block of the cluster against every baseline
+           for block in blocks:
+               for baseline in baselines:
+                   if baseline == block:
+                       continue
+                   self._report_block_with_baseline(block, print_func, baseline, descr_size)
+           print_func("")

+   def _report_block(self, block: RunData, print_func: t.Callable[[str], None],
+                     baselines: t.List[RunData], descr_size: int):
+       print_func("{descr:<20} ({num:>5} single benchmarks)"
+                  .format(descr=block.description(), num=len(block.data[block.properties[0]])))
+       for prop in sorted(block.properties):
+           mean = np.mean(block[prop])
+           std = np.std(block[prop])
+           mean_str = str(FNumber(mean, abs_deviation=std))
+           dev = "{:>5.5%}".format(std / mean) if mean != 0 else "{:>5.5}".format(std)
+           print_func("\t {{prop:<{}}} mean = {{mean:>15s}}, deviation = {{dev:>11s}}".format(descr_size)
+                      .format(
+                          prop=prop, mean=mean_str,
+                          dev=dev))
+       print_func("")
+       if self.misc["baseline_position"] in ["each", "both", "instead"]:
+           for baseline in baselines:
+               if baseline == block:
+                   continue
+               self._report_block_with_baseline(block, print_func, baseline, descr_size)
+               print_func("")

+   def _report_block_with_baseline(self, block: RunData, print_func: t.Callable[[str], None], baseline: RunData,
+                                   descr_size: int):
+       print_func("{descr:<20} ({num:>5}) with baseline {descr2:<20} ({num2:>5})"
+                  .format(descr=block.description(), num=len(block.data[block.properties[0]]),
+                          descr2=baseline.description(), num2=len(baseline.data[baseline.properties[0]])))
+       combined_props = set(block.properties) & set(baseline.properties)
+       tester = TesterRegistry.get_tester()
+       for prop in sorted(combined_props):
+           mean = np.mean(block[prop])
+           std = np.std(block[prop])
+           base_mean = baseline.get_single_properties()[prop].mean()
+           base_std = baseline.get_single_properties()[prop].std()
+           # the mean is reported relative to the baseline's mean
+           mean_str = str(FNumber(mean / base_mean, abs_deviation=std / base_mean, is_percent=True))
+           dev = "{:>5.5%}".format(std / mean) if mean != 0 else "{:>5.5}".format(std)
+           print_func("\t {{prop:<{}}} mean = {{mean:>15s}}, confidence = {{conf:>5.0%}}, dev = {{dev:>11s}}, "
+                      "{{dbase:>11s}}".format(descr_size)
+                      .format(
+                          prop=prop,
+                          mean=mean_str,
+                          dev=dev,
+                          conf=tester.test(block[prop], baseline[prop]),
+                          dbase="{:>5.5%}".format(base_std / base_mean) if base_mean != 0 else "{:>5.5}".format(base_std)))
+       gmean = stats.gmean([(block.get_single_properties()[prop].mean() / baseline.get_single_properties()[prop].mean())
+                            for prop in combined_props])
+       print_func("geometric mean of relative mean = {:>15}".format(str(FNumber(gmean, is_percent=True))))

    def _report_list(self, title: str, items: t.List[dict], print_func: t.Callable[[str], None], descr_size: int):
        if len(items) != 0:
            print_func(title)
            for item in items:
                print_func("\t {} ⟷ {}".format(item["data"][0].description(),
                                               item["data"][1].description()))
                for prop in sorted(item["properties"]):
                    prop_data = item["properties"][prop]
                    perc = prop_data["p_val"]
                    if prop_data["unequal"]:
                        perc = 1 - perc
-                   print_func("\t\t {{descr:<{}}} confidence = {{perc:>10.0%}}, speed up = {{speed_up:>10.2%}}"
+                   print_func("\t\t {{descr:<{}}} confidence = {{perc:>5.0%}}, speed up = {{speed_up:>10.2%}}"
                               .format(descr_size).format(descr=prop_data["description"], perc=perc,
                                                          speed_up=prop_data["speed_up"]))
            print_func("")
@@ -1827,17 +1885,20 @@ def _column(self, single: Single, spec: t.Tuple[str, str]) -> t.Union[str, int,
            raise SyntaxError("No such property {}".format(long_prop))
        return self._column_property(single.properties[long_prop], spec[1], spec[2])

-   def _column_property(self, single: SingleProperty, modifier: str, opts: t.List[str]) -> t.Union[str, int, float]:
+   def _column_property(self, single: SingleProperty, modifier: str, opts: t.List[str],
+                        baseline: SingleProperty = None) -> t.Union[str, int, float]:
        mod = {
-           "mean": lambda: single.mean(),
-           "stddev": lambda: single.std_dev(),
-           "property": lambda: single.property,
-           "description": lambda: single.description(),
-           "min": lambda: single.min(),
-           "max": lambda: single.max(),
-           "stddev per mean": lambda: single.std_dev_per_mean()
+           "mean": lambda single: single.mean(),
+           "stddev": lambda single: single.std_dev(),
+           "property": lambda single: single.property,
+           "description": lambda single: single.description(),
+           "min": lambda single: single.min(),
+           "max": lambda single: single.max(),
+           "stddev per mean": lambda single: single.std_dev_per_mean()
        }
-       num = mod[modifier]()
+       num = mod[modifier](single)
+       if baseline:
+           num = num / baseline.mean()
        return FNumber(num,
                       abs_deviation=single.std_dev(),
                       is_percent=("%" in opts),
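For the table-reporter change above: when a baseline property is passed, the computed column value is divided by the baseline's mean, turning e.g. a mean column into a ratio. A standalone sketch of that normalization with made-up numbers:

.. code:: python

    import numpy as np

    block = [1.10, 1.15, 1.05]     # assumed measurements of one property
    baseline = [2.20, 2.25, 2.15]  # assumed baseline measurements

    num = np.mean(block)           # modifier "mean"
    num = num / np.mean(baseline)  # applied when a baseline is given
    print("{:.1%}".format(num))    # => 50.0%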
13 changes: 8 additions & 5 deletions temci/report/rundata.py
@@ -635,7 +635,8 @@ def has_error(self, program_id: int) -> bool:
""" Is there an error recorded for the program with the given id? """
return self.runs[program_id + self.external_count].has_error()

def get_evaluation(self, with_equal: bool, with_unequal: bool, with_uncertain: bool) -> dict:
def get_evaluation(self, with_equal: bool, with_unequal: bool, with_uncertain: bool,
blocks: t.List[RunData] = None) -> dict:
"""
Structure of the returned list items::
@@ -652,14 +653,16 @@ def get_evaluation(self, with_equal: bool, with_unequal: bool, with_uncertain: b
        :param with_equal: include tuples with at least one "equal" property
        :param with_unequal: ... unequal property
        :param with_uncertain: include also uncertain properties
+       :param blocks: blocks to compare (defaults to all runs)
        :return: list of tuples for which at least one property matches the criteria
        """
+       blocks = blocks or self.runs
        arr = []
-       for i in range(0, len(self.runs) - 1):
-           for j in range(i + 1, len(self.runs)):
-               if self.runs[i].discarded or self.runs[j].discarded:
+       for i in range(0, len(blocks) - 1):
+           for j in range(i + 1, len(blocks)):
+               if blocks[i].discarded or blocks[j].discarded:
                    continue
-               data = (self.runs[i], self.runs[j])
+               data = (blocks[i], blocks[j])
                props = {}
                for prop in self.properties():
                    map = {"p_val": self.tester.test(data[0][prop], data[1][prop]),
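``get_evaluation`` still enumerates every unordered pair once, but now over the passed ``blocks`` instead of all runs. A standalone sketch of that enumeration:

.. code:: python

    blocks = ["sleep 0.5", "sleep 1", "sleep 2"]
    pairs = [(blocks[i], blocks[j])
             for i in range(len(blocks) - 1)
             for j in range(i + 1, len(blocks))]
    print(pairs)  # [('sleep 0.5', 'sleep 1'), ('sleep 0.5', 'sleep 2'), ('sleep 1', 'sleep 2')]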
8 changes: 5 additions & 3 deletions temci/report/stats.py
@@ -770,6 +770,10 @@ def std_dev(self) -> float:
""" Standard deviation of the measurements """
return np.std(self.array)

def std(self) -> float:
""" Standard deviation of the measurements """
return np.std(self.array)

    def std_devs(self) -> t.Tuple[float, float]:
        """
        Calculates the standard deviation of elements <= mean and of the elements > mean.
@@ -942,9 +946,7 @@ def __init__(self, first: t.Union[RunData, Single], second: t.Union[RunData, Sin
""" First of the two compared single objects """
self.second = Single(second) # type: Single
""" Second of the two compared single objects """
self.tester = tester or TesterRegistry.get_for_name(TesterRegistry.get_used(), # type: Tester
Settings()["stats/tester"],
Settings()["stats/uncertainty_range"])
self.tester = tester or TesterRegistry.get_tester()
""" Used statistical tester for the comparisons """
self.properties = {} # type: t.Dict[str, TestedPairProperty]
""" TestedPairProperty objects for each shared property of the inherited Single objects """
11 changes: 11 additions & 0 deletions temci/utils/registry.py
@@ -44,6 +44,17 @@ def get_for_name(cls, name: str, *args, **kwargs) -> t.Any:
        misc_settings = Settings()["/".join([cls.settings_key_path, name + "_misc"])]
        return cls.registry[name](misc_settings, *args, **kwargs)

+    @classmethod
+    def get_tester(cls) -> 'Tester':
+        """
+        Returns the tester that is configured in the settings.
+
+        :return: tester instance
+        """
+        return cls.get_for_name(cls.get_used(),
+                                Settings()["stats/tester"],
+                                Settings()["stats/uncertainty_range"])

    @classmethod
    def get_used(cls) -> t.Union[str, t.List[str]]:
        """
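A small usage sketch of the new helper (the sample data is made up); ``test`` returns the p-value that the console reporter prints as the confidence:

.. code:: python

    from temci.report.testers import TesterRegistry

    tester = TesterRegistry.get_tester()  # tester configured in the settings
    p_val = tester.test([0.51, 0.49, 0.50], [1.02, 0.98, 1.01])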
15 changes: 15 additions & 0 deletions tests/test_report.py
@@ -56,6 +56,21 @@ def test_properties_regexp():
assert "p456" in out and "z111" not in out


def test_console_baseline():
run_temci(r"report in.yaml --console_baseline base", files={
"in.yaml": [
{
"attributes": {"description": "XYZ"},
"data": {"p456": [1], "z111": [2]}
},
{
"attributes": {"description": "base"},
"data": {"p456": [1], "z111": [2]}
}
]
}).out
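The test above only smoke-runs the reporter. A hypothetical assertion one might add, assuming the "with baseline" phrasing from the format string in report.py stays stable:

.. code:: python

    out = run_temci(r"report in.yaml --console_baseline base", files={
        "in.yaml": [{"attributes": {"description": "XYZ"}, "data": {"p456": [1]}},
                    {"attributes": {"description": "base"}, "data": {"p456": [1]}}]
    }).out
    assert "with baseline" in out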


def test_all_reporters():
    from temci.report.report import ReporterRegistry
    for name, rep in ReporterRegistry.registry.items():
