feat: allow 'signed' option to uncertainty types

misho104 · Mar 10, 2019 · 0ec350f · 0ec350f
1 parent d97b2a0
commit 0ec350f
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 83 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -13,8 +13,10 @@
     "imul",
     "inplace",
     "interp",
+    "isdisjoint",
     "isnan",
     "issubdtype",
+    "iterrows",
     "lxml",
     "mstw",
     "mypy",

diff --git a/docs/use_as_package.rst b/docs/use_as_package.rst
@@ -141,15 +141,17 @@ The dict has six keys: ``document``, ``attributes`` (optional), ``columns``, ``r
   This list defines the cross-section values.
   Each element is a dictionary and constructs a `ValueInfo` object.
   The dictionary has possibly the keys ``column``, ``unc``, ``unc+``, ``unc-``, and ``attributes``.
-  ``column`` is mandatory and its value is one of the ``name`` of ``columns``, where the column is used as the central value of cross-section.
-  ``attributes`` is optional and its value is a :typ:`dict(str, Any)`; it is used to construct a `CrossSectionAttributes` object, overriding the file-wide default values.
+  Among these keys, ``column`` is mandatory; its value must be one of the ``name`` values of ``columns``, and that column is used as the central value of the cross-section.
+  The key ``attributes`` is optional; its value is a dictionary :typ:`dict(str, Any)` that overrides the file-wide default values (explained above) when constructing the `CrossSectionAttributes` object.
 
   The other three keys are used to specify uncertainties.
   ``unc`` specifies symmetric uncertainty, while a pair of ``unc+`` and ``unc-`` specifies asymmetric uncertainty; ``unc`` will not be present together with ``unc+`` or ``unc-``.
   Each value of ``unc``, ``unc+``, and ``unc-`` is *a list of dictionaries*, :typ:`list(dict(str, str))`.
   Each element of the list, being a dictionary with two keys ``column`` and ``type``, describes one source of uncertainties.
-  The value for ``column`` is one of the ``name`` of ``columns``, where the column is used as the source.
-  The value for ``type`` specifies the type of uncertainty; for details see the API document of `ValueInfo`.
+  The value for ``column`` is either one of the ``name`` values of ``columns`` or a list of such names.
+  If a single name is specified, that column is used as the source.
+  If a list is specified, the entry with the largest absolute value among those columns, evaluated row by row, is used as the source.
+  The value for ``type`` specifies the type of uncertainty; the possible options and further details are found in the API document of `ValueInfo`.
 
 
 How to use own tables

diff --git a/susy_cross_section/base/info.py b/susy_cross_section/base/info.py
@@ -15,12 +15,11 @@
 
 from __future__ import absolute_import, division, print_function  # py2
 
-import itertools
 import json
 import logging
 import pathlib  # noqa: F401
 import sys
-from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Union
+from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union
 
 from susy_cross_section.utility import TypeCheck as TC
 
@@ -34,6 +33,8 @@
 logging.basicConfig(level=logging.WARNING)
 logger = logging.getLogger(__name__)
 
+UncSpecType = Tuple[List[str], str]
+
 
 class ColumnInfo(object):
     """Stores information of a column.
@@ -267,10 +268,12 @@ class ValueInfo(object):
 
     Attributes
     ----------
-    column: str
-        Name of the column that stores this value.
+    column: str or List[str]
+        Names of the column that stores this value.
 
-        This must be match one of the :attr:`ColumnInfo.name` in the file.
+        The string, or each element of the list, must match one of the
+        :attr:`ColumnInfo.name` in the file. If multiple columns are specified,
+        the largest value among the columns (compared in each row) is used.
     attributes: dict (str, Any)
         Physical information annotated to this value.
     unc_p : dict (str, str)
@@ -286,8 +289,8 @@ class ValueInfo(object):
         - ``"absolute"`` for absolute uncertainty, where the unit of the column
           must be the same as that of the value column up to a factor.
 
-        The unit of the uncertainty column should be consistent with the unit
-        of the value column.
+        - ``"absolute,signed"`` or ``"relative,signed"`` for absolute/relative
+          uncertainty where only the entries with the matching sign are used,
+          i.e., positive entries for ``unc+`` and negative ones for ``unc-``.
     unc_m : dict(str, str)
         The sources of "minus" uncertainties.
 
@@ -300,25 +303,24 @@ def __init__(
         self,
         column="",  # type: str
         attributes=None,  # type:MutableMapping[str, Any]
-        unc_p=None,  # type: MutableMapping[str, str]
-        unc_m=None,  # type: MutableMapping[str, str]
+        unc_p=None,  # type: List[UncSpecType]
+        unc_m=None,  # type: List[UncSpecType]
     ):
         # type: (...)->None
         self.column = column
         self.attributes = attributes or {}
-        self.unc_p = unc_p or {}
-        self.unc_m = unc_m or {}
+        self.unc_p = unc_p or []
+        self.unc_m = unc_m or []
 
     def validate(self):
         # type: ()->None
         """Validate the content."""
         assert isinstance(self.column, str), "ValueInfo.column must be string."
         assert self.column, "ValueInfo.column is missing."
         assert TC.is_dict(self.attributes, key_type=str), "attributes not dict[str]."
-        for title, unc in [("unc+", self.unc_p), ("unc-", self.unc_m)]:
-            assert TC.is_dict(unc, key_type=str), title + " not dict[str]."
-            for v in unc.values():
-                assert v in self._valid_uncertainty_types, "invalid unc type: %s" % v
+        for col, t in [*self.unc_p, *self.unc_m]:
+            assert TC.is_list(col, element_type=str)
+            assert t in self._valid_uncertainty_types, "invalid unc type: %s" % t
 
     @classmethod
     def from_json(cls, json_obj):
@@ -355,13 +357,18 @@ def from_json(cls, json_obj):
             if unc_def is None:
                 logger.warning("Uncertainty (%s) missing for %s.", key_name, obj.column)
                 continue
-            if not TC.is_list(unc_def, element_type=Mapping):
-                raise TypeError("%s (%s) is not a list of dicts.", key_name, obj.column)
+            assert TC.is_list(unc_def, Mapping), "bad %s/%s" % (key_name, obj.column)
             try:
-                unc_dict = {source["column"]: source["type"] for source in unc_def}
-                setattr(obj, attr_name, unc_dict)
+                unc_list = [
+                    (
+                        src["column"] if TC.is_list(src["column"]) else [src["column"]],
+                        src["type"],
+                    )
+                    for src in unc_def
+                ]
             except KeyError as e:
                 raise ValueError("%s missing in %s (%s)", key_name, obj.column, *e.args)
+            setattr(obj, attr_name, unc_list)
 
         if not (obj.unc_p and obj.unc_m):
             logger.warning("Value %s lacks uncertainties.", obj.column)
@@ -380,8 +387,8 @@ def to_json(self):
         return {
             "column": self.column,
             "attributes": self.attributes,
-            "unc+": [{"column": k, "type": v} for k, v in self.unc_p.items()],
-            "unc-": [{"column": k, "type": v} for k, v in self.unc_m.items()],
+            "unc+": [{"column": c, "type": t} for c, t in self.unc_p],
+            "unc-": [{"column": c, "type": t} for c, t in self.unc_m],
         }
 
 
@@ -458,8 +465,9 @@ def validate(self):
             assert p.column in names_dict, "Unknown column name: %s" % p.column
         for v in self.values:
             assert v.column in names_dict, "Unknown column name: %s" % v.column
-            for u_col in itertools.chain(v.unc_p.keys(), v.unc_m.keys()):
-                assert u_col in names_dict, "Unknown column name: %s" % u_col
+            for col_list, _ in [*v.unc_p, *v.unc_m]:
+                for c in col_list:
+                    assert c in names_dict, "Unknown column name: %s" % c
 
     @classmethod
     def load(cls, source):

diff --git a/susy_cross_section/base/table.py b/susy_cross_section/base/table.py
@@ -21,14 +21,15 @@
     MutableMapping,
     Optional,
     Sequence,
+    Set,
     TypeVar,
     Union,
     cast,
 )
 
 import pandas
 
-from susy_cross_section.base.info import FileInfo
+from susy_cross_section.base.info import FileInfo, UncSpecType, ValueInfo
 from susy_cross_section.utility import Unit
 
 if sys.version_info[0] < 3:  # py2
@@ -171,68 +172,71 @@ def _parse_data(self):
         # type: ()->MutableMapping[str, TableT]
         """Load and prepare data from the specified paths."""
         tables = {}  # type: MutableMapping[str, TableT]
-        for value_info in self.info.values:
-            name = value_info.column
-            value_unit = self.info.get_column(name).unit
-            parameters = self.info.parameters
-            data = self.raw_data.copy()
-
-            # set index by the quantized values
-            def quantize(data_frame, granularity):
-                # type: (pandas.DataFrame, float)->pandas.DataFrame
-                return (data_frame / granularity).apply(round) * granularity
-
-            for p in parameters:
-                if p.granularity:
-                    data[p.column] = quantize(data[p.column], p.granularity)
-
-            data.set_index([p.column for p in parameters], inplace=True)
-
-            # define functions to apply to DataFrame to get uncertainty.
-            up_factors = self._uncertainty_factors(Unit(value_unit), value_info.unc_p)
-            um_factors = self._uncertainty_factors(Unit(value_unit), value_info.unc_m)
 
-            def unc_p(row, name=name, unc_sources=value_info.unc_p, factors=up_factors):
-                # type: (Any, str, Mapping[str, str], Mapping[str, float])->float
-                return self._combine_uncertainties(row, name, unc_sources, factors)
-
-            def unc_m(row, name=name, unc_sources=value_info.unc_m, factors=um_factors):
-                # type: (Any, str, Mapping[str, str], Mapping[str, float])->float
-                return self._combine_uncertainties(row, name, unc_sources, factors)
+        def calc(row, unc_sources, sign):
+            # type: (pandas.Series, List[UncSpecType], int)->float
+            """Calculate uncertainty from a row in normalized dataframe."""
+            unc_components = []  # type: List[float]
+            for source, unc_type in unc_sources:  # iterate over sources
+                if "signed" in unc_type.split(","):
+                    # use only the correct-signed uncertainties
+                    unc_candidates = [abs(row[c]) for c in source if row[c] * sign > 0]
+                else:
+                    unc_candidates = [abs(row[c]) for c in source]
+                unc_components.append(max(unc_candidates) if unc_candidates else 0)
+            return sum(i ** 2 for i in unc_components) ** 0.5
 
+        for value_info in self.info.values:
+            name = value_info.column
+            data = self._prepare_normalized_data(value_info)
             tables[name] = cast(TableT, BaseTable(file=self, name=name))
             tables[name]["value"] = data[name]
-            tables[name]["unc+"] = data.apply(unc_p, axis=1)
-            tables[name]["unc-"] = data.apply(unc_m, axis=1)
+            for key, row in data.iterrows():
+                tables[name].loc[key, "unc+"] = calc(row, value_info.unc_p, +1)
+                tables[name].loc[key, "unc-"] = calc(row, value_info.unc_m, -1)
+
         return tables
 
-    def _uncertainty_factors(self, value_unit, uncertainty_info):
-        # type: (Unit, Mapping[str, str])->Mapping[str, float]
-        """Return the factor of uncertainty column relative to value column."""
-        factors = {}
-        for source_name, source_type in uncertainty_info.items():
-            unc_unit = Unit(self.info.get_column(source_name).unit)
-            if source_type == "relative":
-                unc_unit *= value_unit
-            # unc / unc_unit == "number in the table"
-            # we want to get "unc / value_unit" = "number in the table"  * unc_unit / value_unit
-            factors[source_name] = float(unc_unit / value_unit)
-        return factors
-
-    @staticmethod
-    def _combine_uncertainties(row, value_name, unc_sources, factors):
-        # type: (Any, str, Mapping[str, str], Mapping[str, float])->float
-        """Return absolute combined uncertainty."""
-        uncertainties = []
-        for name, typ in unc_sources.items():
-            if typ == "relative":
-                uncertainties.append(row[name] * factors[name] * row[value_name])
-            elif typ == "absolute":
-                uncertainties.append(row[name] * factors[name])
+    def _prepare_normalized_data(self, value_info):
+        # type: (ValueInfo)->pandas.DataFrame
+        """Quantize parameters and normalize columns to value_info.column."""
+        data = self.raw_data.copy()
+
+        def quantize(data_frame, granularity):
+            # type: (pandas.DataFrame, float)->pandas.DataFrame
+            return (data_frame / granularity).apply(round) * granularity
+
+        # set index by the quantized values
+        for p in self.info.parameters:
+            if p.granularity:
+                data[p.column] = quantize(data[p.column], p.granularity)
+        data.set_index([p.column for p in self.info.parameters], inplace=True)
+
+        # collect columns to use
+        abs_columns, rel_columns = set(), set()  # type: Set[str], Set[str]
+        for unc_cols, unc_type in [*value_info.unc_p, *value_info.unc_m]:
+            is_relative = "relative" in unc_type.split(",")
+            for c in unc_cols:
+                (rel_columns if is_relative else abs_columns).add(c)
+        assert abs_columns.isdisjoint(rel_columns)
+
+        name = value_info.column
+        value_unit = Unit(self.info.get_column(name).unit)
+        for col in data.columns:
+            if col == value_info.column:
+                pass
+            elif col in abs_columns:
+                # unc / unc_unit == "number in the table"
+                # we want to get "unc / value_unit"
+                # = "number in the table" * unc_unit / value_unit
+                unc_unit = Unit(self.info.get_column(col).unit)
+                data[col] = data[col] * float(unc_unit / value_unit)
+            elif col in rel_columns:
+                unc_unit = Unit(self.info.get_column(col).unit) * value_unit
+                data[col] = data[name] * data[col] * float(unc_unit / value_unit)
             else:
-                raise ValueError(typ)
-
-        return sum(x ** 2 for x in uncertainties) ** 0.5
+                data.drop(col, axis=1, inplace=True)
+        return data
 
     def validate(self):
         # type: ()->None