Skip to content

Commit

Permalink
feat: allow 'signed' option to uncertainty types
Browse files Browse the repository at this point in the history
  • Loading branch information
misho104 committed Mar 10, 2019
1 parent d97b2a0 commit 0ec350f
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 83 deletions.
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
"imul",
"inplace",
"interp",
"isdisjoint",
"isnan",
"issubdtype",
"iterrows",
"lxml",
"mstw",
"mypy",
Expand Down
10 changes: 6 additions & 4 deletions docs/use_as_package.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,17 @@ The dict has six keys: ``document``, ``attributes`` (optional), ``columns``, ``r
This list defines the cross-section values.
Each element is a dictionary and constructs a `ValueInfo` object.
The dictionary has possibly the keys ``column``, ``unc``, ``unc+``, ``unc-``, and ``attributes``.
``column`` is mandatory and its value is one of the ``name`` of ``columns``, where the column is used as the central value of cross-section.
``attributes`` is optional and its value is a :typ:`dict(str, Any)`; it is used to construct a `CrossSectionAttributes` object, overriding the file-wide default values.
Among these keys, ``column`` is mandatory; its value must be the ``name`` of one of the ``columns``, and that column is used as the central value of the cross section.
The value for ``attributes`` is a dictionary, :typ:`dict(str, Any)`; it overrides the file-wide default values (explained above) when the `CrossSectionAttributes` object is constructed.

The other three keys are used to specify uncertainties.
``unc`` specifies symmetric uncertainty, while a pair of ``unc+`` and ``unc-`` specifies asymmetric uncertainty; ``unc`` will not be present together with ``unc+`` or ``unc-``.
Each value of ``unc``, ``unc+``, and ``unc-`` is *a list of dictionaries*, :typ:`list(dict(str, str))`.
Each element of the list, being a dictionary with two keys ``column`` and ``type``, describes one source of uncertainties.
The value for ``column`` is one of the ``name`` of ``columns``, where the column is used as the source.
The value for ``type`` specifies the type of uncertainty; for details see the API document of `ValueInfo`.
The value for ``column`` is either the ``name`` of one of the ``columns`` or a list of such names.
If a single name is specified, that column is used as the source.
If a list is specified, the largest value among the listed columns (compared in each row) is used as the source.
The value for ``type`` specifies the type of uncertainty; possible options and further details are found in the API document of `ValueInfo`.


How to use own tables
Expand Down
54 changes: 31 additions & 23 deletions susy_cross_section/base/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@

from __future__ import absolute_import, division, print_function # py2

import itertools
import json
import logging
import pathlib # noqa: F401
import sys
from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Union
from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union

from susy_cross_section.utility import TypeCheck as TC

Expand All @@ -34,6 +33,8 @@
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

UncSpecType = Tuple[List[str], str]


class ColumnInfo(object):
"""Stores information of a column.
Expand Down Expand Up @@ -267,10 +268,12 @@ class ValueInfo(object):
Attributes
----------
column: str
Name of the column that stores this value.
column: str or List[str]
Names of the column that stores this value.
This must be match one of the :attr:`ColumnInfo.name` in the file.
The string, or each element of the list, must match one of the
:attr:`ColumnInfo.name` in the file. If multiple columns are specified,
the largest value among the columns (compared in each row) is used.
attributes: dict (str, Any)
Physical information annotated to this value.
unc_p : dict (str, str)
Expand All @@ -286,8 +289,8 @@ class ValueInfo(object):
- ``"absolute"`` for absolute uncertainty, where the unit of the column
must be the same as that of the value column up to a factor.
The unit of the uncertainty column should be consistent with the unit
of the value column.
- ``"absolute,signed"`` or ``"relative,signed"`` for absolute/relative
uncertainty but using the columns with correct sign.
unc_m : dict(str, str)
The sources of "minus" uncertainties.
Expand All @@ -300,25 +303,24 @@ def __init__(
self,
column="", # type: str
attributes=None, # type:MutableMapping[str, Any]
unc_p=None, # type: MutableMapping[str, str]
unc_m=None, # type: MutableMapping[str, str]
unc_p=None, # type: List[UncSpecType]
unc_m=None, # type: List[UncSpecType]
):
# type: (...)->None
self.column = column
self.attributes = attributes or {}
self.unc_p = unc_p or {}
self.unc_m = unc_m or {}
self.unc_p = unc_p or []
self.unc_m = unc_m or []

def validate(self):
    # type: ()->None
    """Validate the content.

    Checks that ``column`` is a non-empty string, that ``attributes`` is a
    dict keyed by strings, and that each entry of ``unc_p`` and ``unc_m``
    unpacks into ``(columns, type)`` where ``columns`` is a list of strings
    and ``type`` is one of ``_valid_uncertainty_types``.

    Raises
    ------
    AssertionError
        If any of the checks above fails.
    """
    assert isinstance(self.column, str), "ValueInfo.column must be string."
    assert self.column, "ValueInfo.column is missing."
    assert TC.is_dict(self.attributes, key_type=str), "attributes not dict[str]."
    # unc_p / unc_m are lists of (columns, type) pairs, so the former
    # dict-style checks (``TC.is_dict`` / ``.values()``) do not apply.
    # The two lists are walked one after the other instead of being merged
    # with ``[*a, *b]``, which is a syntax error on Python 2.
    for title, unc in [("unc+", self.unc_p), ("unc-", self.unc_m)]:
        for col, t in unc:
            assert TC.is_list(col, element_type=str), title + " column not list[str]."
            assert t in self._valid_uncertainty_types, "invalid unc type: %s" % t

@classmethod
def from_json(cls, json_obj):
Expand Down Expand Up @@ -355,13 +357,18 @@ def from_json(cls, json_obj):
if unc_def is None:
logger.warning("Uncertainty (%s) missing for %s.", key_name, obj.column)
continue
if not TC.is_list(unc_def, element_type=Mapping):
raise TypeError("%s (%s) is not a list of dicts.", key_name, obj.column)
assert TC.is_list(unc_def, Mapping), "bad %s/%s" % (key_name, obj.column)
try:
unc_dict = {source["column"]: source["type"] for source in unc_def}
setattr(obj, attr_name, unc_dict)
unc_list = [
(
src["column"] if TC.is_list(src["column"]) else [src["column"]],
src["type"],
)
for src in unc_def
]
except KeyError as e:
raise ValueError("%s missing in %s (%s)", key_name, obj.column, *e.args)
setattr(obj, attr_name, unc_list)

if not (obj.unc_p and obj.unc_m):
logger.warning("Value %s lacks uncertainties.", obj.column)
Expand All @@ -380,8 +387,8 @@ def to_json(self):
return {
"column": self.column,
"attributes": self.attributes,
"unc+": [{"column": k, "type": v} for k, v in self.unc_p.items()],
"unc-": [{"column": k, "type": v} for k, v in self.unc_m.items()],
"unc+": [{"column": c, "type": t} for c, t in self.unc_p],
"unc-": [{"column": c, "type": t} for c, t in self.unc_m],
}


Expand Down Expand Up @@ -458,8 +465,9 @@ def validate(self):
assert p.column in names_dict, "Unknown column name: %s" % p.column
for v in self.values:
assert v.column in names_dict, "Unknown column name: %s" % v.column
for u_col in itertools.chain(v.unc_p.keys(), v.unc_m.keys()):
assert u_col in names_dict, "Unknown column name: %s" % u_col
for col_list, _ in [*v.unc_p, *v.unc_m]:
for c in col_list:
assert c in names_dict, "Unknown column name: %s" % c

@classmethod
def load(cls, source):
Expand Down
116 changes: 60 additions & 56 deletions susy_cross_section/base/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
MutableMapping,
Optional,
Sequence,
Set,
TypeVar,
Union,
cast,
)

import pandas

from susy_cross_section.base.info import FileInfo
from susy_cross_section.base.info import FileInfo, UncSpecType, ValueInfo
from susy_cross_section.utility import Unit

if sys.version_info[0] < 3: # py2
Expand Down Expand Up @@ -171,68 +172,71 @@ def _parse_data(self):
# type: ()->MutableMapping[str, TableT]
"""Load and prepare data from the specified paths."""
tables = {} # type: MutableMapping[str, TableT]
for value_info in self.info.values:
name = value_info.column
value_unit = self.info.get_column(name).unit
parameters = self.info.parameters
data = self.raw_data.copy()

# set index by the quantized values
def quantize(data_frame, granularity):
# type: (pandas.DataFrame, float)->pandas.DataFrame
return (data_frame / granularity).apply(round) * granularity

for p in parameters:
if p.granularity:
data[p.column] = quantize(data[p.column], p.granularity)

data.set_index([p.column for p in parameters], inplace=True)

# define functions to apply to DataFrame to get uncertainty.
up_factors = self._uncertainty_factors(Unit(value_unit), value_info.unc_p)
um_factors = self._uncertainty_factors(Unit(value_unit), value_info.unc_m)

def unc_p(row, name=name, unc_sources=value_info.unc_p, factors=up_factors):
# type: (Any, str, Mapping[str, str], Mapping[str, float])->float
return self._combine_uncertainties(row, name, unc_sources, factors)

def unc_m(row, name=name, unc_sources=value_info.unc_m, factors=um_factors):
# type: (Any, str, Mapping[str, str], Mapping[str, float])->float
return self._combine_uncertainties(row, name, unc_sources, factors)
def calc(row, unc_sources, sign):
    # type: (pandas.Series, List[UncSpecType], int)->float
    """Return the quadrature sum of the uncertainty sources for one row.

    Each entry of ``unc_sources`` is a ``(columns, type)`` pair and
    contributes the largest absolute value found among its columns.  When
    the comma-separated ``type`` contains ``"signed"``, only the entries
    whose product with ``sign`` is positive are candidates; a source left
    without any candidate contributes zero.
    """
    squared_total = 0
    for columns, type_spec in unc_sources:
        signed_only = "signed" in type_spec.split(",")
        magnitudes = [
            abs(row[col])
            for col in columns
            # unsigned sources accept every column; signed ones only the
            # columns whose sign agrees with the requested direction
            if not signed_only or row[col] * sign > 0
        ]
        largest = max(magnitudes) if magnitudes else 0
        squared_total += largest ** 2
    return squared_total ** 0.5

for value_info in self.info.values:
name = value_info.column
data = self._prepare_normalized_data(value_info)
tables[name] = cast(TableT, BaseTable(file=self, name=name))
tables[name]["value"] = data[name]
tables[name]["unc+"] = data.apply(unc_p, axis=1)
tables[name]["unc-"] = data.apply(unc_m, axis=1)
for key, row in data.iterrows():
tables[name].loc[key, "unc+"] = calc(row, value_info.unc_p, +1)
tables[name].loc[key, "unc-"] = calc(row, value_info.unc_m, -1)

return tables

def _uncertainty_factors(self, value_unit, uncertainty_info):
    # type: (Unit, Mapping[str, str])->Mapping[str, float]
    """Return, per source column, its scale factor relative to the value column.

    For every ``(column, type)`` item of ``uncertainty_info`` the unit of the
    column is looked up through ``self.info.get_column``; for the
    ``"relative"`` type it is additionally multiplied by ``value_unit``.
    The stored factor is that unit divided by ``value_unit``, as a float.
    """
    result = {}
    for column, kind in uncertainty_info.items():
        unit = Unit(self.info.get_column(column).unit)
        if kind == "relative":
            unit *= value_unit
        # "number in the table" == unc / unit, while the caller wants
        # unc / value_unit == "number in the table" * unit / value_unit
        ratio = unit / value_unit
        result[column] = float(ratio)
    return result

@staticmethod
def _combine_uncertainties(row, value_name, unc_sources, factors):
# type: (Any, str, Mapping[str, str], Mapping[str, float])->float
"""Return absolute combined uncertainty."""
uncertainties = []
for name, typ in unc_sources.items():
if typ == "relative":
uncertainties.append(row[name] * factors[name] * row[value_name])
elif typ == "absolute":
uncertainties.append(row[name] * factors[name])
def _prepare_normalized_data(self, value_info):
    # type: (ValueInfo)->pandas.DataFrame
    """Quantize parameters and normalize columns to value_info.column.

    The returned frame is a copy of ``self.raw_data`` indexed by the
    parameter columns (quantized where a granularity is given).  Columns
    referenced by ``value_info.unc_p`` / ``value_info.unc_m`` are rescaled
    by the ratio of their unit to the unit of the value column; columns
    whose type contains ``"relative"`` are additionally multiplied by the
    value column.  Every other column except the value column is dropped.

    Raises
    ------
    AssertionError
        If a column is used both as an absolute and as a relative source.
    """
    data = self.raw_data.copy()

    def quantize(data_frame, granularity):
        # type: (pandas.DataFrame, float)->pandas.DataFrame
        return (data_frame / granularity).apply(round) * granularity

    # set index by the quantized values
    parameters = self.info.parameters
    for p in parameters:
        if p.granularity:
            data[p.column] = quantize(data[p.column], p.granularity)
    data.set_index([p.column for p in parameters], inplace=True)

    # collect columns to use
    abs_columns, rel_columns = set(), set()  # type: Set[str], Set[str]
    # Plain concatenation rather than ``[*a, *b]``, which is a syntax error
    # on Python 2.
    for unc_cols, unc_type in list(value_info.unc_p) + list(value_info.unc_m):
        is_relative = "relative" in unc_type.split(",")
        for c in unc_cols:
            (rel_columns if is_relative else abs_columns).add(c)
    overlap = abs_columns & rel_columns
    assert not overlap, "columns used as both absolute and relative: %s" % (
        sorted(overlap),
    )

    name = value_info.column
    value_unit = Unit(self.info.get_column(name).unit)
    # Iterate over a snapshot because unused columns are dropped in place.
    for col in list(data.columns):
        if col == name:
            continue
        if col in abs_columns:
            # unc / unc_unit == "number in the table"
            # we want to get "unc / value_unit"
            #   = "number in the table" * unc_unit / value_unit
            unc_unit = Unit(self.info.get_column(col).unit)
            data[col] = data[col] * float(unc_unit / value_unit)
        elif col in rel_columns:
            unc_unit = Unit(self.info.get_column(col).unit) * value_unit
            data[col] = data[name] * data[col] * float(unc_unit / value_unit)
        else:
            # neither the value nor one of its uncertainty sources
            data.drop(col, axis=1, inplace=True)
    return data

def validate(self):
# type: ()->None
Expand Down

0 comments on commit 0ec350f

Please sign in to comment.