diff --git a/changelog/28.improvement.md b/changelog/28.improvement.md new file mode 100644 index 0000000..1beec6a --- /dev/null +++ b/changelog/28.improvement.md @@ -0,0 +1,2 @@ +[pandas_openscm.index_manipulation.update_levels_from_other][] now supports updating levels based on multiple other levels from the index at once (see the docstring for examples). +This update also propagates to [pandas_openscm.index_manipulation.update_index_levels_from_other_func][] and [pandas_openscm.accessors.dataframe.PandasDataFrameOpenSCMAccessor.update_index_levels_from_other][]. diff --git a/src/pandas_openscm/accessors/dataframe.py b/src/pandas_openscm/accessors/dataframe.py index 4344721..8e2f1e8 100644 --- a/src/pandas_openscm/accessors/dataframe.py +++ b/src/pandas_openscm/accessors/dataframe.py @@ -838,7 +838,17 @@ def update_index_levels( def update_index_levels_from_other( self, update_sources: dict[ - Any, tuple[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]] + Any, + tuple[ + Any, + Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any], + ] + | tuple[ + tuple[Any, ...], + Callable[[tuple[Any, ...]], Any] + | dict[tuple[Any, ...], Any] + | pd.Series[Any], + ], ], copy: bool = True, remove_unused_levels: bool = True, @@ -849,18 +859,29 @@ def update_index_levels_from_other( Parameters ---------- update_sources - Updates to apply to `df`'s index + Updates to apply to the data's index Each key is the level to which the updates will be applied (or the level that will be created if it doesn't already exist). - Each value is a tuple of which the first element + There are two options for the values. + + The first is used when only one level is used to update the 'target level'. + In this case, each value is a tuple of which the first element is the level to use to generate the values (the 'source level') and the second is mapper of the form used by [pd.Index.map][pandas.Index.map] which will be applied to the source level to update/create the level of interest. 
def create_new_level_and_codes_by_mapping_multiple(
    ini: pd.MultiIndex,
    levels_to_create_from: tuple[str, ...],
    mapper: Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any],
) -> tuple[pd.Index[Any], npt.NDArray[np.integer[Any]]]:
    """
    Create a new level and associated codes by mapping multiple existing levels

    This is a thin function intended for internal use
    to handle some slightly tricky logic.

    Parameters
    ----------
    ini
        Input index

    levels_to_create_from
        Levels to create the new level from

        The values handed to `mapper` are tuples
        whose elements follow the order given here,
        irrespective of the order in which the levels appear in `ini`.
        A tuple is passed even if only one level is given.

    mapper
        Function, dictionary or series to use to map
        the tuples of existing level values to values of the new level

        This is passed to the `map` method of a
        [pd.MultiIndex][pandas.MultiIndex]
        built from `levels_to_create_from`.

    Returns
    -------
    new_level :
        New level

        The unique mapped values, in order of first appearance.

    new_codes :
        New codes

        For each row of `ini`, the position of its mapped value in `new_level`.
    """
    # Build the source index explicitly from the requested levels
    # rather than dropping the levels we don't want.
    # Dropping levels keeps whatever order the levels have in `ini`,
    # which means that the tuples seen by `mapper`
    # would not match the order the caller asked for
    # (and dict/series mappers keyed in the caller's order would not match).
    source_values = pd.MultiIndex.from_arrays(
        [ini.get_level_values(level) for level in levels_to_create_from],
        names=list(levels_to_create_from),
    )

    # You could probably do some optimisation here
    # that checks for unique combinations of codes
    # for the levels we're using,
    # then only applies the mapping to those unique combos
    # to reduce the number of evaluations of mapper.
    # That feels tricky to get right, so just doing the brute force way for now.
    mapped = source_values.map(mapper)

    # Brute force: derive the level from the mapped values,
    # then look up each row's position in that level to get the codes.
    new_level = mapped.unique()
    new_codes = new_level.get_indexer(mapped)

    return new_level, new_codes
@@ -629,7 +697,17 @@ def update_index_levels_from_other_func( def update_levels_from_other( ini: pd.MultiIndex, update_sources: dict[ - Any, tuple[Any, Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any]] + Any, + tuple[ + Any, + Callable[[Any], Any] | dict[Any, Any] | pd.Series[Any], + ] + | tuple[ + tuple[Any, ...], + Callable[[tuple[Any, ...]], Any] + | dict[tuple[Any, ...], Any] + | pd.Series[Any], + ], ], remove_unused_levels: bool = True, ) -> pd.MultiIndex: @@ -650,13 +728,24 @@ def update_levels_from_other( Each key is the level to which the updates will be applied (or the level that will be created if it doesn't already exist). - Each value is a tuple of which the first element + There are two options for the values. + + The first is used when only one level is used to update the 'target level'. + In this case, each value is a tuple of which the first element is the level to use to generate the values (the 'source level') and the second is mapper of the form used by [pd.Index.map][pandas.Index.map] which will be applied to the source level to update/create the level of interest. + Each value is a tuple of which the first element + is the level or levels (if a tuple) + to use to generate the values (the 'source level') + and the second is mapper of the form used by + [pd.Index.map][pandas.Index.map] + which will be applied to the source level + to update/create the level of interest. + remove_unused_levels Call `ini.remove_unused_levels` before updating the levels @@ -718,6 +807,19 @@ def update_levels_from_other( ('sa', 'model sa', 'v2', 'km')], names=['scenario', 'model', 'variable', 'unit']) >>> + >>> # Create a new level based on multiple existing levels + >>> update_levels_from_other( + ... start, + ... { + ... "model || scenario": (("model", "scenario"), lambda x: " || ".join(x)), + ... }, + ... 
) + MultiIndex([('sa', 'ma', 'v1', 'kg', 'sa || ma'), + ('sb', 'ma', 'v2', 'm', 'sb || ma'), + ('sa', 'mb', 'v1', 'kg', 'sa || mb'), + ('sa', 'mb', 'v2', 'm', 'sa || mb')], + names=['scenario', 'model', 'variable', 'unit', 'model || scenario']) + >>> >>> # Both at the same time >>> update_levels_from_other( ... start, @@ -731,7 +833,28 @@ def update_levels_from_other( ('sa', 'mb', 'v1', nan, 'Sa'), ('sa', 'mb', 'v2', nan, 'Sa')], names=['scenario', 'model', 'variable', 'unit', 'title']) - """ + >>> + >>> # Setting with a range of different methods + >>> update_levels_from_other( + ... start, + ... { + ... # callable + ... "y-label": (("variable", "unit"), lambda x: f"{x[0]} ({x[1]})"), + ... # dict + ... "title": ("scenario", {"sa": "Scenario A", "sb": "Delta"}), + ... # pd.Series + ... "Source": ( + ... "model", + ... pd.Series(["Internal", "External"], index=["ma", "mb"]), + ... ), + ... }, + ... ) + MultiIndex([('sa', 'ma', 'v1', 'kg', 'v1 (kg)', 'Scenario A', 'Internal'), + ('sb', 'ma', 'v2', 'm', 'v2 (m)', 'Delta', 'Internal'), + ('sa', 'mb', 'v1', 'kg', 'v1 (kg)', 'Scenario A', 'External'), + ('sa', 'mb', 'v2', 'm', 'v2 (m)', 'Scenario A', 'External')], + names=['scenario', 'model', 'variable', 'unit', 'y-label', 'title', 'Source']) + """ # noqa: E501 if remove_unused_levels: ini = ini.remove_unused_levels() # type: ignore @@ -740,17 +863,35 @@ def update_levels_from_other( names: list[str] = list(ini.names) for level, (source, updater) in update_sources.items(): - if source not in ini.names: - msg = ( - f"{source} is not available in the index. Available levels: {ini.names}" + if isinstance(source, tuple): + missing_levels = set(source) - set(ini.names) + if missing_levels: + conj = "is" if len(missing_levels) == 1 else "are" + msg = ( + f"{sorted(missing_levels)} {conj} not available in the index. 
" + f"Available levels: {ini.names}" + ) + raise KeyError(msg) + + new_level, new_codes = create_new_level_and_codes_by_mapping_multiple( + ini=ini, + levels_to_create_from=source, + mapper=updater, ) - raise KeyError(msg) - new_level, new_codes = create_new_level_and_codes_by_mapping( - ini=ini, - level_to_create_from=source, - mapper=updater, - ) + else: + if source not in ini.names: + msg = ( + f"{source} is not available in the index. " + f"Available levels: {ini.names}" + ) + raise KeyError(msg) + + new_level, new_codes = create_new_level_and_codes_by_mapping( + ini=ini, + level_to_create_from=source, + mapper=updater, + ) if level in ini.names: level_idx = ini.names.index(level) diff --git a/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels_from_other.py b/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels_from_other.py index b320414..ff82f05 100644 --- a/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels_from_other.py +++ b/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels_from_other.py @@ -143,14 +143,62 @@ }, id="multiple-updates-incl-external-func", ), + pytest.param( + pd.MultiIndex.from_tuples( + [ + ("sa", "va", "kg", 0), + ("sb", "vb", "m", -1), + ("sa", "va", "kg", -2), + ("sa", "vb", "kg", 2), + ], + names=["scenario", "variable", "unit", "run_id"], + ), + { + "vv": (("scenario", "variable"), lambda x: " - ".join(x)), + "sv": ( + ("scenario", "variable"), + { + ("sa", "va"): "hi", + ("sb", "vb"): "bye", + ("sa", "vb"): "psi", + }, + ), + "su": ( + ("scenario", "unit"), + pd.Series( + ["alpha", "beta"], + index=pd.MultiIndex.from_tuples( + [ + ("sa", "kg"), + ("sb", "m"), + ], + names=["scenario", "unit"], + ), + ), + ), + "unit": ("unit", lambda x: x.replace("kg", "g").replace("m", "km")), + "u_run_id_abs": ( + ("unit", "run_id"), + lambda x: f"{x[0]}_{np.abs(x[1])}", + ), + }, + 
id="multiple-updates-multiple-sources-incl-dict-series-external-func", + ), ), ) def test_update_levels_from_other(start, update_sources): res = update_levels_from_other(start, update_sources=update_sources) - exp = start.to_frame(index=False) + # Need this so we order of updates doesn't matter + helper = start.to_frame(index=False) + exp = helper.copy() for level, (source, mapper) in update_sources.items(): - exp[level] = exp[source].map(mapper) + if isinstance(source, tuple): + exp[level] = pd.MultiIndex.from_frame(helper[list(source)]).map(mapper) + + else: + exp[level] = helper[source].map(mapper) + exp = pd.MultiIndex.from_frame(exp) pd.testing.assert_index_equal(res, exp) @@ -181,6 +229,50 @@ def test_update_levels_from_other_missing_level(): update_levels_from_other(start, update_sources=update_sources) +@pytest.mark.parametrize( + "sources, exp", + ( + ( + ("units", "variable"), + pytest.raises( + KeyError, + match=re.escape( + f"{sorted(set(['units']))} is not available in the index. " + f"Available levels: {['scenario', 'variable', 'unit', 'run_id']}" + ), + ), + ), + ( + ("units", "variables"), + pytest.raises( + KeyError, + match=re.escape( + f"{sorted(set(['units', 'variables']))} " + "are not available in the index. 
" + f"Available levels: {['scenario', 'variable', 'unit', 'run_id']}" + ), + ), + ), + ), +) +def test_update_levels_from_other_missing_levels(sources, exp): + start = pd.MultiIndex.from_tuples( + [ + ("sa", "va", "kg", 0), + ("sb", "vb", "m", -1), + ("sa", "va", "kg", -2), + ("sa", "vb", "kg", 2), + ], + names=["scenario", "variable", "unit", "run_id"], + ) + update_sources = { + "uu": (sources, lambda x: x), + } + + with exp: + update_levels_from_other(start, update_sources=update_sources) + + def test_doesnt_trip_over_droped_levels(setup_pandas_accessors): def update_func(in_v: int) -> int: if in_v < 0: @@ -272,8 +364,24 @@ def test_accessor(setup_pandas_accessors): ) update_sources = { + # callables single source "vv": ("variable", lambda x: x.replace("v", "vv")), "unit": ("unit", lambda x: x.replace("kg", "g").replace("m", "km")), + # callables multi source + "y-label": (("variable", "unit"), lambda x: f"{x[0]} ({x[1]})"), + # dict + "title": ("scenario", {"sa": "Scenario A", "sb": "Delta"}), + # pd.Series + "Source": ( + ("scenario", "variable"), + pd.Series( + ["Internal", "External", "External"], + index=pd.MultiIndex.from_tuples( + [("sa", "va"), ("sb", "vb"), ("sa", "vb")], + names=["scenario", "variable"], + ), + ), + ), } exp = pd.DataFrame( @@ -281,12 +389,23 @@ def test_accessor(setup_pandas_accessors): columns=start.columns, index=pd.MultiIndex.from_tuples( [ - ("sa", "va", "g", 0, "vva"), - ("sb", "vb", "km", -1, "vvb"), - ("sa", "va", "g", -2, "vva"), - ("sa", "vb", "g", 2, "vvb"), + # Updates not done sequentially + # hence y-label uses units from original data + ("sa", "va", "g", 0, "vva", "va (kg)", "Scenario A", "Internal"), + ("sb", "vb", "km", -1, "vvb", "vb (m)", "Delta", "External"), + ("sa", "va", "g", -2, "vva", "va (kg)", "Scenario A", "Internal"), + ("sa", "vb", "g", 2, "vvb", "vb (kg)", "Scenario A", "External"), + ], + names=[ + "scenario", + "variable", + "unit", + "run_id", + "vv", + "y-label", + "title", + "Source", ], - 
names=["scenario", "variable", "unit", "run_id", "vv"], ), )