Skip to content

Commit

Permalink
Update _standardize_multi_index_columns to be faster for dataframe del…
Browse files Browse the repository at this point in the history
…egates.
  • Loading branch information
erykoff committed Apr 19, 2023
1 parent 220d528 commit 9707e0e
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 9 deletions.
7 changes: 3 additions & 4 deletions python/lsst/daf/butler/delegates/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
from __future__ import annotations

import collections.abc
from typing import Any, Mapping, Optional
import itertools
from typing import Any, Mapping, Optional, Sequence

import pandas
import pyarrow as pa
from lsst.daf.butler import StorageClassDelegate
from lsst.daf.butler.formatters.parquet import DataFrameSchema
from lsst.utils.introspection import get_full_type_name
Expand Down Expand Up @@ -112,9 +112,8 @@ def handleParameters(

if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
# We have a multi-index dataframe which needs special handling.
arrow_table = pa.Table.from_pandas(inMemoryDataset)
readColumns = _standardize_multi_index_columns(
arrow_table.schema,
inMemoryDataset.columns,
parameters["columns"],
stringify=False,
)
Expand Down
12 changes: 7 additions & 5 deletions python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@ def read(self, component: Optional[str] = None) -> Any:
f"Column {par_column} specified in parameters not available in parquet file."
)
else:
par_columns = _standardize_multi_index_columns(schema, par_columns)
par_columns = _standardize_multi_index_columns(
arrow_schema_to_pandas_index(schema),
par_columns,
)

if len(self.fileDescriptor.parameters):
raise ValueError(
Expand Down Expand Up @@ -895,7 +898,7 @@ def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequen


def _standardize_multi_index_columns(
schema: pa.Schema,
pd_index: pd.MultiIndex,
columns: Any,
stringify: bool = True,
) -> list[str | Sequence[Any]]:
Expand All @@ -904,8 +907,8 @@ def _standardize_multi_index_columns(
Parameters
----------
schema : `pyarrow.Schema`
Pyarrow schema.
pd_index : `pandas.MultiIndex`
Pandas multi-index.
columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
Columns to standardize.
stringify : `bool`, optional
Expand All @@ -916,7 +919,6 @@ def _standardize_multi_index_columns(
names : `list` [`str`]
Stringified representation of a multi-index column name.
"""
pd_index = arrow_schema_to_pandas_index(schema)
index_level_names = tuple(pd_index.names)

names: list[str | Sequence[Any]] = []
Expand Down

0 comments on commit 9707e0e

Please sign in to comment.