Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-43925: Add workarounds for pandas bugs when using non-floating-point masked columns. #998

Merged
merged 7 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/DM-43925.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Work around pandas bugs when using non-floating-point masked columns.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/configs/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ storageClasses:
pytype: pandas.core.frame.DataFrame
converters:
pyarrow.Table: lsst.daf.butler.formatters.parquet.arrow_to_pandas
astropy.table.Table: astropy.table.Table.to_pandas
astropy.table.Table: lsst.daf.butler.formatters.parquet.astropy_to_pandas
numpy.ndarray: pandas.DataFrame.from_records
dict: pandas.DataFrame.from_records
delegate: lsst.daf.butler.delegates.dataframe.DataFrameDelegate
Expand Down
34 changes: 31 additions & 3 deletions python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"pandas_to_arrow",
"pandas_to_astropy",
"astropy_to_arrow",
"astropy_to_pandas",
"numpy_to_arrow",
"numpy_to_astropy",
"numpy_dict_to_arrow",
Expand Down Expand Up @@ -492,6 +493,34 @@
return arrow_table


def astropy_to_pandas(astropy_table: atable.Table, index: str | None = None) -> pd.DataFrame:
"""Convert an astropy table to a pandas dataframe via arrow.

By going via arrow we avoid pandas masked column bugs (e.g.
https://github.com/pandas-dev/pandas/issues/58173)

Parameters
----------
astropy_table : `astropy.Table`
Input astropy table.
index : `str`, optional
Name of column to set as index.

Returns
-------
dataframe : `pandas.DataFrame`
Output pandas dataframe.
"""
dataframe = arrow_to_pandas(astropy_to_arrow(astropy_table))

if isinstance(index, str):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this raise if it's not a string or None? If not, what about just if index:?

dataframe = dataframe.set_index(index)
elif index:
raise RuntimeError("index must be a string or None.")

Check warning on line 519 in python/lsst/daf/butler/formatters/parquet.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/formatters/parquet.py#L519

Added line #L519 was not covered by tests

return dataframe


def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]:
"""Convert an astropy table to an arrow table.

Expand Down Expand Up @@ -560,12 +589,11 @@
Converted astropy table.
"""
import pandas as pd
from astropy.table import Table

if isinstance(dataframe.columns, pd.MultiIndex):
raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

return Table.from_pandas(dataframe, index=True)
return arrow_to_astropy(pandas_to_arrow(dataframe))


def _pandas_to_numpy_dict(dataframe: pd.DataFrame) -> dict[str, np.ndarray]:
Expand Down Expand Up @@ -1073,7 +1101,7 @@
# String/bytes length from header.
strlen = int(schema.metadata[encoded])
elif numpy_column is not None and len(numpy_column) > 0:
strlen = max(len(row) for row in numpy_column)
strlen = max([len(row) for row in numpy_column if row])

dtype = f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"

Expand Down
37 changes: 35 additions & 2 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,12 @@
arrow_to_numpy_dict,
arrow_to_pandas,
astropy_to_arrow,
astropy_to_pandas,
compute_row_group_size,
numpy_dict_to_arrow,
numpy_to_arrow,
pandas_to_arrow,
pandas_to_astropy,
)
except ImportError:
pa = None
Expand Down Expand Up @@ -200,6 +202,7 @@ def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
df["mstrcol"] = pd.array(np.array(["text"] * nrow))
df.loc[1, ["m1", "m2", "mstrcol"]] = None
df.loc[0, "m1"] = 1649900760361600113

if include_lists:
nrow = len(df)
Expand Down Expand Up @@ -273,6 +276,7 @@ def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, includ
# Masked 64-bit integer.
arr = np.arange(nrow, dtype="i8")
arr[mask] = -1
arr[0] = 1649900760361600113
table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
# Masked 32-bit float.
arr = np.arange(nrow, dtype="f4")
Expand Down Expand Up @@ -555,7 +559,7 @@ def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
self.butler.put(df1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
tab2_df = tab2.to_pandas(index="index")
tab2_df = astropy_to_pandas(tab2, index="index")

self.assertTrue(df1.columns.equals(tab2_df.columns))
for name in tab2_df.columns:
Expand Down Expand Up @@ -584,6 +588,23 @@ def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
# This test simply checks that it's readable, but definitely not
# recommended.

@unittest.skipUnless(atable is not None, "Cannot test writing as astropy without astropy.")
def testWriteAstropyTableWithMaskedColsReadAsSingleIndexDataFrame(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is quite the mouthful... though I don't have a better suggestion.

tab1 = _makeSimpleAstropyTable(include_masked=True)

self.butler.put(tab1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={})

tab1_df = astropy_to_pandas(tab1)
self.assertTrue(tab1_df.equals(tab2))

tab2_astropy = pandas_to_astropy(tab2)
for col in tab1.dtype.names:
np.testing.assert_array_equal(tab2_astropy[col], tab1[col])
if isinstance(tab1[col], atable.column.MaskedColumn):
np.testing.assert_array_equal(tab2_astropy[col].mask, tab1[col].mask)

@unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
def testWriteSingleIndexDataFrameReadAsArrowTable(self):
df1, allColumns = _makeSingleIndexDataFrame()
Expand Down Expand Up @@ -967,7 +988,7 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):

tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

tab1_df = tab1.to_pandas()
tab1_df = astropy_to_pandas(tab1)

self.assertTrue(tab1_df.columns.equals(tab2.columns))
for name in tab2.columns:
Expand All @@ -984,6 +1005,18 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
else:
self.assertTrue(col1.equals(col2))

@unittest.skipUnless(pd is not None, "Cannot test writing as a dataframe without pandas.")
def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

self.butler.put(df1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={})

df1_tab = pandas_to_astropy(df1)

self._checkAstropyTableEquality(df1_tab, tab2)

@unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
def testWriteAstropyReadAsNumpyTable(self):
tab1 = _makeSimpleAstropyTable()
Expand Down