Skip to content

Commit

Permalink
Merge pull request #998 from lsst/tickets/DM-43925
Browse files Browse the repository at this point in the history
DM-43925: Add workarounds for pandas bugs when using non-floating-point masked columns.
  • Loading branch information
erykoff committed Apr 18, 2024
2 parents 1223572 + e620aae commit fd58971
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-43925.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Work around pandas bugs when using non-floating-point masked columns.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/configs/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ storageClasses:
pytype: pandas.core.frame.DataFrame
converters:
pyarrow.Table: lsst.daf.butler.formatters.parquet.arrow_to_pandas
astropy.table.Table: astropy.table.Table.to_pandas
astropy.table.Table: lsst.daf.butler.formatters.parquet.astropy_to_pandas
numpy.ndarray: pandas.DataFrame.from_records
dict: pandas.DataFrame.from_records
delegate: lsst.daf.butler.delegates.dataframe.DataFrameDelegate
Expand Down
34 changes: 31 additions & 3 deletions python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"pandas_to_arrow",
"pandas_to_astropy",
"astropy_to_arrow",
"astropy_to_pandas",
"numpy_to_arrow",
"numpy_to_astropy",
"numpy_dict_to_arrow",
Expand Down Expand Up @@ -492,6 +493,34 @@ def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
return arrow_table


def astropy_to_pandas(astropy_table: atable.Table, index: str | None = None) -> pd.DataFrame:
"""Convert an astropy table to a pandas dataframe via arrow.
By going via arrow we avoid pandas masked column bugs (e.g.
https://github.com/pandas-dev/pandas/issues/58173)
Parameters
----------
astropy_table : `astropy.Table`
Input astropy table.
index : `str`, optional
Name of column to set as index.
Returns
-------
dataframe : `pandas.DataFrame`
Output pandas dataframe.
"""
dataframe = arrow_to_pandas(astropy_to_arrow(astropy_table))

if isinstance(index, str):
dataframe = dataframe.set_index(index)
elif index:
raise RuntimeError("index must be a string or None.")

return dataframe


def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]:
"""Convert an astropy table to an arrow table.
Expand Down Expand Up @@ -560,12 +589,11 @@ def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
Converted astropy table.
"""
import pandas as pd
from astropy.table import Table

if isinstance(dataframe.columns, pd.MultiIndex):
raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

return Table.from_pandas(dataframe, index=True)
return arrow_to_astropy(pandas_to_arrow(dataframe))


def _pandas_to_numpy_dict(dataframe: pd.DataFrame) -> dict[str, np.ndarray]:
Expand Down Expand Up @@ -1073,7 +1101,7 @@ def _arrow_string_to_numpy_dtype(
# String/bytes length from header.
strlen = int(schema.metadata[encoded])
elif numpy_column is not None and len(numpy_column) > 0:
strlen = max(len(row) for row in numpy_column)
strlen = max([len(row) for row in numpy_column if row])

dtype = f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"

Expand Down
37 changes: 35 additions & 2 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,12 @@
arrow_to_numpy_dict,
arrow_to_pandas,
astropy_to_arrow,
astropy_to_pandas,
compute_row_group_size,
numpy_dict_to_arrow,
numpy_to_arrow,
pandas_to_arrow,
pandas_to_astropy,
)
except ImportError:
pa = None
Expand Down Expand Up @@ -200,6 +202,7 @@ def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
df["mstrcol"] = pd.array(np.array(["text"] * nrow))
df.loc[1, ["m1", "m2", "mstrcol"]] = None
df.loc[0, "m1"] = 1649900760361600113

if include_lists:
nrow = len(df)
Expand Down Expand Up @@ -273,6 +276,7 @@ def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, includ
# Masked 64-bit integer.
arr = np.arange(nrow, dtype="i8")
arr[mask] = -1
arr[0] = 1649900760361600113
table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
# Masked 32-bit float.
arr = np.arange(nrow, dtype="f4")
Expand Down Expand Up @@ -555,7 +559,7 @@ def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
self.butler.put(df1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
tab2_df = tab2.to_pandas(index="index")
tab2_df = astropy_to_pandas(tab2, index="index")

self.assertTrue(df1.columns.equals(tab2_df.columns))
for name in tab2_df.columns:
Expand Down Expand Up @@ -584,6 +588,23 @@ def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
# This test simply checks that it's readable, but definitely not
# recommended.

@unittest.skipUnless(atable is not None, "Cannot test writing as astropy without astropy.")
def testWriteAstropyTableWithMaskedColsReadAsSingleIndexDataFrame(self):
tab1 = _makeSimpleAstropyTable(include_masked=True)

self.butler.put(tab1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={})

tab1_df = astropy_to_pandas(tab1)
self.assertTrue(tab1_df.equals(tab2))

tab2_astropy = pandas_to_astropy(tab2)
for col in tab1.dtype.names:
np.testing.assert_array_equal(tab2_astropy[col], tab1[col])
if isinstance(tab1[col], atable.column.MaskedColumn):
np.testing.assert_array_equal(tab2_astropy[col].mask, tab1[col].mask)

@unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
def testWriteSingleIndexDataFrameReadAsArrowTable(self):
df1, allColumns = _makeSingleIndexDataFrame()
Expand Down Expand Up @@ -967,7 +988,7 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):

tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

tab1_df = tab1.to_pandas()
tab1_df = astropy_to_pandas(tab1)

self.assertTrue(tab1_df.columns.equals(tab2.columns))
for name in tab2.columns:
Expand All @@ -984,6 +1005,18 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
else:
self.assertTrue(col1.equals(col2))

@unittest.skipUnless(pd is not None, "Cannot test writing as a dataframe without pandas.")
def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

self.butler.put(df1, self.datasetType, dataId={})

tab2 = self.butler.get(self.datasetType, dataId={})

df1_tab = pandas_to_astropy(df1)

self._checkAstropyTableEquality(df1_tab, tab2)

@unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
def testWriteAstropyReadAsNumpyTable(self):
tab1 = _makeSimpleAstropyTable()
Expand Down

0 comments on commit fd58971

Please sign in to comment.