Merge pull request #998 from lsst/tickets/DM-43925

DM-43925: Add workarounds for pandas bugs when using non-floating-point masked columns.
lsst · Apr 18, 2024 · fd58971 · fd58971
2 parents 1223572 + e620aae
commit fd58971
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 6 deletions.
diff --git a/doc/changes/DM-43925.bugfix.md b/doc/changes/DM-43925.bugfix.md
@@ -0,0 +1 @@
+Work around pandas bugs when using non-floating-point masked columns.
diff --git a/python/lsst/daf/butler/configs/storageClasses.yaml b/python/lsst/daf/butler/configs/storageClasses.yaml
@@ -124,7 +124,7 @@ storageClasses:
     pytype: pandas.core.frame.DataFrame
     converters:
       pyarrow.Table: lsst.daf.butler.formatters.parquet.arrow_to_pandas
-      astropy.table.Table: astropy.table.Table.to_pandas
+      astropy.table.Table: lsst.daf.butler.formatters.parquet.astropy_to_pandas
       numpy.ndarray: pandas.DataFrame.from_records
       dict: pandas.DataFrame.from_records
     delegate: lsst.daf.butler.delegates.dataframe.DataFrameDelegate

diff --git a/python/lsst/daf/butler/formatters/parquet.py b/python/lsst/daf/butler/formatters/parquet.py
@@ -36,6 +36,7 @@
     "pandas_to_arrow",
     "pandas_to_astropy",
     "astropy_to_arrow",
+    "astropy_to_pandas",
     "numpy_to_arrow",
     "numpy_to_astropy",
     "numpy_dict_to_arrow",
@@ -492,6 +493,34 @@ def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
     return arrow_table
 
 
+def astropy_to_pandas(astropy_table: atable.Table, index: str | None = None) -> pd.DataFrame:
+    """Convert an astropy table to a pandas dataframe via arrow.
+
+    By going via arrow we avoid pandas masked column bugs (e.g.
+    https://github.com/pandas-dev/pandas/issues/58173)
+
+    Parameters
+    ----------
+    astropy_table : `astropy.Table`
+        Input astropy table.
+    index : `str`, optional
+        Name of column to set as index.
+
+    Returns
+    -------
+    dataframe : `pandas.DataFrame`
+        Output pandas dataframe.
+    """
+    dataframe = arrow_to_pandas(astropy_to_arrow(astropy_table))
+
+    if isinstance(index, str):
+        dataframe = dataframe.set_index(index)
+    elif index:
+        raise RuntimeError("index must be a string or None.")
+
+    return dataframe
+
+
 def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]:
     """Convert an astropy table to an arrow table.
 
@@ -560,12 +589,11 @@ def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
         Converted astropy table.
     """
     import pandas as pd
-    from astropy.table import Table
 
     if isinstance(dataframe.columns, pd.MultiIndex):
         raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")
 
-    return Table.from_pandas(dataframe, index=True)
+    return arrow_to_astropy(pandas_to_arrow(dataframe))
 
 
 def _pandas_to_numpy_dict(dataframe: pd.DataFrame) -> dict[str, np.ndarray]:
@@ -1073,7 +1101,7 @@ def _arrow_string_to_numpy_dtype(
         # String/bytes length from header.
         strlen = int(schema.metadata[encoded])
     elif numpy_column is not None and len(numpy_column) > 0:
-        strlen = max(len(row) for row in numpy_column)
+        strlen = max([len(row) for row in numpy_column if row])
 
     dtype = f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"
 

diff --git a/tests/test_parquet.py b/tests/test_parquet.py
@@ -96,10 +96,12 @@
         arrow_to_numpy_dict,
         arrow_to_pandas,
         astropy_to_arrow,
+        astropy_to_pandas,
         compute_row_group_size,
         numpy_dict_to_arrow,
         numpy_to_arrow,
         pandas_to_arrow,
+        pandas_to_astropy,
     )
 except ImportError:
     pa = None
@@ -200,6 +202,7 @@ def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
         df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
         df["mstrcol"] = pd.array(np.array(["text"] * nrow))
         df.loc[1, ["m1", "m2", "mstrcol"]] = None
+        df.loc[0, "m1"] = 1649900760361600113
 
     if include_lists:
         nrow = len(df)
@@ -273,6 +276,7 @@ def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, includ
         # Masked 64-bit integer.
         arr = np.arange(nrow, dtype="i8")
         arr[mask] = -1
+        arr[0] = 1649900760361600113
         table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
         # Masked 32-bit float.
         arr = np.arange(nrow, dtype="f4")
@@ -555,7 +559,7 @@ def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
         self.butler.put(df1, self.datasetType, dataId={})
 
         tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
-        tab2_df = tab2.to_pandas(index="index")
+        tab2_df = astropy_to_pandas(tab2, index="index")
 
         self.assertTrue(df1.columns.equals(tab2_df.columns))
         for name in tab2_df.columns:
@@ -584,6 +588,23 @@ def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
         # This test simply checks that it's readable, but definitely not
         # recommended.
 
+    @unittest.skipUnless(atable is not None, "Cannot test writing as astropy without astropy.")
+    def testWriteAstropyTableWithMaskedColsReadAsSingleIndexDataFrame(self):
+        tab1 = _makeSimpleAstropyTable(include_masked=True)
+
+        self.butler.put(tab1, self.datasetType, dataId={})
+
+        tab2 = self.butler.get(self.datasetType, dataId={})
+
+        tab1_df = astropy_to_pandas(tab1)
+        self.assertTrue(tab1_df.equals(tab2))
+
+        tab2_astropy = pandas_to_astropy(tab2)
+        for col in tab1.dtype.names:
+            np.testing.assert_array_equal(tab2_astropy[col], tab1[col])
+            if isinstance(tab1[col], atable.column.MaskedColumn):
+                np.testing.assert_array_equal(tab2_astropy[col].mask, tab1[col].mask)
+
     @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
     def testWriteSingleIndexDataFrameReadAsArrowTable(self):
         df1, allColumns = _makeSingleIndexDataFrame()
@@ -967,7 +988,7 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
 
         tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
 
-        tab1_df = tab1.to_pandas()
+        tab1_df = astropy_to_pandas(tab1)
 
         self.assertTrue(tab1_df.columns.equals(tab2.columns))
         for name in tab2.columns:
@@ -984,6 +1005,18 @@ def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
             else:
                 self.assertTrue(col1.equals(col2))
 
+    @unittest.skipUnless(pd is not None, "Cannot test writing as a dataframe without pandas.")
+    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
+        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)
+
+        self.butler.put(df1, self.datasetType, dataId={})
+
+        tab2 = self.butler.get(self.datasetType, dataId={})
+
+        df1_tab = pandas_to_astropy(df1)
+
+        self._checkAstropyTableEquality(df1_tab, tab2)
+
     @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
     def testWriteAstropyReadAsNumpyTable(self):
         tab1 = _makeSimpleAstropyTable()