Merge pull request #748 from lsst/tickets/DM-36795

DM-36795: Fix pandas dataframe to arrow crash with partially nulled string columns.
lsst · Oct 29, 2022 · eeb583e · eeb583e
2 parents 422d82b + 1a8c4f1
commit eeb583e
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 1 deletion.
diff --git a/doc/changes/DM-36795.bugfix.md b/doc/changes/DM-36795.bugfix.md
@@ -0,0 +1 @@
+Fix a bug in the pandas DataFrame to Arrow conversion that caused a crash on string columns containing some null values.
diff --git a/python/lsst/daf/butler/formatters/parquet.py b/python/lsst/daf/butler/formatters/parquet.py
@@ -370,7 +370,7 @@ def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Tab
         if not name.startswith("__"):
             if arrow_table[name].type == pa.string():
                 if len(arrow_table[name]) > 0:
-                    strlen = max(len(row.as_py()) for row in arrow_table[name])
+                    strlen = max(len(row.as_py()) for row in arrow_table[name] if row.is_valid)
                 else:
                     strlen = default_length
                 md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

diff --git a/tests/test_parquet.py b/tests/test_parquet.py
@@ -261,6 +261,32 @@ def testMultiIndexDataFrame(self):
         with self.assertRaises(ValueError):
             self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})
 
+    def testSingleIndexDataFrameEmptyString(self):
+        """Test persisting a single index dataframe with a null string."""
+        df1, _ = _makeSingleIndexDataFrame()
+
+        # Set one of the strings to None
+        df1.at[1, "strcol"] = None
+
+        self.butler.put(df1, self.datasetType, dataId={})
+        # Read the whole DataFrame.
+        df2 = self.butler.get(self.datasetType, dataId={})
+        self.assertTrue(df1.equals(df2))
+
+    def testSingleIndexDataFrameAllEmptyStrings(self):
+        """Test persisting a single index dataframe with an all-null string
+        column.
+        """
+        df1, _ = _makeSingleIndexDataFrame()
+
+        # Set all of the strings to None
+        df1.loc[0:, "strcol"] = None
+
+        self.butler.put(df1, self.datasetType, dataId={})
+        # Read the whole DataFrame.
+        df2 = self.butler.get(self.datasetType, dataId={})
+        self.assertTrue(df1.equals(df2))
+
     def testLegacyDataFrame(self):
         """Test writing a dataframe to parquet via pandas (without additional
         metadata) and ensure that we can read it back with all the new