Skip to content

Commit

Permalink
Merge pull request #748 from lsst/tickets/DM-36795
Browse files Browse the repository at this point in the history
DM-36795: Fix pandas dataframe to arrow crash with partially nulled string columns.
  • Loading branch information
erykoff committed Oct 29, 2022
2 parents 422d82b + 1a8c4f1 commit eeb583e
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/changes/DM-36795.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug in the pandas DataFrame to Arrow table conversion that caused a crash when a string column contained a mix of null and non-null values.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Tab
if not name.startswith("__"):
if arrow_table[name].type == pa.string():
if len(arrow_table[name]) > 0:
strlen = max(len(row.as_py()) for row in arrow_table[name])
strlen = max(len(row.as_py()) for row in arrow_table[name] if row.is_valid)
else:
strlen = default_length
md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)
Expand Down
26 changes: 26 additions & 0 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,32 @@ def testMultiIndexDataFrame(self):
with self.assertRaises(ValueError):
self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

def testSingleIndexDataFrameEmptyString(self):
    """Round-trip a single-index dataframe whose string column holds a
    null entry.
    """
    original, _ = _makeSingleIndexDataFrame()

    # Replace the "strcol" value at index label 1 with None.
    original.at[1, "strcol"] = None

    self.butler.put(original, self.datasetType, dataId={})

    # Fetch the complete DataFrame back and require it to equal what was
    # stored.
    roundtripped = self.butler.get(self.datasetType, dataId={})
    self.assertTrue(original.equals(roundtripped))

def testSingleIndexDataFrameAllEmptyStrings(self):
    """Round-trip a single-index dataframe after assigning None to the
    string column from index label 0 onward.
    """
    original, _ = _makeSingleIndexDataFrame()

    # Overwrite the "strcol" values selected by the label slice ``0:``
    # with None.
    original.loc[0:, "strcol"] = None

    self.butler.put(original, self.datasetType, dataId={})

    # Fetch the complete DataFrame back and require it to equal what was
    # stored.
    roundtripped = self.butler.get(self.datasetType, dataId={})
    self.assertTrue(original.equals(roundtripped))

def testLegacyDataFrame(self):
"""Test writing a dataframe to parquet via pandas (without additional
metadata) and ensure that we can read it back with all the new
Expand Down

0 comments on commit eeb583e

Please sign in to comment.