Skip to content

Commit

Permalink
Merge pull request #828 from lsst/tickets/DM-38845
Browse files Browse the repository at this point in the history
DM-38845: Fix row group computation crash if a list in a DataFrame is serialized to parquet.
  • Loading branch information
erykoff committed Apr 25, 2023
2 parents 9b39f79 + 83e8218 commit 8698bcb
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-38845.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add check for ListType when pandas converts a list object into parquet.
12 changes: 11 additions & 1 deletion python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,9 +1311,19 @@ def compute_row_group_size(schema: pa.Schema, target_size: int = TARGET_ROW_GROU
# Assuming UTF-8 encoding, and very few wide characters.
t_width = 8 * strlen
elif isinstance(t, pa.FixedSizeListType):
t_width = t.list_size * t.value_type.bit_width
if t.value_type == pa.null():
t_width = 0
else:
t_width = t.list_size * t.value_type.bit_width
elif t == pa.null():
t_width = 0
elif isinstance(t, pa.ListType):
if t.value_type == pa.null():
t_width = 0
else:
# This is a variable length list, just choose
# something arbitrary.
t_width = 10 * t.value_type.bit_width
else:
t_width = t.bit_width

Expand Down
32 changes: 31 additions & 1 deletion tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,15 @@ def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
return data


def _makeSingleIndexDataFrame(include_masked=False):
def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
"""Make a single index data frame for testing.
Parameters
----------
include_masked : `bool`
Include masked columns.
include_lists : `bool`
Include list columns.
Returns
-------
Expand All @@ -172,6 +174,13 @@ def _makeSingleIndexDataFrame(include_masked=False):
df["mstrcol"] = pd.array(np.array(["text"] * nrow))
df.loc[1, ["m1", "m2", "mstrcol"]] = None

if include_lists:
nrow = len(df)

df["l1"] = [[0, 0]] * nrow
df["l2"] = [[0.0, 0.0]] * nrow
df["l3"] = [[]] * nrow

allColumns = df.columns.append(pd.Index(df.index.names))

return df, allColumns
Expand Down Expand Up @@ -309,6 +318,19 @@ def testSingleIndexDataFrame(self):
with self.assertRaises(ValueError):
self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

def testSingleIndexDataFrameWithLists(self):
df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

self.butler.put(df1, self.datasetType, dataId={})
# Read the whole DataFrame.
df2 = self.butler.get(self.datasetType, dataId={})

# We need to check the list columns specially because they go
# from lists to arrays.
for col in ["l1", "l2", "l3"]:
for i in range(len(df1)):
self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

def testMultiIndexDataFrame(self):
df1 = _makeMultiIndexDataFrame()

Expand Down Expand Up @@ -1754,6 +1776,14 @@ def testRowGroupSizeTinyTable(self):

self.assertGreater(row_group_size, 1_000_000)

@unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
def testRowGroupSizeDataFrameWithLists(self):
df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
arrowTable = pandas_to_arrow(df)
row_group_size = compute_row_group_size(arrowTable.schema)

self.assertGreater(row_group_size, 1_000_000)


if __name__ == "__main__":
unittest.main()

0 comments on commit 8698bcb

Please sign in to comment.