Skip to content

Commit

Permalink
Refactor string length checking on dataframe->arrow conversion.
Browse files Browse the repository at this point in the history
  • Loading branch information
erykoff committed Oct 28, 2022
1 parent ee902a2 commit 83bb006
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
2 changes: 2 additions & 0 deletions doc/changes/DM-36775.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix a bug in the pandas DataFrame-to-Arrow conversion that caused a crash for some pandas object datatypes.

15 changes: 7 additions & 8 deletions python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,21 +357,20 @@ def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Tab
-------
arrow_table : `pyarrow.Table`
"""
import numpy as np
import pandas as pd

arrow_table = pa.Table.from_pandas(dataframe)

# Update the metadata
md = arrow_table.schema.metadata

md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

if not isinstance(dataframe.columns, pd.MultiIndex):
for name in dataframe.columns:
if dataframe[name].dtype.type is np.object_:
if len(dataframe[name].values) > 0:
strlen = max(len(row) for row in dataframe[name].values)
# Loop over the arrow table columns (rather than the dataframe columns)
# because their datatypes have already been checked and converted from
# pandas objects.
for name in arrow_table.column_names:
if not name.startswith('__'):
if arrow_table[name].type == pa.string():
if len(arrow_table[name]) > 0:
strlen = max(len(row.as_py()) for row in arrow_table[name])
else:
strlen = default_length
md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)
Expand Down

0 comments on commit 83bb006

Please sign in to comment.