Make pd.read_hdf('data.h5') work when the stored pandas object contains categorical columns #13359

Closed · wants to merge 7 commits
Changes from 5 commits

3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
@@ -374,3 +374,6 @@ Bug Fixes


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)

- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset that had one or more categorical columns failed unless the ``key`` argument was set to the name of the dataset (:issue:`13231`)
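For context (not part of the diff): a minimal sketch of the behavior this entry describes, assuming a table-format file with a categorical column; the file name ``data.h5`` is just a placeholder.

    import pandas as pd

    df = pd.DataFrame({'i': range(5),
                       'c': pd.Series(list('abacd'), dtype='category')})
    df.to_hdf('data.h5', 'df', mode='w', format='table')

    # Before this fix, the categorical column's metadata groups made the store
    # look like it held several datasets, so read_hdf() without a key raised
    # ValueError; with the fix, the single dataset is inferred automatically.
    reread = pd.read_hdf('data.h5')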

30 changes: 25 additions & 5 deletions pandas/io/pytables.py
@@ -331,11 +331,17 @@ def read_hdf(path_or_buf, key=None, **kwargs):

    try:
        if key is None:
            keys = store.keys()
            if len(keys) != 1:
                raise ValueError('key must be provided when HDF file contains '
                                 'multiple datasets.')
            key = keys[0]
            groups = store.groups()
            candidate_only_group = groups[0]
Contributor: I think this will fail on an empty store (and the error it raises will be odd, e.g. IndexError).
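One possible guard, sketched here rather than taken from this diff, would be to check for an empty group list before indexing (the exact message is illustrative):

            groups = store.groups()
            if len(groups) == 0:
                raise ValueError('No dataset in HDF5 file.')
            candidate_only_group = groups[0]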

            # For the HDF file to have only one dataset, all other groups
Contributor: blank line

            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError('key must be provided when HDF file '
                                     'contains multiple datasets.')
            key = candidate_only_group._v_pathname
        return store.select(key, auto_close=auto_close, **kwargs)
    except:
        # if there is an error, close the store
@@ -347,6 +353,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
        raise


def _is_metadata_of(group, parent_group):
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == 'meta':
            return True
        current = current._v_parent
    return False


class HDFStore(StringMixin):

    """
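As an aside (not part of the diff), a small sketch of the group layout that ``_is_metadata_of`` reasons about; the file name is a placeholder, and the exact metadata paths depend on how pandas lays out categorical metadata in PyTables:

    import pandas as pd

    df = pd.DataFrame({'c': pd.Series(list('abacd'), dtype='category')})
    with pd.HDFStore('layout_demo.h5', mode='w') as store:
        store.put('df', df, format='table')
        groups = store.groups()
        # Typically the parent group (/df) is enumerated first, followed by
        # its metadata groups (e.g. nodes under /df/meta/...), which is the
        # ordering the key-inference loop in read_hdf relies on.
        print([g._v_pathname for g in groups])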
17 changes: 15 additions & 2 deletions pandas/io/tests/test_pytables.py
@@ -46,8 +46,8 @@

from distutils.version import LooseVersion

_default_compressor = LooseVersion(tables.__version__) >= '2.2' \
    and 'blosc' or 'zlib'
_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2'
                       else 'zlib')

_multiprocess_can_split_ = False

@@ -4877,13 +4877,26 @@ def test_read_nokey(self):
        df = DataFrame(np.random.rand(4, 5),
                       index=list('abcd'),
                       columns=list('ABCDE'))
        # Categorical dtype not supported for "fixed" format. So no need
Contributor: blank line
        # to test with that dtype in the dataframe here.
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', mode='a')
            reread = read_hdf(path)
            assert_frame_equal(df, reread)
            df.to_hdf(path, 'df2', mode='a')
            self.assertRaises(ValueError, read_hdf, path)

    def test_read_nokey_table(self):
        # GH13231
        df = DataFrame({'i': range(5),
                        'c': Series(list('abacd'), dtype='category')})
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', mode='a', format='table')
            reread = read_hdf(path)
            assert_frame_equal(df, reread)
            df.to_hdf(path, 'df2', mode='a', format='table')
Contributor: blank line

Contributor (author): Do you really mean "add a blank line before the last assert (assertRaises) of test_read_nokey_table()"? Because then that with-block would not look the same as the pretty much identical one in the very similar test_read_nokey(). I'm assuming for consistency's sake that you meant a blank line before the with-block (like for test_read_nokey()). If not, please let me know.

            self.assertRaises(ValueError, read_hdf, path)

    def test_read_from_pathlib_path(self):

        # GH11773