Skip to content

Commit

Permalink
BUG: HDFStore.append with encoded string itemsize
Browse files Browse the repository at this point in the history
Failure came when the maximum length of the unencoded string
was smaller than the maximum encoded length.
  • Loading branch information
TomAugspurger committed Oct 5, 2015
1 parent 0cd6734 commit 59eee79
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 5 deletions.
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Expand Up @@ -42,3 +42,10 @@ Performance Improvements

Bug Fixes
~~~~~~~~~


- Bug in ``HDFStore.append`` with strings whose encoded length exceeded the maximum unencoded length (:issue:`11234`)




8 changes: 3 additions & 5 deletions pandas/io/pytables.py
Expand Up @@ -1860,7 +1860,8 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
)

# itemsize is the maximum length of a string (along any dimension)
itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))
data_converted = _convert_string_array(data, encoding)
itemsize = data_converted.itemsize

# specified min_itemsize?
if isinstance(min_itemsize, dict):
Expand All @@ -1877,10 +1878,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
self.itemsize = itemsize
self.kind = 'string'
self.typ = self.get_atom_string(block, itemsize)
self.set_data(self.convert_string_data(data, itemsize, encoding))

def convert_string_data(self, data, itemsize, encoding):
return _convert_string_array(data, encoding, itemsize)
self.set_data(data_converted.astype('|S%d' % itemsize, copy=False))

def get_atom_coltype(self, kind=None):
""" return the PyTables column class for this column """
Expand Down
16 changes: 16 additions & 0 deletions pandas/io/tests/test_pytables.py
Expand Up @@ -4292,6 +4292,22 @@ def f():

compat_assert_produces_warning(PerformanceWarning, f)


def test_unicode_longer_encoded(self):
# GH 11234
char = '\u0394'
df = pd.DataFrame({'A': [char]})
with ensure_clean_store(self.path) as store:
store.put('df', df, format='table', encoding='utf-8')
result = store.get('df')
tm.assert_frame_equal(result, df)

df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']})
with ensure_clean_store(self.path) as store:
store.put('df', df, format='table', encoding='utf-8')
result = store.get('df')
tm.assert_frame_equal(result, df)

def test_store_datetime_mixed(self):

df = DataFrame(
Expand Down

0 comments on commit 59eee79

Please sign in to comment.