diff --git a/doc/source/release.rst b/doc/source/release.rst index d3814ab324e92..4291ed1b6c357 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -60,6 +60,7 @@ API Changes indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise ``IndexError`` (:issue:`6296`) +- ``select_as_multiple`` will always raise a ``KeyError`` when a key or the selector is not found (:issue:`6177`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -86,6 +87,9 @@ Bug Fixes - Bug in conversion of a string types to a DatetimeIndex with a specified frequency (:issue:`6273`, :issue:`6274`) - Bug in ``eval`` where type-promotion failed for large expressions (:issue:`6205`) - Bug in interpolate with inplace=True (:issue:`6281`) +- ``HDFStore.remove`` now handles start and stop (:issue:`6177`) +- ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`) +- ``HDFStore.select_as_coordinates`` and ``select_column`` now work with where clauses that result in filters (:issue:`6177`) pandas 0.13.1 ------------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d0dd292adfe67..85a9cf4ea0f9f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -724,8 +724,9 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, Exceptions ---------- - raise if any of the keys don't refer to tables or if they are not ALL - THE SAME DIMENSIONS + raises KeyError if keys or selector is not found or keys is empty + raises TypeError if keys is not a list or tuple + raises ValueError if the tables are not ALL THE SAME DIMENSIONS """ # default to single select @@ -748,12 +749,13 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, # collect the tables tbls = [self.get_storer(k) for k in keys] + s = self.get_storer(selector) # validate rows nrows = None 
- for t, k in zip(tbls, keys): + for t, k in itertools.chain([(s,selector)], zip(tbls, keys)): if t is None: - raise TypeError("Invalid table [%s]" % k) + raise KeyError("Invalid table [%s]" % k) if not t.is_table: raise TypeError( "object [%s] is not a table, and cannot be used in all " @@ -766,22 +768,17 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, raise ValueError( "all tables must have exactly the same nrows!") - # select coordinates from the selector table - try: - c = self.select_as_coordinates( - selector, where, start=start, stop=stop) - nrows = len(c) - except Exception: - raise ValueError("invalid selector [%s]" % selector) + # axis is the concentation axes + axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] def func(_start, _stop): - - # collect the returns objs - objs = [t.read(where=c[_start:_stop], columns=columns) - for t in tbls] - - # axis is the concentation axes - axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] + if where is not None: + c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) + else: + c = None + + objs = [t.read(where=c, start=_start, stop=_stop, + columns=columns, **kwargs) for t in tbls] # concat and return return concat(objs, axis=axis, @@ -860,7 +857,7 @@ def remove(self, key, where=None, start=None, stop=None): raise KeyError('No object named %s in the file' % key) # remove the node - if where is None: + if where is None and start is None and stop is None: s.group._f_remove(recursive=True) # delete from the table @@ -2139,11 +2136,9 @@ def write(self, **kwargs): raise NotImplementedError( "cannot write on an abstract storer: sublcasses should implement") - def delete(self, where=None, **kwargs): - """support fully deleting the node in its entirety (only) - where - specification must be None - """ - if where is None: + def delete(self, where=None, start=None, stop=None, **kwargs): + """ support fully deleting the node in its entirety (only) - where 
specification must be None """ + if where is None and start is None and stop is None: self._handle.removeNode(self.group, recursive=True) return None @@ -3381,9 +3376,15 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): # create the selection self.selection = Selection( self, where=where, start=start, stop=stop, **kwargs) - return Index(self.selection.select_coords()) + coords = self.selection.select_coords() + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): + data = self.read_column(field, start=coords.min(), stop=coords.max()+1) + coords = coords[op(data.iloc[coords-coords.min()], filt).values] - def read_column(self, column, where=None, **kwargs): + return Index(coords) + + def read_column(self, column, where=None, start=None, stop=None, **kwargs): """return a single column from the table, generally only indexables are interesting """ @@ -3411,7 +3412,7 @@ def read_column(self, column, where=None, **kwargs): # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(a.convert(c[:], nan_rep=self.nan_rep, + return Series(a.convert(c[start:stop], nan_rep=self.nan_rep, encoding=self.encoding).take_data()) raise KeyError("column [%s] not found in the table" % column) @@ -3712,12 +3713,19 @@ def write_data_chunk(self, indexes, mask, values): except Exception as detail: raise TypeError("tables cannot write this data -> %s" % detail) - def delete(self, where=None, **kwargs): + def delete(self, where=None, start=None, stop=None, **kwargs): # delete all rows (and return the nrows) if where is None or not len(where): - nrows = self.nrows - self._handle.removeNode(self.group, recursive=True) + if start is None and stop is None: + nrows = self.nrows + self._handle.removeNode(self.group, recursive=True) + else: + # pytables<3.0 would remove a single row with stop=None + if stop is None: + stop = self.nrows + nrows = 
self.table.removeRows(start=start, stop=stop) + self.table.flush() return nrows # infer the data kind @@ -3726,7 +3734,7 @@ def delete(self, where=None, **kwargs): # create the selection table = self.table - self.selection = Selection(self, where, **kwargs) + self.selection = Selection(self, where, start=start, stop=stop, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -4303,13 +4311,25 @@ def select_coords(self): """ generate the selection """ - if self.condition is None: - return np.arange(self.table.nrows) + start, stop = self.start, self.stop + nrows = self.table.nrows + if start is None: + start = 0 + elif start < 0: + start += nrows + if self.stop is None: + stop = nrows + elif stop < 0: + stop += nrows - return self.table.table.getWhereList(self.condition.format(), - start=self.start, stop=self.stop, - sort=True) + if self.condition is not None: + return self.table.table.getWhereList(self.condition.format(), + start=start, stop=stop, + sort=True) + elif self.coordinates is not None: + return self.coordinates + return np.arange(start, stop) # utilities ### diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3c8e40fb1566a..3c5662a6fe268 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2195,6 +2195,69 @@ def test_remove_where(self): # self.assertRaises(ValueError, store.remove, # 'wp2', [('column', ['A', 'D'])]) + def test_remove_startstop(self): + # GH #4835 and #6177 + + with ensure_clean_store(self.path) as store: + + wp = tm.makePanel() + + # start + store.put('wp1', wp, format='t') + n = store.remove('wp1', start=32) + #assert(n == 120-32) + result = store.select('wp1') + expected = wp.reindex(major_axis=wp.major_axis[:32//4]) + assert_panel_equal(result, expected) + + store.put('wp2', wp, format='t') + n = store.remove('wp2', start=-32) + #assert(n == 32) + result = store.select('wp2') + expected = 
wp.reindex(major_axis=wp.major_axis[:-32//4]) + assert_panel_equal(result, expected) + + # stop + store.put('wp3', wp, format='t') + n = store.remove('wp3', stop=32) + #assert(n == 32) + result = store.select('wp3') + expected = wp.reindex(major_axis=wp.major_axis[32//4:]) + assert_panel_equal(result, expected) + + store.put('wp4', wp, format='t') + n = store.remove('wp4', stop=-32) + #assert(n == 120-32) + result = store.select('wp4') + expected = wp.reindex(major_axis=wp.major_axis[-32//4:]) + assert_panel_equal(result, expected) + + # start n stop + store.put('wp5', wp, format='t') + n = store.remove('wp5', start=16, stop=-16) + #assert(n == 120-32) + result = store.select('wp5') + expected = wp.reindex(major_axis=wp.major_axis[:16//4]+wp.major_axis[-16//4:]) + assert_panel_equal(result, expected) + + store.put('wp6', wp, format='t') + n = store.remove('wp6', start=16, stop=16) + #assert(n == 0) + result = store.select('wp6') + expected = wp.reindex(major_axis=wp.major_axis) + assert_panel_equal(result, expected) + + # with where + date = wp.major_axis.take(np.arange(0,30,3)) + crit = Term('major_axis=date') + store.put('wp7', wp, format='t') + n = store.remove('wp7', where=[crit], stop=80) + #assert(n == 28) + result = store.select('wp7') + expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) + assert_panel_equal(result, expected) + + def test_remove_crit(self): with ensure_clean_store(self.path) as store: @@ -3449,6 +3512,25 @@ def f(): result = store.select_column('df3', 'string') tm.assert_almost_equal(result.values, df3['string'].values) + # start/stop + result = store.select_column('df3', 'string', start=2) + tm.assert_almost_equal(result.values, df3['string'].values[2:]) + + result = store.select_column('df3', 'string', start=-2) + tm.assert_almost_equal(result.values, df3['string'].values[-2:]) + + result = store.select_column('df3', 'string', stop=2) + tm.assert_almost_equal(result.values, df3['string'].values[:2]) + + result 
= store.select_column('df3', 'string', stop=-2) + tm.assert_almost_equal(result.values, df3['string'].values[:-2]) + + result = store.select_column('df3', 'string', start=2, stop=-2) + tm.assert_almost_equal(result.values, df3['string'].values[2:-2]) + + result = store.select_column('df3', 'string', start=-2, stop=2) + tm.assert_almost_equal(result.values, df3['string'].values[-2:2]) + def test_coordinates(self): df = tm.makeTimeDataFrame() @@ -3519,6 +3601,12 @@ def test_coordinates(self): self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5) self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10) + # selection with filter + selection = date_range('20000101',periods=500) + result = store.select('df', where='index in selection') + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result,expected) + # list df = DataFrame(np.random.randn(10,2)) store.append('df2',df) @@ -3533,6 +3621,11 @@ def test_coordinates(self): expected = df.loc[where] tm.assert_frame_equal(result,expected) + # start/stop + result = store.select('df2', start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result,expected) + def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) @@ -3603,11 +3696,11 @@ def test_select_as_multiple(self): None, where=['A>0', 'B>0'], selector='df1') self.assertRaises(Exception, store.select_as_multiple, [None], where=['A>0', 'B>0'], selector='df1') - self.assertRaises(TypeError, store.select_as_multiple, + self.assertRaises(KeyError, store.select_as_multiple, ['df1','df3'], where=['A>0', 'B>0'], selector='df1') self.assertRaises(KeyError, store.select_as_multiple, ['df3'], where=['A>0', 'B>0'], selector='df1') - self.assertRaises(ValueError, store.select_as_multiple, + self.assertRaises(KeyError, store.select_as_multiple, ['df1','df2'], where=['A>0', 'B>0'], selector='df4') # default select