Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

ENH: select column/coordinates/multiple with start/stop/selection #6177

Merged
merged 1 commit into from Feb 9, 2014
Jump to file or symbol
Failed to load files and symbols.
+155 −38
Split
View
@@ -60,6 +60,7 @@ API Changes
indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds
values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise
``IndexError`` (:issue:`6296`)
+- ``select_as_multiple`` will always raise a ``KeyError``, when a key or the selector is not found (:issue:`6177`)
Experimental Features
~~~~~~~~~~~~~~~~~~~~~
@@ -86,6 +87,9 @@ Bug Fixes
- Bug in conversion of string types to a DatetimeIndex with a specified frequency (:issue:`6273`, :issue:`6274`)
- Bug in ``eval`` where type-promotion failed for large expressions (:issue:`6205`)
- Bug in interpolate with inplace=True (:issue:`6281`)
+- ``HDFStore.remove`` now handles start and stop (:issue:`6177`)
+- ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`)
+- ``HDFStore.select_as_coordinates`` and ``select_column`` now work with where clauses that result in filters (:issue:`6177`)
pandas 0.13.1
-------------
View
@@ -724,8 +724,9 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
Exceptions
----------
- raise if any of the keys don't refer to tables or if they are not ALL
- THE SAME DIMENSIONS
+ raises KeyError if keys or selector is not found or keys is empty
+ raises TypeError if keys is not a list or tuple
+ raises ValueError if the tables are not ALL THE SAME DIMENSIONS
"""
# default to single select
@@ -748,12 +749,13 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
# collect the tables
tbls = [self.get_storer(k) for k in keys]
+ s = self.get_storer(selector)
# validate rows
nrows = None
- for t, k in zip(tbls, keys):
+ for t, k in itertools.chain([(s,selector)], zip(tbls, keys)):
if t is None:
- raise TypeError("Invalid table [%s]" % k)
+ raise KeyError("Invalid table [%s]" % k)
if not t.is_table:
raise TypeError(
"object [%s] is not a table, and cannot be used in all "
@@ -766,22 +768,17 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
raise ValueError(
"all tables must have exactly the same nrows!")
- # select coordinates from the selector table
- try:
- c = self.select_as_coordinates(
- selector, where, start=start, stop=stop)
- nrows = len(c)
- except Exception:
- raise ValueError("invalid selector [%s]" % selector)
+ # axis is the concatenation axis
+ axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
def func(_start, _stop):
-
- # collect the returns objs
- objs = [t.read(where=c[_start:_stop], columns=columns)
- for t in tbls]
-
- # axis is the concentation axes
- axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
+ if where is not None:
+ c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs)
+ else:
+ c = None
+
+ objs = [t.read(where=c, start=_start, stop=_stop,
+ columns=columns, **kwargs) for t in tbls]
# concat and return
return concat(objs, axis=axis,
@@ -860,7 +857,7 @@ def remove(self, key, where=None, start=None, stop=None):
raise KeyError('No object named %s in the file' % key)
# remove the node
- if where is None:
+ if where is None and start is None and stop is None:
s.group._f_remove(recursive=True)
# delete from the table
@@ -2139,11 +2136,9 @@ def write(self, **kwargs):
raise NotImplementedError(
"cannot write on an abstract storer: sublcasses should implement")
- def delete(self, where=None, **kwargs):
- """support fully deleting the node in its entirety (only) - where
- specification must be None
- """
- if where is None:
+ def delete(self, where=None, start=None, stop=None, **kwargs):
+ """ support fully deleting the node in its entirety (only) - where, start and stop must all be None """
+ if where is None and start is None and stop is None:
self._handle.removeNode(self.group, recursive=True)
return None
@@ -3381,9 +3376,15 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
# create the selection
self.selection = Selection(
self, where=where, start=start, stop=stop, **kwargs)
- return Index(self.selection.select_coords())
+ coords = self.selection.select_coords()
+ if self.selection.filter is not None:
+ for field, op, filt in self.selection.filter.format():
+ data = self.read_column(field, start=coords.min(), stop=coords.max()+1)
+ coords = coords[op(data.iloc[coords-coords.min()], filt).values]
- def read_column(self, column, where=None, **kwargs):
+ return Index(coords)
+
+ def read_column(self, column, where=None, start=None, stop=None, **kwargs):
"""return a single column from the table, generally only indexables
are interesting
"""
@@ -3411,7 +3412,7 @@ def read_column(self, column, where=None, **kwargs):
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
a.set_info(self.info)
- return Series(a.convert(c[:], nan_rep=self.nan_rep,
+ return Series(a.convert(c[start:stop], nan_rep=self.nan_rep,
encoding=self.encoding).take_data())
raise KeyError("column [%s] not found in the table" % column)
@@ -3712,12 +3713,19 @@ def write_data_chunk(self, indexes, mask, values):
except Exception as detail:
raise TypeError("tables cannot write this data -> %s" % detail)
- def delete(self, where=None, **kwargs):
+ def delete(self, where=None, start=None, stop=None, **kwargs):
# delete all rows (and return the nrows)
if where is None or not len(where):
- nrows = self.nrows
- self._handle.removeNode(self.group, recursive=True)
+ if start is None and stop is None:
+ nrows = self.nrows
+ self._handle.removeNode(self.group, recursive=True)
+ else:
+ # pytables<3.0 would remove a single row with stop=None
+ if stop is None:
+ stop = self.nrows
+ nrows = self.table.removeRows(start=start, stop=stop)
+ self.table.flush()
return nrows
# infer the data kind
@@ -3726,7 +3734,7 @@ def delete(self, where=None, **kwargs):
# create the selection
table = self.table
- self.selection = Selection(self, where, **kwargs)
+ self.selection = Selection(self, where, start=start, stop=stop, **kwargs)
values = self.selection.select_coords()
# delete the rows in reverse order
@@ -4303,13 +4311,25 @@ def select_coords(self):
"""
generate the selection
"""
- if self.condition is None:
- return np.arange(self.table.nrows)
+ start, stop = self.start, self.stop
+ nrows = self.table.nrows
+ if start is None:
+ start = 0
+ elif start < 0:
+ start += nrows
+ if self.stop is None:
+ stop = nrows
+ elif stop < 0:
+ stop += nrows
- return self.table.table.getWhereList(self.condition.format(),
- start=self.start, stop=self.stop,
- sort=True)
+ if self.condition is not None:
+ return self.table.table.getWhereList(self.condition.format(),
+ start=start, stop=stop,
+ sort=True)
+ elif self.coordinates is not None:
+ return self.coordinates
+ return np.arange(start, stop)
# utilities ###
@@ -2195,6 +2195,69 @@ def test_remove_where(self):
# self.assertRaises(ValueError, store.remove,
# 'wp2', [('column', ['A', 'D'])])
+ def test_remove_startstop(self):
+ # GH #4835 and #6177
+
+ with ensure_clean_store(self.path) as store:
+
+ wp = tm.makePanel()
+
+ # start
+ store.put('wp1', wp, format='t')
+ n = store.remove('wp1', start=32)
+ #assert(n == 120-32)
+ result = store.select('wp1')
+ expected = wp.reindex(major_axis=wp.major_axis[:32//4])
+ assert_panel_equal(result, expected)
+
+ store.put('wp2', wp, format='t')
+ n = store.remove('wp2', start=-32)
+ #assert(n == 32)
+ result = store.select('wp2')
+ expected = wp.reindex(major_axis=wp.major_axis[:-32//4])
+ assert_panel_equal(result, expected)
+
+ # stop
+ store.put('wp3', wp, format='t')
+ n = store.remove('wp3', stop=32)
+ #assert(n == 32)
+ result = store.select('wp3')
+ expected = wp.reindex(major_axis=wp.major_axis[32//4:])
+ assert_panel_equal(result, expected)
+
+ store.put('wp4', wp, format='t')
+ n = store.remove('wp4', stop=-32)
+ #assert(n == 120-32)
+ result = store.select('wp4')
+ expected = wp.reindex(major_axis=wp.major_axis[-32//4:])
+ assert_panel_equal(result, expected)
+
+ # start and stop
+ store.put('wp5', wp, format='t')
+ n = store.remove('wp5', start=16, stop=-16)
+ #assert(n == 120-32)
+ result = store.select('wp5')
+ expected = wp.reindex(major_axis=wp.major_axis[:16//4]+wp.major_axis[-16//4:])
+ assert_panel_equal(result, expected)
+
+ store.put('wp6', wp, format='t')
+ n = store.remove('wp6', start=16, stop=16)
+ #assert(n == 0)
+ result = store.select('wp6')
+ expected = wp.reindex(major_axis=wp.major_axis)
+ assert_panel_equal(result, expected)
+
+ # with where
+ date = wp.major_axis.take(np.arange(0,30,3))
+ crit = Term('major_axis=date')
+ store.put('wp7', wp, format='t')
+ n = store.remove('wp7', where=[crit], stop=80)
+ #assert(n == 28)
+ result = store.select('wp7')
+ expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)])
+ assert_panel_equal(result, expected)
+
+
def test_remove_crit(self):
with ensure_clean_store(self.path) as store:
@@ -3449,6 +3512,25 @@ def f():
result = store.select_column('df3', 'string')
tm.assert_almost_equal(result.values, df3['string'].values)
+ # start/stop
+ result = store.select_column('df3', 'string', start=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[2:])
+
+ result = store.select_column('df3', 'string', start=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[-2:])
+
+ result = store.select_column('df3', 'string', stop=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[:2])
+
+ result = store.select_column('df3', 'string', stop=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[:-2])
+
+ result = store.select_column('df3', 'string', start=2, stop=-2)
+ tm.assert_almost_equal(result.values, df3['string'].values[2:-2])
+
+ result = store.select_column('df3', 'string', start=-2, stop=2)
+ tm.assert_almost_equal(result.values, df3['string'].values[-2:2])
+
def test_coordinates(self):
df = tm.makeTimeDataFrame()
@@ -3519,6 +3601,12 @@ def test_coordinates(self):
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5)
self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10)
+ # selection with filter
+ selection = date_range('20000101',periods=500)
+ result = store.select('df', where='index in selection')
+ expected = df[df.index.isin(selection)]
+ tm.assert_frame_equal(result,expected)
+
# list
df = DataFrame(np.random.randn(10,2))
store.append('df2',df)
@@ -3533,6 +3621,11 @@ def test_coordinates(self):
expected = df.loc[where]
tm.assert_frame_equal(result,expected)
+ # start/stop
+ result = store.select('df2', start=5, stop=10)
+ expected = df[5:10]
+ tm.assert_frame_equal(result,expected)
+
def test_append_to_multiple(self):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
@@ -3603,11 +3696,11 @@ def test_select_as_multiple(self):
None, where=['A>0', 'B>0'], selector='df1')
self.assertRaises(Exception, store.select_as_multiple,
[None], where=['A>0', 'B>0'], selector='df1')
- self.assertRaises(TypeError, store.select_as_multiple,
+ self.assertRaises(KeyError, store.select_as_multiple,
['df1','df3'], where=['A>0', 'B>0'], selector='df1')
self.assertRaises(KeyError, store.select_as_multiple,
['df3'], where=['A>0', 'B>0'], selector='df1')
- self.assertRaises(ValueError, store.select_as_multiple,
+ self.assertRaises(KeyError, store.select_as_multiple,
['df1','df2'], where=['A>0', 'B>0'], selector='df4')
# default select