Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

ENH: In HDFStore, add select_column method, deprecate unique method #3256

Merged
merged 1 commit into from Apr 4, 2013
Jump to file or symbol
Failed to load files and symbols.
+43 −22
Split
View
@@ -163,6 +163,11 @@ pandas 0.11.0
when invalid shapes are passed
- Methods return None when inplace=True (GH1893_)
+ - ``HDFStore``
+
+ - added the method ``select_column`` to select a single column from a table as a Series.
+ - deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``
+
**Bug Fixes**
- Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill``
View
@@ -1352,16 +1352,17 @@ then the ``nrows`` of the table are considered.
Advanced Queries
~~~~~~~~~~~~~~~~
-**Unique**
+**Select a Single Column**
-To retrieve the *unique* values of an indexable or data column, use the
-method ``unique``. This will, for example, enable you to get the index
-very quickly. Note ``nan`` are excluded from the result set.
+To retrieve a single indexable or data column, use the
+method ``select_column``. This will, for example, enable you to get the index
+very quickly. This returns a ``Series`` of the result, indexed by the row number.
+It does not currently accept the ``where`` selector (coming soon).
.. ipython:: python
- store.unique('df_dc', 'index')
- store.unique('df_dc', 'string')
+ store.select_column('df_dc', 'index')
+ store.select_column('df_dc', 'string')
**Replicating or**
View
@@ -226,6 +226,10 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT`
API changes
~~~~~~~~~~~
+ - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.
+
+ - In ``HDFStore``, deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``
+
Enhancements
~~~~~~~~~~~~
View
@@ -423,8 +423,13 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
def unique(self, key, column, **kwargs):
+ warnings.warn("unique(key,column) is deprecated\n"
+ "use select_column(key,column).unique() instead")
+ return self.get_storer(key).read_column(column = column, **kwargs).unique()
+
+ def select_column(self, key, column, **kwargs):
"""
- return a single column uniquely from the table. This is generally only useful to select an indexable
+ return a single column from the table. This is generally only useful to select an indexable
Parameters
----------
@@ -2525,7 +2530,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
return Coordinates(self.selection.select_coords(), group=self.group, where=where)
- def read_column(self, column, **kwargs):
+ def read_column(self, column, where = None, **kwargs):
""" return a single column from the table, generally only indexables are interesting """
# validate the version
@@ -2535,6 +2540,9 @@ def read_column(self, column, **kwargs):
if not self.infer_axes():
return False
+ if where is not None:
+ raise Exception("read_column does not currently accept a where clause")
+
# find the axes
for a in self.axes:
if column == a.name:
@@ -2544,7 +2552,7 @@ def read_column(self, column, **kwargs):
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
- return Categorical.from_array(a.convert(c[:], nan_rep=self.nan_rep).take_data()).levels
+ return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data())
raise KeyError("column [%s] not found in the table" % column)
@@ -2068,6 +2068,7 @@ def test_string_select(self):
expected = df2[isnull(df2.x)]
assert_frame_equal(result,expected)
+
# int ==/!=
df['int'] = 1
df.ix[2:7,'int'] = 2
@@ -2083,42 +2084,44 @@ def test_string_select(self):
assert_frame_equal(result,expected)
- def test_unique(self):
+ def test_read_column(self):
df = tm.makeTimeDataFrame()
- def check(x, y):
- self.assert_((np.unique(x) == np.unique(y)).all() == True)
-
with ensure_clean(self.path) as store:
store.remove('df')
store.append('df', df)
# error
- self.assertRaises(KeyError, store.unique, 'df', 'foo')
+ self.assertRaises(KeyError, store.select_column, 'df', 'foo')
+
+ def f():
+ store.select_column('df', 'index', where = ['index>5'])
+ self.assertRaises(Exception, f)
# valid
- result = store.unique('df', 'index')
- check(result.values, df.index.values)
-
+ result = store.select_column('df', 'index')
+ tm.assert_almost_equal(result.values, Series(df.index).values)
+ self.assert_(isinstance(result,Series))
+
# not a data indexable column
self.assertRaises(
- ValueError, store.unique, 'df', 'values_block_0')
+ ValueError, store.select_column, 'df', 'values_block_0')
# a data column
df2 = df.copy()
df2['string'] = 'foo'
store.append('df2', df2, data_columns=['string'])
- result = store.unique('df2', 'string')
- check(result.values, df2['string'].unique())
+ result = store.select_column('df2', 'string')
+ tm.assert_almost_equal(result.values, df2['string'].values)
# a data column with NaNs, result includes the NaNs
df3 = df.copy()
df3['string'] = 'foo'
df3.ix[4:6, 'string'] = np.nan
store.append('df3', df3, data_columns=['string'])
- result = store.unique('df3', 'string')
- check(result.values, df3['string'].valid().unique())
+ result = store.select_column('df3', 'string')
+ tm.assert_almost_equal(result.values, df3['string'].values)
def test_coordinates(self):
df = tm.makeTimeDataFrame()