diff --git a/RELEASE.rst b/RELEASE.rst index eaeaaf33f5fdd..60a2171f04452 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -1,3 +1,49 @@ + +************************ +pandas 0.4 Release Notes +************************ + +========== +What is it +========== + +**pandas** is a library of labeled data structures, statistical models, and +general code for working with time series and cross-sectional data. It was +designed with the practical needs of statistical modeling and large, +inhomogeneous data sets in mind. + +=============== +Where to get it +=============== + +Source code: http://github.com/wesm/pandas +Binary installers on PyPI: http://pypi.python.org/pypi/pandas +Documentation: http://pandas.sourceforge.net + +============= +Release notes +============= + +**Release date:** NOT YET RELEASED + +**New features / modules** + +* `DataFrame.describe` +* `DataFrame.quantile`, `Series.quantile` +* `DataFrame.describe` +* Fancy indexing + +**Improvements** + + +**API Changes** + +**Bug fixes** + +************************ +pandas 0.3 Release Notes +************************ + ============= Release Notes ============= diff --git a/pandas/core/common.py b/pandas/core/common.py index e39f0378e87d4..5477e184fff63 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -88,6 +88,8 @@ def null_out_axis(arr, mask, axis): arr[tuple(indexer)] = np.NaN +#------------------------------------------------------------------------------- +# Lots of little utilities def ensure_float(arr): if issubclass(arr.dtype.type, np.integer): @@ -102,3 +104,23 @@ def _mut_exclusive(arg1, arg2): return arg1 else: return arg2 + + +def _is_list_like(obj): + return isinstance(obj, (list, np.ndarray)) + +def _is_label_slice(labels, obj): + def crit(x): + if x in labels: + return False + else: + return isinstance(x, int) or x is None + return not crit(obj.start) or not crit(obj.stop) + +def _need_slice(obj): + return obj.start is not None or obj.stop is not None + +def _check_step(obj): + if obj.step is not None: + raise Exception('steps other than 1 are not supported') + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 00a9a45298921..f38ff99e8ea77 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10,7 +10,9 @@ from numpy import NaN import numpy as np -from pandas.core.common import (_pickle_array, _unpickle_array, isnull, notnull) +from pandas.core.common import (_pickle_array, _unpickle_array, isnull, notnull, + _check_step, _is_list_like, _need_slice, + _is_label_slice) from pandas.core.daterange import DateRange from pandas.core.generic import PandasGeneric from pandas.core.index import Index, NULL_INDEX @@ -2361,7 +2363,7 @@ def ix(self): return self._ix - def _fancy_index(self, key, axis=0): + def _fancy_getitem(self, key, axis=0): labels = self._get_axis(axis) axis_name = self._get_axis_name(axis) @@ -2376,21 +2378,21 @@ def _fancy_index(self, key, axis=0): else: return self.reindex(**{axis_name : key}) - def _fancy_index_tuple(self, rowkey, colkey): - result = self._fancy_index_axis(colkey, axis=1) + def _fancy_getitem_tuple(self, rowkey, colkey): + result = self._fancy_getitem_axis(colkey, axis=1) if isinstance(result, Series): result = result[rowkey] else: - result = result._fancy_index_axis(rowkey, axis=0) + result = result._fancy_getitem_axis(rowkey, axis=0) return result - def _fancy_index_axis(self, key, axis=0): + def _fancy_getitem_axis(self, key, axis=0): if isinstance(key, slice): return self._slice_axis(key, axis=axis) elif _is_list_like(key): - return self._fancy_index(key, axis=axis) + return self._fancy_getitem(key, axis=axis) elif axis == 0: idx = key if isinstance(key, int): @@ -2445,37 +2447,19 @@ def __init__(self, frame): def __getitem__(self, key): frame = self.frame if isinstance(key, slice): - return frame._fancy_index_axis(key, axis=0) + return frame._fancy_getitem_axis(key, axis=0) elif isinstance(key, tuple): if len(key) != 2: raise Exception('only length 2 tuple supported') - return frame._fancy_index_tuple(*key) + return frame._fancy_getitem_tuple(*key) elif _is_list_like(key): - return frame._fancy_index(key, axis=0) + return frame._fancy_getitem(key, axis=0) else: - return frame._fancy_index_axis(key, axis=0) + return frame._fancy_getitem_axis(key, axis=0) def __setitem__(self, key, value): raise NotImplementedError -def _is_list_like(obj): - return isinstance(obj, (list, np.ndarray)) - -def _is_label_slice(labels, obj): - def crit(x): - if x in labels: - return False - else: - return isinstance(x, int) or x is None - return not crit(obj.start) or not crit(obj.stop) - -def _need_slice(obj): - return obj.start is not None or obj.stop is not None - -def _check_step(obj): - if obj.step is not None: - raise Exception('steps other than 1 are not supported') - def try_sort(iterable): listed = list(iterable) try: diff --git a/pandas/core/functions.py b/pandas/core/functions.py index d07244d80a28d..c1fbf6b10f2dc 100644 --- a/pandas/core/functions.py +++ b/pandas/core/functions.py @@ -81,6 +81,31 @@ def dumb_way(series, buckets): sampled2 = sampled2.reindex(buckets) return sampled2 +def ts_upsample(dates, buckets, values, aggfunc, inclusive=True): + ''' + put something here + ''' + nbuckets = len(buckets) + nvalues = len(dates) + output = np.empty(nbuckets, dtype=float) + + if inclusive: + _check = lambda x, y: x < y + else: + _check = lambda x, y: x <= y + + j = 0 + for i, bound in enumerate(buckets): + next_bound = buckets[i + 1] + jstart = j + + while _check(dates[j], next_bound) and j < nvalues: + j += 1 + + output[i] = aggfunc(values[jstart:j]) + + return Series(output, index=buckets) + if __name__ == '__main__': N = 1000000 K = 1000 diff --git a/pandas/core/series.py b/pandas/core/series.py index a6937ef8425e7..ac91f2e8d3424 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,6 +14,9 @@ import numpy as np from pandas.core.common import isnull, notnull +from pandas.core.common import (_check_step, _is_list_like, _need_slice, + _is_label_slice) + from pandas.core.daterange import DateRange from pandas.core.generic import PandasGeneric from pandas.core.index import Index, NULL_INDEX @@ -1331,41 +1334,82 @@ def select(self, crit): """ return self._select_generic(crit, axis=0) + _ix = None + @property + def ix(self): + if self._ix is None: + self._ix = _SeriesIndexer(self) + + return self._ix + + def _fancy_getitem(self, key): + # asarray can be unsafe, NumPy strings are weird + if _isboolarr(key): + if isinstance(key, Series): + if not key.index.equals(self.index): + raise Exception('Cannot use boolean index with misaligned ' + 'or unequal labels') + return self.reindex(self.index[key]) + elif isinstance(key, slice): + if _is_label_slice(self.index, key): + i, j = self.index.slice_locs(key.start, key.stop) + return self[i:j] + else: + return self[key] + else: + return self.reindex(key) + + def _fancy_setitem(self, key, value): + if _isboolarr(key) or isinstance(key, slice): + if isinstance(key, Series): + if not key.index.equals(self.index): + raise Exception('Cannot use boolean index with misaligned ' + 'or unequal labels') + self[key] = value + else: + inds, mask = self.index.get_indexer(key) + if not mask.all(): + raise Exception('Indices %s not found' % key[-mask]) + self.put(inds, value) + class TimeSeries(Series): pass -def ts_upsample(dates, buckets, values, aggfunc, inclusive=True): - ''' - put something here - ''' - nbuckets = len(buckets) - nvalues = len(dates) - output = np.empty(nbuckets, dtype=float) - - if inclusive: - _check = lambda x, y: x < y - else: - _check = lambda x, y: x <= y +class _SeriesIndexer(object): + """ + Class to support fancy indexing, potentially using labels - j = 0 - for i, bound in enumerate(buckets): - next_bound = buckets[i + 1] - jstart = j + Notes + ----- + Indexing based on labels is INCLUSIVE + Slicing uses PYTHON SEMANTICS (endpoint is excluded) - while _check(dates[j], next_bound) and j < nvalues: - j += 1 + If Index contains int labels, these will be used rather than the locations, + so be very careful (ambiguous). - output[i] = aggfunc(values[jstart:j]) + Examples + -------- + >>> ts.ix[5:10] # equivalent to ts[5:10] + >>> ts.ix[[date1, date2, date3]] + >>> ts.ix[date1:date2] = 0 + """ + def __init__(self, series): + self.series = series - return Series(output, index=buckets) + def __getitem__(self, key): + return self.series._fancy_getitem(key) + def __setitem__(self, key, value): + return self.series._fancy_setitem(key, value) #------------------------------------------------------------------------------- # Supplementary functions _ndgi = ndarray.__getitem__ +_isboolarr = lambda x: np.asarray(x).dtype == np.bool_ + def remove_na(arr): """ Return array containing only true/non-NaN values, possibly empty. diff --git a/pandas/core/tests/test_frame.py b/pandas/core/tests/test_frame.py index c6df9c7cbb2c5..a95572ae98237 100644 --- a/pandas/core/tests/test_frame.py +++ b/pandas/core/tests/test_frame.py @@ -23,7 +23,200 @@ #------------------------------------------------------------------------------- # DataFrame test cases -class TestDataFrame(unittest.TestCase): +class CheckIndexing(object): + + def test_getitem(self): + # slicing + + sl = self.frame[:20] + self.assertEqual(20, len(sl.index)) + + # column access + + for _, series in sl.iteritems(): + self.assertEqual(20, len(series.index)) + self.assert_(common.equalContents(series.index, sl.index)) + + for key, _ in self.frame._series.iteritems(): + self.assert_(self.frame[key] is not None) + + self.assert_('random' not in self.frame) + self.assertRaises(Exception, self.frame.__getitem__, 'random') + + # boolean indexing + d = self.tsframe.index[10] + indexer = self.tsframe.index > d + + subindex = self.tsframe.index[indexer] + subframe = self.tsframe[indexer] + + self.assert_(np.array_equal(subindex, subframe.index)) + self.assertRaises(Exception, self.tsframe.__getitem__, indexer[:-1]) + + def test_setitem(self): + # not sure what else to do here + series = self.frame['A'][::2] + self.frame['col5'] = series + self.assert_('col5' in self.frame) + common.assert_dict_equal(series, self.frame['col5'], + compare_keys=False) + + series = self.frame['A'] + self.frame['col6'] = series + common.assert_dict_equal(series, self.frame['col6'], + compare_keys=False) + + self.assertRaises(Exception, self.frame.__setitem__, + randn(len(self.frame) + 1)) + + # set ndarray + arr = randn(len(self.frame)) + self.frame['col9'] = arr + self.assert_((self.frame['col9'] == arr).all()) + + # set value, do out of order for DataMatrix + self.frame['col7'] = 5 + assert((self.frame['col7'] == 5).all()) + + self.frame['col0'] = 3.14 + assert((self.frame['col0'] == 3.14).all()) + + self.frame['col8'] = 'foo' + assert((self.frame['col8'] == 'foo').all()) + + smaller = self.frame[:2] + smaller['col10'] = ['1', '2'] + self.assertEqual(smaller['col10'].dtype, np.object_) + self.assert_((smaller['col10'] == ['1', '2']).all()) + + def test_setitem_boolean(self): + df = self.frame.copy() + values = self.frame.values + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + self.assertRaises(Exception, df.__setitem__, df[:-1] > 0, 2) + self.assertRaises(Exception, df.__setitem__, df * 0, 2) + + def test_getitem_fancy_2d(self): + f = self.frame + ix = f.ix + + assert_frame_equal(ix[:, ['B', 'A']], f.reindex(columns=['B', 'A'])) + + # slicing rows, etc. + assert_frame_equal(ix[5:10], f[5:10]) + assert_frame_equal(ix[5:10, :], f[5:10]) + assert_frame_equal(ix[:5, ['A', 'B']], + f.reindex(index=f.index[:5], columns=['A', 'B'])) + + # slice rows with labels, inclusive! + expected = ix[5:11] + result = ix[f.index[5]:f.index[10]] + assert_frame_equal(expected, result) + + # slice columns + assert_frame_equal(ix[:, :2], f.reindex(columns=['A', 'B'])) + + def test_getitem_fancy_1d(self): + f = self.frame + ix = f.ix + + # return self if no slicing...for now + self.assert_(ix[:, :] is f) + + # low dimensional slice + xs1 = ix[2, ['C', 'B', 'A']] + xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) + assert_series_equal(xs1, xs2) + + ts1 = ix[5:10, 2] + ts2 = f[f.columns[2]][5:10] + assert_series_equal(ts1, ts2) + + # positional xs + xs1 = ix[0] + xs2 = f.xs(f.index[0]) + assert_series_equal(xs1, xs2) + + xs1 = ix[f.index[5]] + xs2 = f.xs(f.index[5]) + assert_series_equal(xs1, xs2) + + # single column + assert_series_equal(ix[:, 'A'], f['A']) + + def test_getitem_fancy_scalar(self): + f = self.frame + ix = f.ix + # individual value + for col in f.columns: + ts = f[col] + for idx in f.index[::5]: + assert_almost_equal(ix[idx, col], ts[idx]) + + def test_getitem_fancy_boolean(self): + f = self.frame + ix = f.ix + + expected = f.reindex(columns=['B', 'D']) + result = ix[:, [False, True, False, True]] + assert_frame_equal(result, expected) + + expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) + result = ix[5:10, [False, True, False, True]] + assert_frame_equal(result, expected) + + boolvec = f.index > f.index[7] + expected = f.reindex(index=f.index[boolvec]) + result = ix[boolvec] + assert_frame_equal(result, expected) + result = ix[boolvec, :] + assert_frame_equal(result, expected) + + result = ix[boolvec, 2:] + expected = f.reindex(index=f.index[boolvec], + columns=['C', 'D']) + assert_frame_equal(result, expected) + + def test_getitem_fancy_exceptions(self): + ix = self.frame.ix + self.assertRaises(Exception, ix.__getitem__, + (slice(None, None, None), + slice(None, None, None), + slice(None, None, None))) + + self.assertRaises(Exception, ix.__getitem__, slice(None, None, 2)) + + # boolean index misaligned labels + mask = self.frame['A'][::-1] > 1 + self.assertRaises(Exception, ix.__getitem__, mask) + + def test_setitem_fancy_1d(self): + f = self.frame + + def test_setitem_fancy_2d(self): + f = self.frame + + def test_setitem_fancy_scalar(self): + f = self.frame + ix = f.ix + # individual value + for col in f.columns: + ts = f[col] + for idx in f.index[::5]: + assert_almost_equal(ix[idx, col], ts[idx]) + + def test_setitem_fancy_boolean(self): + f = self.frame + +class TestDataFrame(unittest.TestCase, CheckIndexing): klass = DataFrame def setUp(self): @@ -392,85 +585,6 @@ def test_toString(self): frame = self.klass(index=np.arange(1000)) frame.toString(buf=buf) - def test_getitem(self): - # slicing - - sl = self.frame[:20] - self.assertEqual(20, len(sl.index)) - - # column access - - for _, series in sl.iteritems(): - self.assertEqual(20, len(series.index)) - self.assert_(common.equalContents(series.index, sl.index)) - - for key, _ in self.frame._series.iteritems(): - self.assert_(self.frame[key] is not None) - - self.assert_('random' not in self.frame) - self.assertRaises(Exception, self.frame.__getitem__, 'random') - - # boolean indexing - d = self.tsframe.index[10] - indexer = self.tsframe.index > d - - subindex = self.tsframe.index[indexer] - subframe = self.tsframe[indexer] - - self.assert_(np.array_equal(subindex, subframe.index)) - self.assertRaises(Exception, self.tsframe.__getitem__, indexer[:-1]) - - def test_setitem(self): - # not sure what else to do here - series = self.frame['A'][::2] - self.frame['col5'] = series - self.assert_('col5' in self.frame) - common.assert_dict_equal(series, self.frame['col5'], - compare_keys=False) - - series = self.frame['A'] - self.frame['col6'] = series - common.assert_dict_equal(series, self.frame['col6'], - compare_keys=False) - - self.assertRaises(Exception, self.frame.__setitem__, - randn(len(self.frame) + 1)) - - # set ndarray - arr = randn(len(self.frame)) - self.frame['col9'] = arr - self.assert_((self.frame['col9'] == arr).all()) - - # set value, do out of order for DataMatrix - self.frame['col7'] = 5 - assert((self.frame['col7'] == 5).all()) - - self.frame['col0'] = 3.14 - assert((self.frame['col0'] == 3.14).all()) - - self.frame['col8'] = 'foo' - assert((self.frame['col8'] == 'foo').all()) - - smaller = self.frame[:2] - smaller['col10'] = ['1', '2'] - self.assertEqual(smaller['col10'].dtype, np.object_) - self.assert_((smaller['col10'] == ['1', '2']).all()) - - def test_setitem_boolean(self): - df = self.frame.copy() - values = self.frame.values - - df[df > 0] = 5 - values[values > 0] = 5 - assert_almost_equal(df.values, values) - - df[df == 5] = 0 - values[values == 5] = 0 - assert_almost_equal(df.values, values) - - self.assertRaises(Exception, df.__setitem__, df[:-1] > 0, 2) - self.assertRaises(Exception, df.__setitem__, df * 0, 2) - def test_delitem(self): del self.frame['A'] self.assert_('A' not in self.frame) @@ -990,7 +1104,7 @@ def test_pivot(self): assert_frame_equal(pivoted, expected) - # corner cases + # TODO: corner cases? def test_reindex(self): newFrame = self.frame.reindex(self.ts1.index) @@ -1024,7 +1138,8 @@ def test_reindex(self): self.assert_(np.isnan(val)) for col, series in nonContigFrame.iteritems(): - self.assert_(common.equalContents(series.index, nonContigFrame.index)) + self.assert_(common.equalContents(series.index, + nonContigFrame.index)) # corner cases @@ -1700,100 +1815,6 @@ def test_describe(self): desc = self.mixed_frame.describe() desc = self.frame.describe() - def test_fancy_indexing_2d(self): - f = self.frame - ix = f.ix - - assert_frame_equal(ix[:, ['B', 'A']], f.reindex(columns=['B', 'A'])) - - # slicing rows, etc. - assert_frame_equal(ix[5:10], f[5:10]) - assert_frame_equal(ix[5:10, :], f[5:10]) - assert_frame_equal(ix[:5, ['A', 'B']], - f.reindex(index=f.index[:5], columns=['A', 'B'])) - - # slice rows with labels, inclusive! - expected = ix[5:11] - result = ix[f.index[5]:f.index[10]] - assert_frame_equal(expected, result) - - # slice columns - assert_frame_equal(ix[:, :2], f.reindex(columns=['A', 'B'])) - - def test_fancy_indexing_1d(self): - f = self.frame - ix = f.ix - - # return self if no slicing...for now - self.assert_(ix[:, :] is f) - - # low dimensional slice - xs1 = ix[2, ['C', 'B', 'A']] - xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) - assert_series_equal(xs1, xs2) - - ts1 = ix[5:10, 2] - ts2 = f[f.columns[2]][5:10] - assert_series_equal(ts1, ts2) - - # positional xs - xs1 = ix[0] - xs2 = f.xs(f.index[0]) - assert_series_equal(xs1, xs2) - - xs1 = ix[f.index[5]] - xs2 = f.xs(f.index[5]) - assert_series_equal(xs1, xs2) - - # single column - assert_series_equal(ix[:, 'A'], f['A']) - - def test_fancy_indexing_scalar(self): - f = self.frame - ix = f.ix - # individual value - for col in f.columns: - ts = f[col] - for idx in f.index[::5]: - assert_almost_equal(ix[idx, col], ts[idx]) - - def test_fancy_indexing_boolean(self): - f = self.frame - ix = f.ix - - expected = f.reindex(columns=['B', 'D']) - result = ix[:, [False, True, False, True]] - assert_frame_equal(result, expected) - - expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) - result = ix[5:10, [False, True, False, True]] - assert_frame_equal(result, expected) - - boolvec = f.index > f.index[7] - expected = f.reindex(index=f.index[boolvec]) - result = ix[boolvec] - assert_frame_equal(result, expected) - result = ix[boolvec, :] - assert_frame_equal(result, expected) - - result = ix[boolvec, 2:] - expected = f.reindex(index=f.index[boolvec], - columns=['C', 'D']) - assert_frame_equal(result, expected) - - def test_fancy_indexing_exceptions(self): - ix = self.frame.ix - self.assertRaises(Exception, ix.__getitem__, - (slice(None, None, None), - slice(None, None, None), - slice(None, None, None))) - - self.assertRaises(Exception, ix.__getitem__, slice(None, None, 2)) - - # boolean index misaligned labels - mask = self.frame['A'][::-1] > 1 - self.assertRaises(Exception, ix.__getitem__, mask) - def test_select(self): f = lambda x: x.weekday() == 2 result = self.tsframe.select(f, axis=0) diff --git a/pandas/core/tests/test_series.py b/pandas/core/tests/test_series.py index 6891d76d86ad4..4e6c3ea070a10 100644 --- a/pandas/core/tests/test_series.py +++ b/pandas/core/tests/test_series.py @@ -204,6 +204,29 @@ def test_setslice(self): self.assertEqual(len(sl), len(sl.index)) self.assertEqual(len(sl.index.indexMap), len(sl.index)) + def test_getitem_fancy(self): + inds = self.series.index[[3,4,7]] + assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) + assert_series_equal(self.series.ix[5::2], self.series[5::2]) + + # boolean + mask = self.series > self.series.median() + assert_series_equal(self.series.ix[mask], self.series[mask]) + + def test_setitem_fancy(self): + inds = self.series.index[[3,4,7]] + + result = self.series.copy() + result.ix[inds] = 5 + + expected = self.series.copy() + expected[[3,4,7]] = 5 + assert_series_equal(result, expected) + + result.ix[5:10] = 10 + expected[5:10] = 10 + assert_series_equal(result, expected) + def test_repr(self): str(self.ts) str(self.series) diff --git a/pandas/core/tests/test_tseries.py b/pandas/core/tests/test_tseries.py deleted file mode 100644 index 9add8c4492d51..0000000000000 --- a/pandas/core/tests/test_tseries.py +++ /dev/null @@ -1,48 +0,0 @@ -import unittest - -import pandas.util.testing as common -import pandas.lib.tseries as tseries - -class TestUtil(unittest.TestCase): - - def test_map_indices(self): - pass - - - -class TestReindex(unittest.TestCase): - - def test_getMergeVec(self): - pass - - def test_getFillVec(self): - pass - - - -class TestGroupby(unittest.TestCase): - pass - -class TestIsnull(unittest.TestCase): - pass - -class TestOperators(unittest.TestCase): - pass - -class TestMoments(unittest.TestCase): - - def test_sum(self): - pass - - def test_mean(self): - pass - - def test_var(self): - pass - - def test_skewness(self): - pass - - def test_kurtosis(self): - pass - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 003483ac29e31..b6a39f9a80bbf 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -73,7 +73,8 @@ def simpleParser(lines, colNames=None, header=0, indexCol=0, colCounts[col] += 1 else: if not colNames: - columns = string.ascii_uppercase[:len(lines[0])] + columns = list(string.ascii_uppercase[:len(lines[0])]) + # columns = ['X.%d' % (i + 1) for i in range(len(lines[0]))] else: columns = colNames content = lines