From d5cb5eec1a8984ad4da4b457d721a74dff5aa69c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 9 Oct 2011 17:59:53 -0400 Subject: [PATCH 001/161] RLS: update release notes --- RELEASE.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 97fade2c1b7ca..0c542e7628b8a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -8,7 +8,7 @@ see the commit logs at http://github.com/wesm/pandas pandas 0.4.3 ============ -**Release date:** not yet released +**Release date:** 10/9/2011 This is largely a bugfix release from 0.4.2 but also includes a handful of new and enhanced features. Also, pandas can now be installed and used on Python 3 From d1a06ded6164fd23530c93b4a756cce1f5af48f3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 9 Oct 2011 20:51:06 -0400 Subject: [PATCH 002/161] RLS: set released to False --- pandas/core/internals.py | 3 +++ setup.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a3df3b16b3b10..9f73c9397b8c7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -915,7 +915,10 @@ def _union_items_slow(all_items): seen = seen.union(items) return seen +from line_profiler import LineProfiler +prof = LineProfiler() +@prof def join_managers(left, right, axis=1, how='left'): """ Parameters diff --git a/setup.py b/setup.py index 318e0bb8e8967..5cee226f296ff 100755 --- a/setup.py +++ b/setup.py @@ -129,8 +129,8 @@ MAJOR = 0 MINOR = 4 -MICRO = 3 -ISRELEASED = True +MICRO = 4 +ISRELEASED = False VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) FULLVERSION = VERSION From e3eac6791dd5d9a64e9e892f9615094059ac7f1a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2011 13:22:55 -0400 Subject: [PATCH 003/161] ENH: refactored join_managers into a tidy class and sped up joining in homogeneous case --- RELEASE.rst | 11 ++ TODO.rst | 6 + pandas/core/internals.py | 306 +++++++++++++++++++++++---------------- scripts/bench_join.py | 11 +- 4 files changed, 208 insertions(+), 126 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 0c542e7628b8a..6bc55b030be19 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -5,6 +5,17 @@ Release Notes This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/wesm/pandas +pandas 0.4.4 +============ + +**Release date:** not yet released + +**Improvements to existing features** + + - Refactored merging / joining code into a tidy class and disabled unnecessary + computations in the float/object case, thus getting about 10% better + performance + pandas 0.4.3 ============ diff --git a/TODO.rst b/TODO.rst index 836c2791e971d..2d9dd87867fc9 100644 --- a/TODO.rst +++ b/TODO.rst @@ -1,7 +1,13 @@ +DONE +---- - SparseSeries name integration + tests - Refactor Series.repr + +TODO +---- - .name pickling / unpicking / HDFStore handling - Is there a way to write hierarchical columns to csv? - Possible to blow away existing name when creating MultiIndex? - prettytable output with index names - Add load/save functions to top level pandas namespace +- _consolidate, does it always copy? 
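The "disabled unnecessary computations in the float/object case" item in the release note above refers to skipping the mask/upcast work for blocks that can already hold NaN. A minimal pure-Python sketch of that short-circuit (the helper name needs_upcast is illustrative, not the pandas API): only integer and boolean blocks ever need upcasting, and then only when the join indexer actually marks missing rows with -1.

import numpy as np

def needs_upcast(dtype, indexer):
    # float/object data holds NaN natively, so no mask is ever computed
    if dtype.kind not in ('i', 'b'):
        return False
    # identical axes: no indexer, so nothing can be missing
    if indexer is None:
        return False
    # int/bool must be upcast only if the join introduced missing rows
    return bool((indexer == -1).any())

needs_upcast(np.dtype('int64'), np.array([0, 1, 2]))   # False: no -1s, keep int
needs_upcast(np.dtype('int64'), np.array([0, -1, 2]))  # True: upcast to float

The lmask_info / rmask_info properties in the diff below cache exactly this check so it runs at most once per side of the join.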
diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9f73c9397b8c7..e47ad99518201 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4,6 +4,7 @@ import numpy as np from pandas.core.index import Index, _ensure_index +from pandas.util.decorators import cache_readonly import pandas.core.common as common import pandas._tseries as lib @@ -915,146 +916,205 @@ def _union_items_slow(all_items): seen = seen.union(items) return seen -from line_profiler import LineProfiler -prof = LineProfiler() +def join_managers(left, right, axis=1, how='left', copy=True): + op = _JoinOperation(left, right, axis=axis, how=how) + return op.get_result(copy=copy) -@prof -def join_managers(left, right, axis=1, how='left'): +class _JoinOperation(object): """ - Parameters - ---------- - other - lindexer - lmask - rindexer - rmask - - Returns - ------- - merged : BlockManager + Object responsible for orchestrating efficient join operation between two + BlockManager data structures """ - assert(left.is_consolidated()) - assert(right.is_consolidated()) + def __init__(self, left, right, axis=1, how='left'): + self.left = left + self.right = right + self.axis = axis + self.how = how - laxis = left.axes[axis] - raxis = right.axes[axis] + assert(left.is_consolidated()) + assert(right.is_consolidated()) - join_index, lindexer, rindexer = laxis.join(raxis, how=how, - return_indexers=True) + laxis = left.axes[axis] + raxis = right.axes[axis] - N = len(join_index) + (self.join_index, + self.lindexer, + self.rindexer) = laxis.join(raxis, how=how, return_indexers=True) - if lindexer is None: - lmask = None - lneed_masking = None - else: - lmask = lindexer == -1 - lneed_masking = lmask.any() + # do NOT sort + self.result_items = left.items.append(right.items) + self.result_axes = list(left.axes) + self.result_axes[0] = self.result_items + self.result_axes[axis] = self.join_index - if rindexer is None: - rmask = None - rneed_masking = None - else: - rmask = rindexer == -1 - rneed_masking = rmask.any() - - lblocks = _maybe_upcast_blocks(left.blocks, lneed_masking) - rblocks = _maybe_upcast_blocks(right.blocks, rneed_masking) - - left_blockmap = dict((type(blk), blk) for blk in lblocks) - right_blockmap = dict((type(blk), blk) for blk in rblocks) - - # do NOT sort - result_items = left.items.append(right.items) - - result_axes = list(left.axes) - result_axes[0] = result_items - result_axes[axis] = join_index - - result_blocks = [] - - # copies all data by definition - - kinds = set(left_blockmap) | set(right_blockmap) - for klass in kinds: - if klass in left_blockmap and klass in right_blockmap: - # true merge, do not produce intermediate copy - lblk = left_blockmap[klass] - rblk = right_blockmap[klass] - new_values = _merge_blocks_fast(lblk, rblk, - lindexer, lmask, lneed_masking, - rindexer, rmask, rneed_masking, - axis=axis) - new_items = lblk.items.append(rblk.items) - res_blk = make_block(new_values, new_items, result_items) - elif klass in left_blockmap: - # only take necessary - blk = left_blockmap[klass] - if lindexer is None: - res_blk = blk.copy() - else: - res_blk = blk.reindex_axis(lindexer, lmask, lneed_masking, - axis=axis) - res_blk.ref_items = result_items - elif klass in right_blockmap: - # only take necessary - blk = right_blockmap[klass] - if rindexer is None: - res_blk = blk.copy() + def get_result(self, copy=False): + """ + Parameters + ---------- + other + lindexer + lmask + rindexer + rmask + + Returns + ------- + merged : BlockManager + """ + left_blockmap, right_blockmap = 
self._prepare_blocks() + + result_blocks = [] + + # maybe want to enable flexible copying + + kinds = set(left_blockmap) | set(right_blockmap) + for klass in kinds: + lblk = left_blockmap.get(klass) + rblk = right_blockmap.get(klass) + + if lblk and rblk: + # true merge, do not produce intermediate copy + res_blk = self._merge_blocks(lblk, rblk) + elif lblk: + res_blk = self._reindex_block(lblk, side='left') else: - res_blk = blk.reindex_axis(rindexer, rmask, rneed_masking, - axis=axis) - res_blk.ref_items = result_items + res_blk = self._reindex_block(rblk, side='right') - result_blocks.append(res_blk) + result_blocks.append(res_blk) - return BlockManager(result_blocks, result_axes) + return BlockManager(result_blocks, self.result_axes) -def _maybe_upcast_blocks(blocks, needs_masking): - """ - Upcast and consolidate if necessary - """ - if not needs_masking: - return blocks - new_blocks = [] - for block in blocks: - if isinstance(block, IntBlock): - newb = make_block(block.values.astype(float), block.items, - block.ref_items) - elif isinstance(block, BoolBlock): - newb = make_block(block.values.astype(object), block.items, - block.ref_items) + def _prepare_blocks(self): + lblocks = self.left.blocks + rblocks = self.right.blocks + + # will short-circuit and not compute lneed_masking + if self._may_need_upcasting(lblocks) and self.lneed_masking: + lblocks = self._upcast_blocks(lblocks) + + if self._may_need_upcasting(rblocks) and self.rneed_masking: + rblocks = self._upcast_blocks(rblocks) + + left_blockmap = dict((type(blk), blk) for blk in lblocks) + right_blockmap = dict((type(blk), blk) for blk in rblocks) + + return left_blockmap, right_blockmap + + def _reindex_block(self, block, side='left', copy=True): + indexer = self.lindexer if side == 'left' else self.rindexer + + # still some inefficiency here for bool/int64 because in the case where + # no masking is needed, take_fast will recompute the mask + + if indexer is None and copy: + result = block.copy() + else: + result = block.reindex_axis(indexer, None, False, axis=self.axis) + + result.ref_items = self.result_items + return result + + @cache_readonly + def lmask_info(self): + if self.lindexer is None: + lmask = None + lneed_masking = None else: - newb = block - new_blocks.append(newb) + lmask = self.lindexer == -1 + lneed_masking = lmask.any() - # use any ref_items - return _consolidate(new_blocks, newb.ref_items) + return lmask, lneed_masking -def _merge_blocks_fast(left, right, lindexer, lmask, lneed_masking, - rindexer, rmask, rneed_masking, axis=1): + @cache_readonly + def rmask_info(self): + if self.rindexer is None: + rmask = None + rneed_masking = None + else: + rmask = self.rindexer == -1 + rneed_masking = rmask.any() - n = left.values.shape[axis] if lindexer is None else len(lindexer) - lk = len(left.items) - rk = len(right.items) + return rmask, rneed_masking - out_shape = list(left.shape) - out_shape[0] = lk + rk - out_shape[axis] = n + @property + def lneed_masking(self): + return self.lmask_info[1] - out = np.empty(out_shape, dtype=left.values.dtype) + @property + def lmask(self): + return self.lmask_info[0] - if lindexer is None: - common.take_fast(left.values, np.arange(n, dtype=np.int32), - None, False, axis=axis, out=out[:lk]) - else: - common.take_fast(left.values, lindexer, lmask, lneed_masking, - axis=axis, out=out[:lk]) + @property + def rneed_masking(self): + return self.rmask_info[1] - if rindexer is None: - common.take_fast(right.values, np.arange(n, dtype=np.int32), - None, False, axis=axis, out=out[lk:]) 
- else: - common.take_fast(right.values, rindexer, rmask, rneed_masking, - axis=axis, out=out[lk:]) - return out + @property + def rmask(self): + return self.rmask_info[0] + + @staticmethod + def _may_need_upcasting(blocks): + for block in blocks: + if isinstance(block, (IntBlock, BoolBlock)): + return True + return False + + def _merge_blocks(self, lblk, rblk): + lidx = self.lindexer + ridx = self.rindexer + + n = lblk.values.shape[self.axis] if lidx is None else len(lidx) + lk = len(lblk.items) + rk = len(rblk.items) + + out_shape = list(lblk.shape) + out_shape[0] = lk + rk + out_shape[self.axis] = n + + out = np.empty(out_shape, dtype=lblk.values.dtype) + + # is this really faster than assigning to arr.flat? + if lidx is None: + # out[:lk] = lblk.values + common.take_fast(lblk.values, np.arange(n, dtype='i4'), + None, False, + axis=self.axis, out=out[:lk]) + else: + # write out the values to the result array + common.take_fast(lblk.values, lidx, None, False, + axis=self.axis, out=out[:lk]) + if ridx is None: + # out[lk:] = lblk.values + common.take_fast(rblk.values, np.arange(n, dtype='i4'), + None, False, + axis=self.axis, out=out[lk:]) + else: + common.take_fast(rblk.values, ridx, None, False, + axis=self.axis, out=out[lk:]) + + # does not sort + new_items = lblk.items.append(rblk.items) + return make_block(out, new_items, self.result_items) + + @staticmethod + def _upcast_blocks(self, blocks, need_masking=True): + """ + Upcast and consolidate if necessary + """ + if not need_masking: + return blocks + + new_blocks = [] + for block in blocks: + if isinstance(block, IntBlock): + newb = make_block(block.values.astype(float), block.items, + block.ref_items) + elif isinstance(block, BoolBlock): + newb = make_block(block.values.astype(object), block.items, + block.ref_items) + else: + newb = block + new_blocks.append(newb) + + # use any ref_items + return _consolidate(new_blocks, newb.ref_items) diff --git a/scripts/bench_join.py b/scripts/bench_join.py index 1a82f8a1762bd..56d97599bb802 100644 --- a/scripts/bench_join.py +++ b/scripts/bench_join.py @@ -136,7 +136,7 @@ def join(a, b, av, bv, how="left"): def bench_python(n=100000, pct_overlap=0.20, K=1): import gc ns = [2, 3, 4, 5, 6] - iterations = 50 + iterations = 200 pct_overlap = 0.2 kinds = ['outer', 'left', 'inner'] @@ -156,11 +156,16 @@ def bench_python(n=100000, pct_overlap=0.20, K=1): for kind in kinds: gc.disable() + elapsed = 0 _s = time.clock() - for _ in range(iterations): + for i in range(iterations): + if i % 10 == 0: + elapsed += time.clock() - _s + gc.collect() + _s = time.clock() a_frame.join(b_frame, how=kind) # join(a, b, avf, bvf, how=kind) - elapsed = time.clock() - _s + elapsed += time.clock() - _s gc.enable() result[kind] = (elapsed / iterations) * 1000 From 549453aa779ef24a5f1942781e61da248bc6467f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2011 13:25:21 -0400 Subject: [PATCH 004/161] BUG: buglet, test suite passes now --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e47ad99518201..fec3be706cef2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1097,7 +1097,7 @@ def _merge_blocks(self, lblk, rblk): return make_block(out, new_items, self.result_items) @staticmethod - def _upcast_blocks(self, blocks, need_masking=True): + def _upcast_blocks(blocks, need_masking=True): """ Upcast and consolidate if necessary """ From a0a9520b2f164cd2c77a30b1461b0ee1010a2821 Mon Sep 17 
00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2011 13:47:51 -0400 Subject: [PATCH 005/161] ENH: some more join refactoring and testing --- pandas/core/internals.py | 38 +++++++++++++++++++------------------- pandas/tests/test_frame.py | 21 +++++++++++++++++++-- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fec3be706cef2..dea6c9d5410d5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -989,10 +989,10 @@ def _prepare_blocks(self): rblocks = self.right.blocks # will short-circuit and not compute lneed_masking - if self._may_need_upcasting(lblocks) and self.lneed_masking: + if self.lneed_masking: lblocks = self._upcast_blocks(lblocks) - if self._may_need_upcasting(rblocks) and self.rneed_masking: + if self.rneed_masking: rblocks = self._upcast_blocks(rblocks) left_blockmap = dict((type(blk), blk) for blk in lblocks) @@ -1001,7 +1001,12 @@ def _prepare_blocks(self): return left_blockmap, right_blockmap def _reindex_block(self, block, side='left', copy=True): - indexer = self.lindexer if side == 'left' else self.rindexer + if side == 'left': + indexer = self.lindexer + mask, need_masking = self.lmask_info + else: + indexer = self.rindexer + mask, need_masking = self.rmask_info # still some inefficiency here for bool/int64 because in the case where # no masking is needed, take_fast will recompute the mask @@ -1009,16 +1014,18 @@ def _reindex_block(self, block, side='left', copy=True): if indexer is None and copy: result = block.copy() else: - result = block.reindex_axis(indexer, None, False, axis=self.axis) + result = block.reindex_axis(indexer, mask, need_masking, + axis=self.axis) result.ref_items = self.result_items return result @cache_readonly def lmask_info(self): - if self.lindexer is None: + if (self.lindexer is None or + not self._may_need_upcasting(self.left.blocks)): lmask = None - lneed_masking = None + lneed_masking = False else: lmask = self.lindexer == -1 lneed_masking = lmask.any() @@ -1027,9 +1034,10 @@ def lmask_info(self): @cache_readonly def rmask_info(self): - if self.rindexer is None: + if (self.rindexer is None or + not self._may_need_upcasting(self.right.blocks)): rmask = None - rneed_masking = None + rneed_masking = False else: rmask = self.rindexer == -1 rneed_masking = rmask.any() @@ -1040,18 +1048,10 @@ def rmask_info(self): def lneed_masking(self): return self.lmask_info[1] - @property - def lmask(self): - return self.lmask_info[0] - @property def rneed_masking(self): return self.rmask_info[1] - @property - def rmask(self): - return self.rmask_info[0] - @staticmethod def _may_need_upcasting(blocks): for block in blocks: @@ -1097,12 +1097,12 @@ def _merge_blocks(self, lblk, rblk): return make_block(out, new_items, self.result_items) @staticmethod - def _upcast_blocks(blocks, need_masking=True): + def _upcast_blocks(blocks): """ Upcast and consolidate if necessary """ - if not need_masking: - return blocks + # if not need_masking: + # return blocks new_blocks = [] for block in blocks: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 06795d5cafd04..3f2ebfc6e9ddf 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -700,6 +700,23 @@ def test_join_index(self): self.assertRaises(Exception, f.join, f2, how='foo') + def test_join_index_more(self): + af = self.frame.ix[:, ['A', 'B']] + bf = self.frame.ix[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = self.frame['C'][::2] + expected['D'] = self.frame['D'][::2] + + 
result = af.join(bf) + assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + assert_frame_equal(result, expected.ix[:, result.columns]) + def test_join_index_series(self): df = self.frame.copy() s = df.pop(self.frame.columns[-1]) @@ -1582,7 +1599,7 @@ def test_to_csv_float32_nanrep(self): lines = open(pth).readlines() self.assert_(lines[1].split(',')[2] == '999') os.remove(pth) - + def test_to_csv_withcommas(self): "Commas inside fields should be correctly escaped when saving as CSV." path = '__tmp__' @@ -1590,7 +1607,7 @@ def test_to_csv_withcommas(self): df.to_csv(path) df2 = DataFrame.from_csv(path) assert_frame_equal(df2, df) - + os.remove(path) def test_info(self): From 7fe8b2c6e6f5ae60f4c56df77f289f80531dbbaf Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2011 15:29:23 -0400 Subject: [PATCH 006/161] ENH: add monotonic join logic for object dtype Indexes, start on DataFrame.align --- TODO.rst | 1 + pandas/core/frame.py | 32 +++++++++++++++++++++++++++++++- pandas/core/index.py | 27 +++++++++++++++++++++++++++ pandas/core/series.py | 29 +++++++++++------------------ 4 files changed, 70 insertions(+), 19 deletions(-) diff --git a/TODO.rst b/TODO.rst index 2d9dd87867fc9..c879c1fdff57d 100644 --- a/TODO.rst +++ b/TODO.rst @@ -11,3 +11,4 @@ TODO - prettytable output with index names - Add load/save functions to top level pandas namespace - _consolidate, does it always copy? +- Series.align with fill method. Will have to generate more Cython code diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6338492558254..4430bb607c7d2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1010,7 +1010,37 @@ def xs(self, key, axis=0, copy=True): return result #---------------------------------------------------------------------- - # Reindexing + # Reindexing and alignment + + def align(self, other, join='outer', copy=True): + """ + Align two DataFrame object on their index and columns with the specified + join method for each axis Index + + Parameters + ---------- + other : DataFrame + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + + Returns + ------- + (left, right) : (Series, Series) + Aligned Series + """ + join_index, ilidx, iridx = self.index.join(other.index, how=join, + return_indexers=True) + + # TODO: speed up on homogeneous DataFrame objects + join_columns, clidx, cridx = self.columns.join(other.columns, how=join, + return_indexers=True) + + def _align_frame(frame, row_idx, col_idx): + new_data = frame._data + return DataFrame(new_data) + + left = _align_frame(self, ilidx, clidx) + right = _align_frame(other, iridx, cridx) + return left, right def reindex(self, index=None, columns=None, method=None, copy=True): """Conform Series to new index with optional filling logic, placing diff --git a/pandas/core/index.py b/pandas/core/index.py index daa9592c6e66e..82183c5583300 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -441,6 +441,10 @@ def reindex(self, target, method=None): return target, indexer def join(self, other, how='left', return_indexers=False): + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + if how == 'left': join_index = self elif how == 'right': @@ -465,6 +469,29 @@ def join(self, other, how='left', return_indexers=False): else: return join_index + def _join_monotonic(self, other, how='left', return_indexers=False): + if how == 
'left': + join_index = self + lidx = None + ridx = lib.left_join_indexer_object(self, other) + elif how == 'right': + join_index = other + lidx = lib.left_join_indexer_object(other, self) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = lib.inner_join_indexer_object(self, other) + join_index = Index(join_index) + elif how == 'outer': + join_index, lidx, ridx = lib.outer_join_indexer_object(self, other) + join_index = Index(join_index) + else: # pragma: no cover + raise Exception('do not recognize join method %s' % how) + + if return_indexers: + return join_index, lidx, ridx + else: + return join_index + def slice_locs(self, start=None, end=None): """ For an ordered Index, compute the slice locations for input labels diff --git a/pandas/core/series.py b/pandas/core/series.py index 184a6d894c6fb..2c56a374c2227 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1353,26 +1353,19 @@ def align(self, other, join='outer', copy=True): join_index, lidx, ridx = self.index.join(other.index, how=join, return_indexers=True) - if lidx is not None: - left = Series(common.take_1d(self.values, lidx), join_index, - name=self.name) - else: - if copy: - new_values = self.values.copy() - else: - new_values = self.values - left = Series(new_values, join_index, name=self.name) - - if ridx is not None: - right = Series(common.take_1d(other.values, ridx), join_index, - name=other.name) - else: - if copy: - new_values = other.values.copy() + def _align_series(series, indexer): + if indexer is not None: + new_values = common.take_1d(series.values, indexer) else: - new_values = other.values - right = Series(new_values, join_index, name=other.name) + if copy: + new_values = series.values.copy() + else: + new_values = series.values + result = Series(new_values, join_index, name=series.name) + return result + left = _align_series(self, lidx) + right = _align_series(other, ridx) return left, right def reindex(self, index=None, method=None, copy=True): From b7a6c347f55df9f81f241c961f2bc6d9bee0d6f9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2011 21:02:18 -0400 Subject: [PATCH 007/161] ENH: DataFrame.align function, speed up DataFrame arith, refactoring --- pandas/core/frame.py | 62 +++++++++++++++++-------------------- pandas/core/internals.py | 31 ++++++++++++++++--- pandas/core/series.py | 7 +++++ pandas/tests/test_series.py | 5 +++ 4 files changed, 68 insertions(+), 37 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4430bb607c7d2..e8da7c3cba8e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -90,6 +90,7 @@ def f(self, other): return f + #---------------------------------------------------------------------- # DataFrame class @@ -1027,15 +1028,33 @@ def align(self, other, join='outer', copy=True): (left, right) : (Series, Series) Aligned Series """ - join_index, ilidx, iridx = self.index.join(other.index, how=join, - return_indexers=True) - - # TODO: speed up on homogeneous DataFrame objects - join_columns, clidx, cridx = self.columns.join(other.columns, how=join, + if self.index.equals(other.index): + join_index = self.index + ilidx, iridx = None, None + else: + join_index, ilidx, iridx = self.index.join(other.index, how=join, return_indexers=True) + if self.columns.equals(other.columns): + join_columns = self.columns + clidx, cridx = None, None + else: + join_columns, clidx, cridx = self.columns.join(other.columns, + how=join, + return_indexers=True) + def _align_frame(frame, row_idx, col_idx): new_data = frame._data + if 
row_idx is not None: + new_data = new_data.reindex_indexer(join_index, row_idx, axis=1) + + if col_idx is not None: + # TODO: speed up on homogeneous DataFrame objects + new_data = new_data.reindex_items(join_columns) + + if copy and new_data is frame._data: + new_data = new_data.copy() + return DataFrame(new_data) left = _align_frame(self, ilidx, clidx) @@ -1477,7 +1496,8 @@ def _rename_columns_inplace(self, mapper): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None): - new_index = self.index.union(other.index) + this, other = self.align(other, join='outer', copy=False) + new_index, new_columns = this.index, this.columns # some shortcuts if fill_value is None: @@ -1488,18 +1508,6 @@ def _combine_frame(self, other, func, fill_value=None): elif not other: return self * nan - need_reindex = False - new_columns = self.columns.union(other.columns) - need_reindex = (need_reindex or not new_index.equals(self.index) - or not new_index.equals(other.index)) - need_reindex = (need_reindex or not new_columns.equals(self.columns) - or not new_columns.equals(other.columns)) - - this = self - if need_reindex: - this = self.reindex(index=new_index, columns=new_columns) - other = other.reindex(index=new_index, columns=new_columns) - this_vals = this.values other_vals = other.values @@ -2275,21 +2283,7 @@ def corrwith(self, other, axis=0, drop=False): this = self._get_numeric_data() other = other._get_numeric_data() - com_index = this._intersect_index(other) - com_cols = this._intersect_columns(other) - - # feels hackish - if axis == 0: - result_index = com_index - if not drop: - result_index = this.columns.union(other.columns) - else: - result_index = com_cols - if not drop: - result_index = this.index.union(other.index) - - left = this.reindex(index=com_index, columns=com_cols) - right = other.reindex(index=com_index, columns=com_cols) + left, right = this.align(other, join='inner', copy=False) # mask missing values left = left + right * 0 @@ -2309,6 +2303,8 @@ def corrwith(self, other, axis=0, drop=False): correl = num / dom if not drop: + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) correl = correl.reindex(result_index) return correl diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dea6c9d5410d5..ff43280a810ba 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -580,6 +580,24 @@ def reindex_axis(self, new_axis, method=None, axis=0): new_axes[axis] = new_axis return BlockManager(new_blocks, new_axes) + def reindex_indexer(self, new_axis, indexer, axis=1): + """ + pandas-indexer with -1's only + """ + if axis == 0: + raise NotImplementedError + + new_axes = list(self.axes) + new_axes[axis] = new_axis + new_blocks = [] + for blk in self.blocks: + new_values = common.take_fast(blk.values, indexer, None, + False, axis=axis) + newb = make_block(new_values, blk.items, self.items) + new_blocks.append(newb) + + return BlockManager(new_blocks, new_axes) + def reindex_items(self, new_items): """ @@ -617,16 +635,21 @@ def reindex_items(self, new_items): return BlockManager(new_blocks, new_axes) - def take(self, indices, axis=1): + def take(self, indexer, axis=1, pandas_indexer=False): if axis == 0: raise NotImplementedError + if pandas_indexer: + take_f = lambda arr: common.take_fast(arr, indexer, + None, False, axis=axis) + else: + take_f = lambda arr: arr.take(indexer, axis=axis) + new_axes = list(self.axes) - new_axes[axis] = self.axes[axis].take(indices) + new_axes[axis] = 
self.axes[axis].take(indexer) new_blocks = [] for blk in self.blocks: - newb = make_block(blk.values.take(indices, axis=axis), blk.items, - self.items) + newb = make_block(take_f(blk.values), blk.items, self.items) new_blocks.append(newb) return BlockManager(new_blocks, new_axes) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2c56a374c2227..80f53372b79cf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1350,6 +1350,13 @@ def align(self, other, join='outer', copy=True): (left, right) : (Series, Series) Aligned Series """ + if self.index.equals(other.index): + left, right = self, other + if copy: + left = left.copy() + right = right.copy() + return left, right + join_index, lidx, ridx = self.index.join(other.index, how=join, return_indexers=True) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index f8b865ab7f860..5ad68539f5453 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1025,6 +1025,11 @@ def test_align_nocopy(self): rb[:2] = 5 self.assert_((b[:2] == 5).all()) + def test_align_sameindex(self): + a, b = self.ts.align(self.ts) + self.assert_(a.index is self.ts.index) + self.assert_(b.index is self.ts.index) + def test_reindex(self): identity = self.series.reindex(self.series.index) self.assertEqual(id(self.series.index), id(identity.index)) From 0adcfce5f7be08abe768460e75667b6f846ace28 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Oct 2011 14:36:26 -0400 Subject: [PATCH 008/161] ENH: first cut at optimizing DataFrame.xs, next step cythonize --- pandas/core/frame.py | 18 ++++++++++++------ pandas/core/index.py | 23 +++++++++++++++-------- pandas/core/internals.py | 27 +++++++++++++++++++++++++++ pandas/core/series.py | 2 +- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e8da7c3cba8e5..b6f5154d66c92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1002,13 +1002,19 @@ def xs(self, key, axis=0, copy=True): return data self._consolidate_inplace() - new_data = self._data.xs(key, axis=1, copy=copy) - if new_data.ndim == 1: - return Series(new_data.as_matrix(), index=self.columns, name=key) + loc = self.index.get_loc(key) + if np.isscalar(loc): + new_values = self._data.fast_2d_xs(loc, copy=copy) + return Series(new_values, index=self.columns, name=key) else: - result = DataFrame(new_data) - result.index = _maybe_droplevels(result.index, key) - return result + new_data = self._data.xs(key, axis=1, copy=copy) + if new_data.ndim == 1: + return Series(new_data.as_matrix(), index=self.columns, + name=key) + else: + result = DataFrame(new_data) + result.index = _maybe_droplevels(result.index, key) + return result #---------------------------------------------------------------------- # Reindexing and alignment diff --git a/pandas/core/index.py b/pandas/core/index.py index 82183c5583300..672736956c8d0 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -83,15 +83,21 @@ def is_monotonic(self): return lib.is_monotonic_object(self) _indexMap = None + _integrity = False @property def indexMap(self): "{label -> location}" if self._indexMap is None: self._indexMap = lib.map_indices_object(self) - self._verify_integrity() + self._integrity = len(self._indexMap) == len(self) + if not self._integrity: + raise Exception('Index cannot contain duplicate values!') return self._indexMap + def _verify_integrity(self): + return len(self.indexMap) == len(self) + _allDates = None def is_all_dates(self): """ @@ -102,10 +108,6 
@@ def is_all_dates(self): return self._allDates - def _verify_integrity(self): - if len(self.indexMap) < len(self): - raise Exception('Index cannot contain duplicate values!') - def __iter__(self): return iter(self.view(np.ndarray)) @@ -361,7 +363,6 @@ def get_loc(self, key): ------- loc : int """ - self._verify_integrity() return self.indexMap[key] def get_indexer(self, target, method=None): @@ -635,7 +636,10 @@ def indexMap(self): "{label -> location}" if self._indexMap is None: self._indexMap = lib.map_indices_int64(self) - self._verify_integrity() + self._integrity = len(self._indexMap) == len(self) + + if not self._integrity: + raise Exception('Index cannot contain duplicate values!') return self._indexMap @@ -992,7 +996,10 @@ def indexMap(self): if self._indexMap is None: zipped = zip(*self.labels) self._indexMap = lib.map_indices_list(zipped) - self._verify_integrity() + self._integrity = len(self._indexMap) == len(self) + + if not self._integrity: + raise Exception('Index cannot contain duplicate values!') return self._indexMap diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ff43280a810ba..5f1d9c5be5035 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -471,6 +471,32 @@ def xs(self, key, axis=1, copy=True): return BlockManager(new_blocks, new_axes) + def fast_2d_xs(self, loc, copy=False): + """ + + """ + if len(self.blocks) == 1: + result = self.blocks[0].values[:, loc] + if copy: + result = result.copy() + return result + + if not copy: + raise Exception('cannot get view of mixed-type or ' + 'non-consolidated DataFrame') + + items = self.items + dtype = _interleaved_dtype(self.blocks) + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + values = blk.values + for j, item in enumerate(blk.items): + i = items.get_loc(item) + result[i] = values[j, loc] + + return result + def consolidate(self): """ Join together blocks having same dtype @@ -1141,3 +1167,4 @@ def _upcast_blocks(blocks): # use any ref_items return _consolidate(new_blocks, newb.ref_items) + diff --git a/pandas/core/series.py b/pandas/core/series.py index 80f53372b79cf..ebab919a7f08c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -990,7 +990,7 @@ def append(self, other): y : Series """ new_index = self.index.append(other.index) - new_index._verify_integrity() + assert(new_index._verify_integrity()) new_values = np.concatenate((self.values, other.values)) name = _maybe_match_name(self, other) From 5ae1a59bc940947ffb45c73bf7f927effada6070 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Oct 2011 15:32:50 -0400 Subject: [PATCH 009/161] ENH: implement multi-key joining. 
fairly naive impl for now --- pandas/core/common.py | 6 +++--- pandas/core/frame.py | 8 +++++++- pandas/core/index.py | 11 +++++++---- pandas/core/internals.py | 3 +-- pandas/tests/test_frame.py | 30 ++++++++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 10 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index ea2aea1ef7e6b..fd2863735d0bf 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -514,11 +514,11 @@ def intersection(*seqs): result &= seq return type(seqs[0])(list(result)) -def _asarray_tuplesafe(values): - if not isinstance(values, (list, np.ndarray)): +def _asarray_tuplesafe(values, dtype=None): + if not isinstance(values, (list, tuple, np.ndarray)): values = list(values) - result = np.asarray(values) + result = np.asarray(values, dtype=dtype) if issubclass(result.dtype.type, basestring): result = np.asarray(values, dtype=object) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6f5154d66c92..125120d2ecb41 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2219,7 +2219,13 @@ def _join_on(self, other, on, lsuffix, rsuffix): if len(other.index) == 0: return self - new_data = self._data.join_on(other._data, self[on], axis=1, + if isinstance(on, (list, tuple)): + join_key = zip(*[self[k] for k in on]) + join_key = common._asarray_tuplesafe(join_key, dtype=object) + else: + join_key = np.asarray(self[on]) + + new_data = self._data.join_on(other._data, join_key, axis=1, lsuffix=lsuffix, rsuffix=rsuffix) return self._constructor(new_data) diff --git a/pandas/core/index.py b/pandas/core/index.py index 672736956c8d0..8335ec0429b43 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -49,10 +49,13 @@ def __new__(cls, data, dtype=None, copy=False, name=None): 'of some kind, %s was passed' % repr(data)) else: # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - subarr = np.empty(len(data), dtype=object) - subarr[:] = data + subarr = _asarray_tuplesafe(data, dtype=object) + + # if not isinstance(data, (list, tuple)): + # data = list(data) + + # subarr = np.empty(len(data), dtype=object) + # subarr[:] = data subarr = subarr.view(cls) subarr.name = name diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5f1d9c5be5035..a420cc1e615ff 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -732,8 +732,7 @@ def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None): this, other = self._maybe_rename_join(other, lsuffix, rsuffix) other_axis = other.axes[axis] - indexer = lib.merge_indexer_object(on.astype(object), - other_axis.indexMap) + indexer = other_axis.get_indexer(on) # TODO: deal with length-0 case? or does it fall out? 
mask = indexer == -1 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3f2ebfc6e9ddf..cf64e5a5d377f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2551,6 +2551,36 @@ def test_join_on(self): self.assertRaises(Exception, target.join, source, on='C', how='left') + def test_join_on_multikey(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + + joined = data.join(to_join, on=['key1', 'key2']) + + join_key = Index(zip(key1, key2)) + indexer = to_join.index.get_indexer(join_key) + ex_values = to_join.values.take(indexer, axis=0) + ex_values[indexer == -1] = np.nan + expected = data.join(DataFrame(ex_values, columns=to_join.columns)) + + # TODO: columns aren't in the same order yet + assert_frame_equal(joined, expected.ix[:, joined.columns]) + def test_join_index_mixed(self): df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, From 08a98fd5aae48d404496b571e0ffe55228f7d11c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Oct 2011 22:55:24 -0400 Subject: [PATCH 010/161] ENH: working on possibly faster xs function. new Cython put functions --- pandas/core/internals.py | 36 +- pandas/src/generate_code.py | 86 ++- pandas/src/generated.pyx | 1280 +++++++++++++++++++++++------------ pandas/src/reindex.pyx | 1 + 4 files changed, 940 insertions(+), 463 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a420cc1e615ff..6d29ae56dba56 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -485,8 +485,9 @@ def fast_2d_xs(self, loc, copy=False): raise Exception('cannot get view of mixed-type or ' 'non-consolidated DataFrame') - items = self.items dtype = _interleaved_dtype(self.blocks) + + items = self.items n = len(items) result = np.empty(n, dtype=dtype) for blk in self.blocks: @@ -497,6 +498,37 @@ def fast_2d_xs(self, loc, copy=False): return result + # def fast_2d_xs2(self, loc, copy=False): + # """ + + # """ + # if len(self.blocks) == 1: + # result = self.blocks[0].values[:, loc] + # if copy: + # result = result.copy() + # return result + + # if not copy: + # raise Exception('cannot get view of mixed-type or ' + # 'non-consolidated DataFrame') + + # def _get_put_function(source_dtype, out_dtype): + # src = source_dtype.name + # dst = out_dtype.name + # return getattr(lib, 'put2d_%s_%s' % (src, dst)) + + # out_dtype = np.dtype(_interleaved_dtype(self.blocks)) + + # items = self.items + # n = len(items) + # out = np.empty(n, dtype=out_dtype) + # for blk in self.blocks: + # values = blk.values + # indexer = lib.merge_indexer_object(blk.items, items.indexMap) + # putf = _get_put_function(values.dtype, out_dtype) + # putf(values, indexer, loc, out) + # return out + def consolidate(self): """ Join together blocks having same dtype @@ -904,7 +936,7 @@ def _interleaved_dtype(blocks): elif have_bool: return np.bool_ elif have_int and not have_float: - return np.int_ + return np.int64 else: return np.float64 diff --git 
a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 381a040e12070..48d17b75bcd2a 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -558,6 +558,45 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, """ +#---------------------------------------------------------------------- +# Fast "put" logic for speeding up interleaving logic + +put2d_template = """ +def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, + ndarray[int32_t] indexer, Py_ssize_t loc, + ndarray[%(dest_type2)s] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] +""" + +def generate_put_functions(): + function_list = [ + ('float64', 'float64_t', 'object'), + ('float64', 'float64_t', 'float64_t'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'int64_t'), + ('int32', 'int32_t', 'float64_t'), + ('int32', 'int32_t', 'object'), + ('int64', 'int64_t', 'int64_t'), + ('int64', 'int64_t', 'float64_t'), + ('int64', 'int64_t', 'object'), + ('bool', 'uint8_t', 'uint8_t'), + ('bool', 'uint8_t', 'object') + ] + + output = StringIO() + for name, c_type, dest_type in function_list: + func = put2d_template % {'name' : name, 'c_type' : c_type, + 'dest_type' : dest_type.replace('_t', ''), + 'dest_type2' : dest_type} + output.write(func) + return output.getvalue() + # name, ctype, capable of holding NA function_list = [ ('float64', 'float64_t', 'np.float64', True), @@ -567,10 +606,10 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, ('bool', 'uint8_t', 'np.bool', False) ] -def generate_from_template(template, ndim=1, subset=None): +def generate_from_template(template, ndim=1, exclude=None): output = StringIO() for name, c_type, dtype, can_hold_na in function_list: - if subset is not None and name not in subset: + if exclude is not None and name in exclude: continue if ndim == 1: @@ -582,25 +621,34 @@ def generate_from_template(template, ndim=1, subset=None): output.write(func) return output.getvalue() +templates_1d = [map_indices_template, + merge_indexer_template, + pad_template, + backfill_template, + take_1d_template, + is_monotonic_template, + groupby_template, + arrmap_template] + +nobool_1d_templates = [left_join_template, + outer_join_template, + inner_join_template] + +templates_2d = [take_2d_axis0_template, + take_2d_axis1_template] + def generate_take_cython_file(path='generated.pyx'): with open(path, 'w') as f: - print >> f, generate_from_template(map_indices_template) - print >> f, generate_from_template(merge_indexer_template) - print >> f, generate_from_template(pad_template) - print >> f, generate_from_template(backfill_template) - print >> f, generate_from_template(take_1d_template) - print >> f, generate_from_template(take_2d_axis0_template, ndim=2) - print >> f, generate_from_template(take_2d_axis1_template, ndim=2) - print >> f, generate_from_template(is_monotonic_template) - print >> f, generate_from_template(groupby_template) - print >> f, generate_from_template(arrmap_template) - - print >> f, generate_from_template(left_join_template, - subset=['object', 'int64']) - print >> f, generate_from_template(outer_join_template, - subset=['object', 'int64']) - print >> f, generate_from_template(inner_join_template, - subset=['object', 'int64']) + for template in templates_1d: + print >> f, generate_from_template(template) + + for template in templates_2d: + print >> f, generate_from_template(template, ndim=2) + + for template in nobool_1d_templates: + print >> f, 
generate_from_template(template, exclude=['bool']) + + # print >> f, generate_put_functions() if __name__ == '__main__': generate_take_cython_file() diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 8608064100338..2bdf65f0952b4 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -833,430 +833,158 @@ def take_1d_bool(ndarray[uint8_t] values, ndarray[int32_t] indexer, outbuf[i] = values[idx] -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + float64_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - outbuf[i, j] = NaN - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_object(ndarray[object, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + object prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - outbuf[i, j] = NaN - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + int32_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + int64_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = 
arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + uint8_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) +def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): + cdef dict result = {} + cdef ndarray[int8_t] mask + cdef int i, length + cdef list members + cdef object idx, key - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + length = len(index) + mask = isnullobj(labels) - for j from 0 <= j < k: - idx = indexer[j] + for i from 0 <= i < length: + if mask[i]: + continue - if idx == -1: - for i from 0 <= i < n: - outbuf[i, j] = NaN + key = labels[i] + idx = index[i] + if key in result: + members = result[key] + members.append(idx) else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] + result[key] = [idx] + + return result @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_object(ndarray[object, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - - n = len(values) - k = len(indexer) +def groupby_object(ndarray[object] index, ndarray[object] labels): + cdef dict result = {} + cdef ndarray[int8_t] mask + cdef int i, length + cdef list members + cdef object idx, key - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + length = len(index) + mask = isnullobj(labels) - for j from 0 <= j < k: - idx = indexer[j] + for i from 0 <= i < length: + if mask[i]: + continue - if idx == -1: - for i from 0 <= i < n: - outbuf[i, j] = NaN + key = labels[i] + idx = index[i] + if key in result: + members = result[key] + members.append(idx) else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, 
ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr): - cdef: - Py_ssize_t i, n - float64_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr): - cdef: - Py_ssize_t i, n - object prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr): - cdef: - Py_ssize_t i, n - int32_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr): - cdef: - Py_ssize_t i, n - int64_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr): - cdef: - Py_ssize_t i, n - uint8_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): - cdef dict result = {} - cdef ndarray[int8_t] mask - cdef int i, length - cdef list members - cdef object idx, key - - length = len(index) - mask = isnullobj(labels) - - for i from 0 <= i < length: - if mask[i]: - continue - - key = labels[i] - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_object(ndarray[object] index, ndarray[object] labels): - cdef dict result = {} - cdef ndarray[int8_t] mask - cdef int i, length - cdef list members - cdef object idx, key - - length = len(index) - mask = isnullobj(labels) - - for i from 0 <= i < length: - if mask[i]: - continue - - key = labels[i] - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result + result[key] = [idx] + + return result @cython.wraparound(False) 
@cython.boundscheck(False) @@ -1400,97 +1128,639 @@ def arrmap_bool(ndarray[uint8_t] index, object func): for i from 0 <= i < length: result[i] = func(index[i]) - return result + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = NaN + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object(ndarray[object, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = NaN + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = NaN + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
take_2d_axis1_object(ndarray[object, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = NaN + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, 
nleft, nright + ndarray[int32_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 + break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 + + count += 1 + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + + count += 1 + return result, lindexer, rindexer @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int32_t] indexer + Py_ssize_t i, j, nright, nleft, count object lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[object] result - i = 0 - j = 0 nleft = len(left) nright = len(right) - indexer = np.empty(nleft, dtype=np.int32) + i = 0 + j = 0 + count = 0 while True: if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 - if j == nright: - 
indexer[i] = -1 - i += 1 - continue - - lval = left[i] - rval = right[j] + count += 1 - if lval == right[j]: - indexer[i] = j - i += 1 - j += 1 - elif lval > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=object) -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int32_t] indexer - int64_t lval, rval + # do it again, but populate the indexers / result i = 0 j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int32) + count = 0 while True: if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - lval = left[i] - rval = right[j] - - if lval == right[j]: - indexer[i] = j - i += 1 - j += 1 - elif lval > rval: - indexer[i] = -1 - j += 1 else: - indexer[i] = -1 - i += 1 - return indexer + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + count += 1 + + return result, lindexer, rindexer @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - object lval, rval + int32_t lval, rval ndarray[int32_t] lindexer, rindexer - ndarray[object] result + ndarray[int32_t] result nleft = len(left) nright = len(right) @@ -1526,7 +1796,7 @@ def outer_join_indexer_object(ndarray[object] left, lindexer = np.empty(count, dtype=np.int32) rindexer = np.empty(count, dtype=np.int32) - result = np.empty(count, dtype=object) + result = np.empty(count, dtype=np.int32) # do it again, but populate the indexers / result @@ -1675,6 +1945,69 @@ def outer_join_indexer_int64(ndarray[int64_t] left, return result, lindexer, rindexer +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm? 
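+
+    First pass over the two arrays (assumed monotonic) counts the number
+    of matching values so the output and indexer arrays can be allocated
+    at their exact final size; the second pass fills them in.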
+ ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer_object(ndarray[object] left, @@ -1738,6 +2071,69 @@ def inner_join_indexer_object(ndarray[object] left, return result, lindexer, rindexer +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm? + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer_int64(ndarray[int64_t] left, diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx index f1b5a91c2336e..97e20f3911965 100644 --- a/pandas/src/reindex.pyx +++ b/pandas/src/reindex.pyx @@ -224,3 +224,4 @@ def take_join_contiguous(ndarray[float64_t, ndim=2] lvalues, for j from 0 <= j < rk: outbuf[0] = rvalues[ridx, j] outbuf = outbuf + 1 + From 2958bd6d0045c64fe27c7d9d05b639d012ba7ebf Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Oct 2011 23:24:00 -0400 Subject: [PATCH 011/161] ENH: faster_xs script --- scripts/faster_xs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 scripts/faster_xs.py diff --git a/scripts/faster_xs.py b/scripts/faster_xs.py new file mode 100644 index 0000000000000..a539642b78185 --- /dev/null +++ b/scripts/faster_xs.py @@ -0,0 +1,16 @@ +import numpy as np + +import pandas.util.testing as tm + +from pandas.core.internals import _interleaved_dtype + +df = tm.makeDataFrame() + +df['E'] = 'foo' +df['F'] = 'foo' +df['G'] = 2 +df['H'] = df['A'] > 0 + +blocks = df._data.blocks +items = df.columns + From 99a4400c0fff967dd2020a0ccfaa529726045660 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Oct 2011 10:40:08 -0400 Subject: [PATCH 012/161] BUG: kludge around 
series[:, np.newaxis] for MPL compat, GH #224

---
 pandas/core/series.py       | 5 ++++-
 pandas/tests/test_series.py | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index ebab919a7f08c..15e846d7d3466 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -288,7 +288,10 @@ def _index_with(indexer):
         # [slice(0, 5, None)] will break if you convert to ndarray,
         # e.g. as requested by np.median

-        return _index_with(key)
+        try:
+            return _index_with(key)
+        except Exception:
+            return self.values[key]

     def _multilevel_index(self, key):
         values = self.values
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 5ad68539f5453..ac24817017f2d 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -1288,6 +1288,11 @@ def test_first_last_valid(self):
         self.assert_(ser.last_valid_index() is None)
         self.assert_(ser.first_valid_index() is None)

+    def test_mpl_compat_hack(self):
+        result = self.ts[:, np.newaxis]
+        expected = self.ts.values[:, np.newaxis]
+        assert_almost_equal(result, expected)
+
 #-------------------------------------------------------------------------------
 # GroupBy

From 0cc5616ad9c2404f6219eb306240c3b8c8a91912 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Wed, 12 Oct 2011 14:50:47 -0400
Subject: [PATCH 013/161] ENH: parser API changes, added parse_dates options,
 address GH #225, #226

---
 RELEASE.rst                     |  21 ++++
 pandas/core/common.py           |   3 +
 pandas/core/frame.py            |  10 +-
 pandas/io/parsers.py            | 188 ++++++++++++++++++--------------
 pandas/io/tests/test_parsers.py |  41 ++++---
 pandas/src/tseries.pyx          |  17 +++
 pandas/tests/test_frame.py      |   5 +
 7 files changed, 182 insertions(+), 103 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 6bc55b030be19..2d672b980cefc 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -10,11 +10,32 @@ pandas 0.4.4

 **Release date:** not yet released

+**New features / modules**
+
+  - Added `parse_dates` option to `read_csv` and `read_table` methods to
+    optionally try to parse dates in the index columns
+  - Added ability to join on multiple columns in `DataFrame.join` (GH #214)
+
+**API Changes**
+
+  - The default `index_col` argument for `read_table`, `read_csv`, and
+    `ExcelFile.parse` is now None. To use one or more of the columns as the
+    resulting DataFrame's index, these must now be specified explicitly
+  - Parsing functions no longer parse dates by default (GH #225)
+
 **Improvements to existing features**

   - Refactored merging / joining code into a tidy class and disabled unnecessary
     computations in the float/object case, thus getting about 10% better
     performance
+  - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about
+    5x, regression from 0.3.0
+
+**Bug fixes**
+
+  - Worked around matplotlib "bug" in which series[:, np.newaxis] fails.
Should + be reported upstream to matplotlib (GH #224) + pandas 0.4.3 ============ diff --git a/pandas/core/common.py b/pandas/core/common.py index fd2863735d0bf..cadda5d432d39 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -518,6 +518,9 @@ def _asarray_tuplesafe(values, dtype=None): if not isinstance(values, (list, tuple, np.ndarray)): values = list(values) + if isinstance(values, list) and dtype == np.object_: + return lib.list_to_object_array(values) + result = np.asarray(values, dtype=dtype) if issubclass(result.dtype.type, basestring): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 125120d2ecb41..29e2cc6e560f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -430,7 +430,8 @@ def to_records(self, index=True): return np.rec.fromarrays(arrays, names=names) @classmethod - def from_csv(cls, path, header=0, delimiter=',', index_col=0): + def from_csv(cls, path, header=0, delimiter=',', index_col=0, + parse_dates=True): """ Read delimited file into DataFrame @@ -447,16 +448,15 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0): Notes ----- Will attempt to convert index to datetimes for time series - data. Use read_csv for more options + data. Use read_table for more options Returns ------- y : DataFrame or DataFrame """ from pandas.io.parsers import read_table - df = read_table(path, header=header, sep=delimiter, - index_col=index_col) - return df + return read_table(path, header=header, sep=delimiter, + parse_dates=parse_dates, index_col=index_col) def to_sparse(self, fill_value=None, kind='block'): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1ec552555036f..e3a30a8621c86 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -9,36 +9,10 @@ from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame -def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, - na_values=None, date_parser=None, names=None): - """ - Read CSV file into DataFrame - Parameters - ---------- - filepath_or_buffer : string or file handle / StringIO - sep : string, default None - Delimiter to use. By default will try to automatically determine - this - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Row numbers to skip (0-indexed) - index_col : int or sequence., default 0 - Column to use as the row labels of the DataFrame. Pass None if there is - no such column. If a sequence is given, a MultiIndex is used. - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - date_parser : function - Function to use for converting dates to strings. 
Defaults to
-        dateutil.parser
-    names : array-like
-        List of column names
-
-    Returns
-    -------
-    parsed : DataFrame
-    """
+def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
+             skiprows=None, na_values=None, parse_dates=False,
+             date_parser=None):
     import csv

     if hasattr(filepath_or_buffer, 'read'):
@@ -71,43 +45,77 @@
     else:
         lines = [l for l in reader]
         f.close()
-    return _simple_parser(lines, header=header, indexCol=index_col,
-                          colNames=names, na_values=na_values,
-                          date_parser=date_parser)

-def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None,
-               index_col=0, na_values=None, date_parser=None, names=None):
-    """
-    Read delimited file into DataFrame
+    if date_parser is not None:
+        parse_dates = True

-    Parameters
-    ----------
-    filepath_or_buffer : string or file handle
-    sep : string, default '\t'
-        Delimiter to use
-    header : int, default 0
-        Row to use for the column labels of the parsed DataFrame
-    skiprows : list-like
-        Row numbers to skip (0-indexed)
-    index_col : int or sequence, default 0
-        Column to use as the row labels of the DataFrame. Pass None if there is
-        no such column. If a sequence is given, a MultiIndex is used.
-    na_values : list-like, default None
-        List of additional strings to recognize as NA/NaN
-    date_parser : function
-        Function to use for converting dates to strings. Defaults to
-        dateutil.parser
-    names : array-like
-        List of column names
-
-    Returns
-    -------
-    parsed : DataFrame
-    """
-    return read_csv(filepath_or_buffer, sep, header, skiprows,
-                    index_col, na_values, date_parser, names)
+    return _simple_parser(lines,
+                          header=header,
+                          index_col=index_col,
+                          colNames=names,
+                          na_values=na_values,
+                          parse_dates=parse_dates,
+                          date_parser=date_parser)
+
+
+def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
+               names=None, skiprows=None, na_values=None, parse_dates=False,
+               date_parser=None):
+    return read_csv(filepath_or_buffer, sep=sep, header=header,
+                    skiprows=skiprows, index_col=index_col,
+                    na_values=na_values, date_parser=date_parser,
+                    names=names, parse_dates=parse_dates)
+
+_parser_params = """Parameters
+----------
+filepath_or_buffer : string or file handle / StringIO
+%s
+header : int, default 0
+    Row to use for the column labels of the parsed DataFrame
+skiprows : list-like
+    Row numbers to skip (0-indexed)
+index_col : int or sequence, default None
+    Column to use as the row labels of the DataFrame. If a sequence is
+    given, a MultiIndex is used.
+na_values : list-like, default None
+    List of additional strings to recognize as NA/NaN
+parse_dates : boolean, default False
+    Attempt to parse dates in the index column(s)
+date_parser : function
+    Function to use for converting date strings to datetime objects.
+    Defaults to dateutil.parser
+names : array-like
+    List of column names"""
+
+_csv_sep = """sep : string, default None
+    Delimiter to use.
By default will try to automatically determine + this""" + +_table_sep = """sep : string, default \\t (tab-stop) + Delimiter to use""" + +read_csv.__doc__ = """ +Read CSV (comma-separated) file into DataFrame + +%s + +Returns +------- +parsed : DataFrame +""" % (_parser_params % _csv_sep) + +read_table.__doc__ = """ +Read delimited file into DataFrame + +%s + +Returns +------- +parsed : DataFrame +""" % (_parser_params % _table_sep) + + +def _simple_parser(lines, colNames=None, header=0, index_col=0, na_values=None, date_parser=None, parse_dates=True): """ Workhorse function for processing nested list into DataFrame @@ -142,30 +150,48 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0, zipped_content = zip(*content) if len(content) == 0: # pragma: no cover - raise Exception('No content to parse') + if index_col is not None: + if np.isscalar(index_col): + index = Index([], name=columns.pop(index_col)) + else: + cp_cols = list(columns) + names = [] + for i in index_col: + name = cp_cols[i] + columns.remove(name) + names.append(name) + index = MultiIndex.fromarrays([[]] * len(index_col), + names=names) + else: + index = Index([]) + + return DataFrame(index=index, columns=columns) + + if index_col is None and len(content[0]) == len(columns) + 1: + index_col = 0 # no index column specified, so infer that's what is wanted - if indexCol is not None: - if np.isscalar(indexCol): - if indexCol == 0 and len(content[0]) == len(columns) + 1: + if index_col is not None: + if np.isscalar(index_col): + if index_col == 0 and len(content[0]) == len(columns) + 1: index = zipped_content[0] zipped_content = zipped_content[1:] else: - index = zipped_content.pop(indexCol) - columns.pop(indexCol) + index = zipped_content.pop(index_col) + columns.pop(index_col) else: # given a list of index idx_names = [] index = [] - for idx in indexCol: + for idx in index_col: idx_names.append(columns[idx]) index.append(zipped_content[idx]) #remove index items from content and columns, don't pop in loop - for i in range(len(indexCol)): + for i in range(len(index_col)): columns.remove(idx_names[i]) zipped_content.remove(index[i]) - if np.isscalar(indexCol): + if np.isscalar(index_col): if parse_dates: index = _try_parse_dates(index, parser=date_parser) index = Index(_maybe_convert_int(np.array(index, dtype=object))) @@ -232,9 +258,6 @@ def _maybe_convert_int(arr): return arr def _maybe_convert_int_mindex(index, parse_dates, date_parser): - if len(index) == 0: - return index - for i in range(len(index)): try: int(index[i][0]) @@ -298,8 +321,8 @@ def __init__(self, path): def __repr__(self): return object.__repr__(self) - def parse(self, sheetname, header=0, skiprows=None, index_col=0, - na_values=None): + def parse(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None): """ Read Excel table into DataFrame @@ -348,7 +371,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0, value = datetime(*dt) row.append(value) data.append(row) - return _simple_parser(data, header=header, indexCol=index_col, + return _simple_parser(data, header=header, index_col=index_col, + parse_dates=parse_dates, date_parser=date_parser, na_values=na_values) #------------------------------------------------------------------------------- @@ -363,7 +387,8 @@ def parseCSV(filepath, header=0, skiprows=None, indexCol=0, """ warnings.warn("parseCSV is deprecated. 
Use read_csv instead", FutureWarning) return read_csv(filepath, header=header, skiprows=skiprows, - index_col=indexCol, na_values=na_values) + index_col=indexCol, na_values=na_values, + parse_dates=True) def parseText(filepath, sep='\t', header=0, indexCol=0, colNames=None): # pragma: no cover @@ -374,7 +399,7 @@ def parseText(filepath, sep='\t', header=0, warnings.warn("parseText is deprecated. Use read_table instead", FutureWarning) return read_table(filepath, sep=sep, header=header, index_col=indexCol, - names=colNames) + names=colNames, parse_dates=True) def parseExcel(filepath, header=None, indexCol=0, @@ -385,6 +410,7 @@ def parseExcel(filepath, header=None, indexCol=0, warnings.warn("parseExcel is deprecated. Use the ExcelFile class instead", FutureWarning) excel_file = ExcelFile(filepath) - return excel_file.parse(sheetname, header=header, index_col=indexCol) + return excel_file.parse(sheetname, header=header, index_col=indexCol, + parse_dates=True) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index e6cfce6f32cb3..f4049bf3adcbe 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -34,12 +34,11 @@ def test_custom_na_values(self): [nan, 5, nan], [7, 8, nan]] - df = read_csv(StringIO(data), index_col=None, na_values=['baz'], - skiprows=[1]) + df = read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) assert_almost_equal(df.values, expected) - df2 = read_table(StringIO(data), sep=',', index_col=None, - na_values=['baz'], skiprows=[1]) + df2 = read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) assert_almost_equal(df2.values, expected) def test_unnamed_columns(self): @@ -51,7 +50,7 @@ def test_unnamed_columns(self): expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] - df = read_table(StringIO(data), sep=',', index_col=None) + df = read_table(StringIO(data), sep=',') assert_almost_equal(df.values, expected) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'Unnamed: 3', @@ -84,7 +83,7 @@ def test_csv_custom_parser(self): """ df = read_csv(StringIO(data), date_parser=lambda x: datetime.strptime(x, '%Y%m%d')) - expected = read_csv(StringIO(data)) + expected = read_csv(StringIO(data), parse_dates=True) assert_frame_equal(df, expected) def test_no_header(self): @@ -92,11 +91,9 @@ def test_no_header(self): 6,7,8,9,10 11,12,13,14,15 """ - df = read_table(StringIO(data), sep=',', index_col=None, - header=None) + df = read_table(StringIO(data), sep=',', header=None) names = ['foo', 'bar', 'baz', 'quux', 'panda'] - df2 = read_table(StringIO(data), sep=',', index_col=None, - header=None, names=names) + df2 = read_table(StringIO(data), sep=',', header=None, names=names) expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] @@ -106,16 +103,16 @@ def test_no_header(self): self.assert_(np.array_equal(df2.columns, names)) def test_read_csv_dataframe(self): - df = read_csv(self.csv1) - df2 = read_table(self.csv1, sep=',') + df = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.values.dtype == np.float64) assert_frame_equal(df, df2) def test_read_csv_no_index_name(self): - df = read_csv(self.csv2) - df2 = read_table(self.csv2, sep=',') + df = read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, 
['A', 'B', 'C', 'D', 'E'])) self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64) @@ -129,12 +126,22 @@ def test_excel_table(self): pth = os.path.join(self.dirpath, 'test.xls') xls = ExcelFile(pth) - df = xls.parse('Sheet1') - df2 = read_csv(self.csv1) - df3 = xls.parse('Sheet2', skiprows=[1]) + df = xls.parse('Sheet1', index_col=0, parse_dates=True) + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) assert_frame_equal(df, df2) assert_frame_equal(df3, df2) + def test_read_table_wrong_num_columns(self): + data = """A,B,C,D,E,F +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + self.assertRaises(Exception, read_csv, StringIO(data)) + + + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index ed65a9b39e660..50a8a8cf1e752 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -257,6 +257,23 @@ def isnullobj(ndarray input): return result +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + include "skiplist.pyx" include "groupby.pyx" include "moments.pyx" diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cf64e5a5d377f..79fa30e22885c 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1589,6 +1589,11 @@ def test_to_csv_multiindex(self): os.remove(path) + # empty + tsframe[:0].to_csv(path) + recons = DataFrame.from_csv(path) + assert_frame_equal(recons, tsframe[:0]) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan From 5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Oct 2011 15:10:33 -0400 Subject: [PATCH 014/161] ENH: add explicit duplicate check when creating an index in parsing functions, further address concerns raised in GH #226 --- RELEASE.rst | 5 +++++ pandas/core/frame.py | 2 +- pandas/core/index.py | 12 ++++++++++++ pandas/io/parsers.py | 9 +++++++-- pandas/io/tests/test_parsers.py | 11 ++++++++++- pandas/tests/test_index.py | 2 +- 6 files changed, 36 insertions(+), 5 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 2d672b980cefc..18e2bfff89a19 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -15,6 +15,8 @@ pandas 0.4.4 - Added `parse_dates` option to `read_csv` and `read_table` methods to optionally try to parse dates in the index columns - Added ability to join on multiple columns in `DataFrame.join` (GH #214) + - Added private `_get_duplicates` function to `Index` for identifying + duplicate values more easily **API Changes** @@ -25,6 +27,9 @@ pandas 0.4.4 **Improvements to existing features** + - File parsing functions like `read_csv` and `read_table` will explicitly + check if a parsed index has duplicates and raise a more helpful exception + rather than deferring the check until later - Refactored merging / joining code into a tidy class and disabled unnecessary computations in the float/object case, thus getting about 10% better performance diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 29e2cc6e560f0..8297ffabf5222 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2125,7 +2125,7 @@ def append(self, 
other, ignore_index=False): new_index = None else: new_index = self.index.append(other.index) - new_index._verify_integrity() + assert(new_index._verify_integrity()) if self.columns.equals(other.columns): return self._append_same_columns(other, new_index) diff --git a/pandas/core/index.py b/pandas/core/index.py index 8335ec0429b43..61bd1ada54b59 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -99,8 +99,20 @@ def indexMap(self): return self._indexMap def _verify_integrity(self): + if self._indexMap is None: + try: + self.indexMap + except Exception: + return False return len(self.indexMap) == len(self) + def _get_duplicates(self): + from collections import defaultdict + counter = defaultdict(lambda: 0) + for k in self.values: + counter[k] += 1 + return sorted(k for k, v in counter.iteritems() if v > 1) + _allDates = None def is_all_dates(self): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e3a30a8621c86..4ec49ce188d75 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -196,12 +196,17 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, index = _try_parse_dates(index, parser=date_parser) index = Index(_maybe_convert_int(np.array(index, dtype=object))) else: - index = MultiIndex.from_arrays(_maybe_convert_int_mindex(index, - parse_dates, date_parser), + arrays = _maybe_convert_int_mindex(index, parse_dates, + date_parser) + index = MultiIndex.from_arrays(arrays, names=idx_names) else: index = Index(np.arange(len(content))) + if not index._verify_integrity(): + dups = index._get_duplicates() + raise Exception('Index has duplicates: %s' % str(dups)) + if len(columns) != len(zipped_content): raise Exception('wrong number of columns') diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index f4049bf3adcbe..5f71f793f2384 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -140,7 +140,16 @@ def test_read_table_wrong_num_columns(self): """ self.assertRaises(Exception, read_csv, StringIO(data)) - + def test_read_table_duplicate_index(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + self.assertRaises(Exception, read_csv, StringIO(data), index_col=0) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7a37780b01984..13b08a25aff79 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -33,7 +33,7 @@ def test_deepcopy(self): def test_duplicates(self): idx = Index([0, 0, 0]) - self.assertRaises(Exception, idx._verify_integrity) + self.assert_(not idx._verify_integrity()) def test_sort(self): self.assertRaises(Exception, self.strIndex.sort) From fbcbefdc9fad999b212754fd0ac7705d6c076c27 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Oct 2011 15:23:37 -0400 Subject: [PATCH 015/161] ENH: hugely sped up MultiIndex -> Index of tuples conversion using small Cython function. 
Multi-key joining ~10x faster now --- pandas/core/common.py | 2 +- pandas/core/frame.py | 3 ++- scripts/bench_join_multi.py | 54 +++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 scripts/bench_join_multi.py diff --git a/pandas/core/common.py b/pandas/core/common.py index cadda5d432d39..715c91a990ab7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -518,7 +518,7 @@ def _asarray_tuplesafe(values, dtype=None): if not isinstance(values, (list, tuple, np.ndarray)): values = list(values) - if isinstance(values, list) and dtype == np.object_: + if isinstance(values, list) and dtype in [np.object_, object]: return lib.list_to_object_array(values) result = np.asarray(values, dtype=dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8297ffabf5222..abf7b2f3ea321 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2221,7 +2221,8 @@ def _join_on(self, other, on, lsuffix, rsuffix): if isinstance(on, (list, tuple)): join_key = zip(*[self[k] for k in on]) - join_key = common._asarray_tuplesafe(join_key, dtype=object) + join_key = common._asarray_tuplesafe(join_key, + dtype=np.object_) else: join_key = np.asarray(self[on]) diff --git a/scripts/bench_join_multi.py b/scripts/bench_join_multi.py new file mode 100644 index 0000000000000..c591ca7d833c6 --- /dev/null +++ b/scripts/bench_join_multi.py @@ -0,0 +1,54 @@ +from pandas import * + +import numpy as np +from itertools import izip +from pandas.util.testing import rands +import pandas._tseries as lib + +N = 100000 + +key1 = [rands(10) for _ in xrange(N)] +key2 = [rands(10) for _ in xrange(N)] + +zipped = izip(key1, key2) + +def _zip(*args): + arr = np.empty(N, dtype=object) + arr[:] = zip(*args) + return arr + +def _zip2(*args): + return lib.list_to_object_array(zip(*args)) + +index = MultiIndex.from_arrays([key1, key2]) +to_join = DataFrame({'j1' : np.random.randn(100000)}, index=index) + +data = DataFrame({'A' : np.random.randn(500000), + 'key1' : np.repeat(key1, 5), + 'key2' : np.repeat(key2, 5)}) + +# data.join(to_join, on=['key1', 'key2']) + +""" +Cython function for list_to_object_array + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr +""" + + + From ba35f62e664ceebf7069dd9c0ec1aff045edbf09 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Oct 2011 18:10:31 -0400 Subject: [PATCH 016/161] RLS: release notes --- RELEASE.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 18e2bfff89a19..d4a328f7d185e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -12,6 +12,7 @@ pandas 0.4.4 **New features / modules** + - Added `DataFrame.align` method with standard join options - Added `parse_dates` option to `read_csv` and `read_table` methods to optionally try to parse dates in the index columns - Added ability to join on multiple columns in `DataFrame.join` (GH #214) @@ -35,6 +36,8 @@ pandas 0.4.4 performance - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about 5x, regression from 0.3.0 + - With new `DataFrame.align` method, speeding up binary operations between + differently-indexed DataFrame objects by 10-25%. 
**Bug fixes**

From 4ea44a47538dbc35aef5df80e3e5f57191606710 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Wed, 12 Oct 2011 18:11:44 -0400
Subject: [PATCH 017/161] DOC: increase minor version number to 0.5

---
 RELEASE.rst | 16 ++++++++--------
 setup.py    |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index d4a328f7d185e..052400ea5db6d 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -5,11 +5,18 @@ Release Notes
 This is the list of changes to pandas between each release. For full details,
 see the commit logs at http://github.com/wesm/pandas

-pandas 0.4.4
+pandas 0.5.0
 ============

 **Release date:** not yet released

+**API Changes**
+
+  - The default `index_col` argument for `read_table`, `read_csv`, and
+    `ExcelFile.parse` is now None. To use one or more of the columns as the
+    resulting DataFrame's index, these must now be specified explicitly
+  - Parsing functions no longer parse dates by default (GH #225)
+
 **New features / modules**

   - Added `DataFrame.align` method with standard join options
@@ -19,13 +26,6 @@ pandas 0.4.4
   - Added private `_get_duplicates` function to `Index` for identifying
     duplicate values more easily

-**API Changes**
-
-  - The default `index_col` argument for `read_table`, `read_csv`, and
-    `ExcelFile.parse` is now None. To use one or more of the columns as the
-    resulting DataFrame's index, these must now be specified explicitly
-  - Parsing functions no longer parse dates by default (GH #225)
-
 **Improvements to existing features**

diff --git a/setup.py b/setup.py
index 5cee226f296ff..633be5ebc96fa 100755
--- a/setup.py
+++ b/setup.py
@@ -128,8 +128,8 @@
 ]

 MAJOR = 0
-MINOR = 4
-MICRO = 4
+MINOR = 5
+MICRO = 0
 ISRELEASED = False
 VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)

 FULLVERSION = VERSION

From ee46b0642b71a3bd3412846c35ab836cebfe48d4 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver
Date: Wed, 12 Oct 2011 21:29:23 +0100
Subject: [PATCH 018/161] Update docs on reading CSV files.

---
 doc/source/io.rst | 86 +++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 48 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 6e92a5ec166d9..6d52a7e1325a9 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -19,40 +19,55 @@
 IO Tools (Text, CSV, HDF5, ...)
 *******************************

-Text files
-----------
+CSV & Text files
+----------------

-The two workhorse functions for reading text (a.k.a. flat) files are
-``read_csv`` and ``read_table``. They both utilize the same parsing code for
-intelligently converting tabular data into a DataFrame object. They take a
-number of different arguments:
+The two workhorse functions for reading text files (a.k.a. flat files) are
+:func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table`.
+They both utilize the same parsing code for intelligently converting tabular
+data into a DataFrame object. They take a number of different arguments:

-  - ``path_or_buffer``: Either a string path to a file or any object (such as
-    an open ``file`` or ``StringIO``) with a ``read`` method.
+  - ``path_or_buffer``: Either a string path to a file or any object with a
+    ``read`` method (such as an open file or ``StringIO``).
   - ``delimiter``: For ``read_table`` only, a regular expression to split
-    fields on. ``read_csv`` uses the ``csv`` module to do this and hence only
-    supports comma-separated values
-  - ``skiprows``: Rows in the file to skip
-  - ``header``: row number to use as the columns, defaults to 0 (first row)
-  - ``index_col``: integer, defaulting to 0 (the first column), instructing the
-    parser to use a particular column as the ``index`` (row labels) of the
-    resulting DataFrame
-  - ``na_values``: optional list of strings to recognize as NA/NaN
+    fields on. ``read_csv`` uses the ``csv`` module to do this and hence only
+    supports comma-separated values.
+  - ``header``: row number to use as the column names, and the start of the data.
+    Defaults to 0 (first row); specify None if there is no header row.
+  - ``names``: List of column names to use if header is None.
+  - ``skiprows``: A collection of numbers for rows in the file to skip.
+  - ``index_col``: column number, or list of column numbers, to use as the
+    ``index`` (row labels) of the resulting DataFrame. By default, it will number
+    the rows without using any column, unless there is one more data column than
+    there are headers, in which case the first column is taken as the index.
+  - ``parse_dates``: If True, attempt to parse the index column as dates. False
+    by default.
   - ``date_parser``: function to use to parse strings into datetime
-    objects. Defaults to the very robust ``dateutil.parser``
-  - ``names``: optional list of column names for the data. Otherwise will be
-    read from the file
+    objects. If ``parse_dates`` is True, it defaults to the very robust
+    ``dateutil.parser``. Specifying this implicitly sets ``parse_dates`` as True.
+  - ``na_values``: optional list of strings to recognize as NaN (missing values),
+    in addition to a default set.
+
 .. code-block:: ipython

-   In [2]: print open('foo.csv').read()
-   A,B,C
+   In [1]: print open('foo.csv').read()
+   date,A,B,C
    20090101,a,1,2
    20090102,b,3,4
    20090103,c,4,5
+
+   # A basic index is created by default:
+   In [2]: read_csv('foo.csv')
+   Out[2]:
+      date      A  B  C
+   0  20090101  a  1  2
+   1  20090102  b  3  4
+   2  20090103  c  4  5

-   In [3]: df = read_csv('foo.csv')
-
+   # Use a column as an index, and parse it as dates.
+   In [3]: df = read_csv('foo.csv', index_col=0, parse_dates=True)
+
    In [4]: df
    Out[4]:
                A  B  C
    2009-01-01  a  1  2
    2009-01-02  b  3  4
    2009-01-03  c  4  5

-   # dates parsed to datetime
+   # These are python datetime objects
    In [16]: df.index
    Out[16]: Index([2009-01-01 00:00:00, 2009-01-02 00:00:00,
    2009-01-03 00:00:00], dtype=object)

-If ``index_col=None``, the index will be a generic ``0...nrows-1``:
-
-.. code-block:: ipython
-
-   In [1]: print open('foo.csv').read()
-   index,A,B,C
-   20090101,a,1,2
-   20090102,b,3,4
-   20090103,c,4,5
-
-   In [2]: read_csv('foo.csv')
-   Out[2]:
-               A  B  C
-   2009-01-01  a  1  2
-   2009-01-02  b  3  4
-   2009-01-03  c  4  5
-
-
-   In [3]: read_csv('foo.csv', index_col=None)
-   Out[3]:
-      index     A  B  C
-   0  20090101  a  1  2
-   1  20090102  b  3  4
-   2  20090103  c  4  5
-
 The parsers make every attempt to "do the right thing" and not be very
 fragile. Type inference is a pretty big deal. So if a column can be coerced to

From ac8083725f0d90c5b5d68aabc4cbf139c479b8b5 Mon Sep 17 00:00:00 2001
From: Thomas Kluyver
Date: Wed, 12 Oct 2011 21:50:55 +0100
Subject: [PATCH 019/161] Remove Sphinx autosummary extensions, which now ship
 with Sphinx.
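
With the bundled numpydoc copies gone, the docs build relies on the
autosummary extension that ships with Sphinx itself (>= 0.6). Roughly, that
means enabling the built-in extension in doc/source/conf.py:

    # Hypothetical excerpt from doc/source/conf.py; the actual extension
    # list used by the pandas docs may differ.
    extensions = [
        'sphinx.ext.autodoc',
        'sphinx.ext.autosummary',  # replaces doc/sphinxext/autosummary.py
    ]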
--- doc/sphinxext/autosummary.py | 349 -------------------------- doc/sphinxext/autosummary_generate.py | 219 ---------------- 2 files changed, 568 deletions(-) delete mode 100755 doc/sphinxext/autosummary.py delete mode 100755 doc/sphinxext/autosummary_generate.py diff --git a/doc/sphinxext/autosummary.py b/doc/sphinxext/autosummary.py deleted file mode 100755 index 2f8a00a3035c5..0000000000000 --- a/doc/sphinxext/autosummary.py +++ /dev/null @@ -1,349 +0,0 @@ -""" -=========== -autosummary -=========== - -Sphinx extension that adds an autosummary:: directive, which can be -used to generate function/method/attribute/etc. summary lists, similar -to those output eg. by Epydoc and other API doc generation tools. - -An :autolink: role is also provided. - -autosummary directive ---------------------- - -The autosummary directive has the form:: - - .. autosummary:: - :nosignatures: - :toctree: generated/ - - module.function_1 - module.function_2 - ... - -and it generates an output table (containing signatures, optionally) - - ======================== ============================================= - module.function_1(args) Summary line from the docstring of function_1 - module.function_2(args) Summary line from the docstring - ... - ======================== ============================================= - -If the :toctree: option is specified, files matching the function names -are inserted to the toctree with the given prefix: - - generated/module.function_1 - generated/module.function_2 - ... - -Note: The file names contain the module:: or currentmodule:: prefixes. - -.. seealso:: autosummary_generate.py - - -autolink role -------------- - -The autolink role functions as ``:obj:`` when the name referred can be -resolved to a Python object, and otherwise it becomes simple emphasis. -This can be used as the default role to make links 'smart'. - -""" -import sys, os, posixpath, re - -from docutils.parsers.rst import directives -from docutils.statemachine import ViewList -from docutils import nodes - -import sphinx.addnodes, sphinx.roles -from sphinx.util import patfilter - -from docscrape_sphinx import get_doc_object - -import warnings -warnings.warn( - "The numpydoc.autosummary extension can also be found as " - "sphinx.ext.autosummary in Sphinx >= 0.6, and the version in " - "Sphinx >= 0.7 is superior to the one in numpydoc. This numpydoc " - "version of autosummary is no longer maintained.", - DeprecationWarning, stacklevel=2) - -def setup(app): - app.add_directive('autosummary', autosummary_directive, True, (0, 0, False), - toctree=directives.unchanged, - nosignatures=directives.flag) - app.add_role('autolink', autolink_role) - - app.add_node(autosummary_toc, - html=(autosummary_toc_visit_html, autosummary_toc_depart_noop), - latex=(autosummary_toc_visit_latex, autosummary_toc_depart_noop)) - app.connect('doctree-read', process_autosummary_toc) - -#------------------------------------------------------------------------------ -# autosummary_toc node -#------------------------------------------------------------------------------ - -class autosummary_toc(nodes.comment): - pass - -def process_autosummary_toc(app, doctree): - """ - Insert items described in autosummary:: to the TOC tree, but do - not generate the toctree:: list. 
- - """ - env = app.builder.env - crawled = {} - def crawl_toc(node, depth=1): - crawled[node] = True - for j, subnode in enumerate(node): - try: - if (isinstance(subnode, autosummary_toc) - and isinstance(subnode[0], sphinx.addnodes.toctree)): - env.note_toctree(env.docname, subnode[0]) - continue - except IndexError: - continue - if not isinstance(subnode, nodes.section): - continue - if subnode not in crawled: - crawl_toc(subnode, depth+1) - crawl_toc(doctree) - -def autosummary_toc_visit_html(self, node): - """Hide autosummary toctree list in HTML output""" - raise nodes.SkipNode - -def autosummary_toc_visit_latex(self, node): - """Show autosummary toctree (= put the referenced pages here) in Latex""" - pass - -def autosummary_toc_depart_noop(self, node): - pass - -#------------------------------------------------------------------------------ -# .. autosummary:: -#------------------------------------------------------------------------------ - -def autosummary_directive(dirname, arguments, options, content, lineno, - content_offset, block_text, state, state_machine): - """ - Pretty table containing short signatures and summaries of functions etc. - - autosummary also generates a (hidden) toctree:: node. - - """ - - names = [] - names += [x.strip().split()[0] for x in content - if x.strip() and re.search(r'^[a-zA-Z_]', x.strip()[0])] - - table, warnings, real_names = get_autosummary(names, state, - 'nosignatures' in options) - node = table - - env = state.document.settings.env - suffix = env.config.source_suffix - all_docnames = env.found_docs.copy() - dirname = posixpath.dirname(env.docname) - - if 'toctree' in options: - tree_prefix = options['toctree'].strip() - docnames = [] - for name in names: - name = real_names.get(name, name) - - docname = tree_prefix + name - if docname.endswith(suffix): - docname = docname[:-len(suffix)] - docname = posixpath.normpath(posixpath.join(dirname, docname)) - if docname not in env.found_docs: - warnings.append(state.document.reporter.warning( - 'toctree references unknown document %r' % docname, - line=lineno)) - docnames.append(docname) - - tocnode = sphinx.addnodes.toctree() - tocnode['includefiles'] = docnames - tocnode['maxdepth'] = -1 - tocnode['glob'] = None - tocnode['entries'] = [(None, docname) for docname in docnames] - - tocnode = autosummary_toc('', '', tocnode) - return warnings + [node] + [tocnode] - else: - return warnings + [node] - -def get_autosummary(names, state, no_signatures=False): - """ - Generate a proper table node for autosummary:: directive. - - Parameters - ---------- - names : list of str - Names of Python objects to be imported and added to the table. 
- document : document - Docutils document object - - """ - document = state.document - - real_names = {} - warnings = [] - - prefixes = [''] - prefixes.insert(0, document.settings.env.currmodule) - - table = nodes.table('') - group = nodes.tgroup('', cols=2) - table.append(group) - group.append(nodes.colspec('', colwidth=10)) - group.append(nodes.colspec('', colwidth=90)) - body = nodes.tbody('') - group.append(body) - - def append_row(*column_texts): - row = nodes.row('') - for text in column_texts: - node = nodes.paragraph('') - vl = ViewList() - vl.append(text, '') - state.nested_parse(vl, 0, node) - try: - if isinstance(node[0], nodes.paragraph): - node = node[0] - except IndexError: - pass - row.append(nodes.entry('', node)) - body.append(row) - - for name in names: - try: - obj, real_name = import_by_name(name, prefixes=prefixes) - except ImportError: - warnings.append(document.reporter.warning( - 'failed to import %s' % name)) - append_row(":obj:`%s`" % name, "") - continue - - real_names[name] = real_name - - doc = get_doc_object(obj) - - if doc['Summary']: - title = " ".join(doc['Summary']) - else: - title = "" - - col1 = u":obj:`%s <%s>`" % (name, real_name) - if doc['Signature']: - sig = re.sub('^[^(\[]*', '', doc['Signature'].strip()) - if '=' in sig: - # abbreviate optional arguments - sig = re.sub(r', ([a-zA-Z0-9_]+)=', r'[, \1=', sig, count=1) - sig = re.sub(r'\(([a-zA-Z0-9_]+)=', r'([\1=', sig, count=1) - sig = re.sub(r'=[^,)]+,', ',', sig) - sig = re.sub(r'=[^,)]+\)$', '])', sig) - # shorten long strings - sig = re.sub(r'(\[.{16,16}[^,]*?),.*?\]\)', r'\1, ...])', sig) - else: - sig = re.sub(r'(\(.{16,16}[^,]*?),.*?\)', r'\1, ...)', sig) - # make signature contain non-breaking spaces - col1 += u"\\ \u00a0" + unicode(sig).replace(u" ", u"\u00a0") - col2 = title - append_row(col1, col2) - - return table, warnings, real_names - -def import_by_name(name, prefixes=[None]): - """ - Import a Python object that has the given name, under one of the prefixes. - - Parameters - ---------- - name : str - Name of a Python object, eg. 'numpy.ndarray.view' - prefixes : list of (str or None), optional - Prefixes to prepend to the name (None implies no prefix). - The first prefixed name that results to successful import is used. - - Returns - ------- - obj - The imported object - name - Name of the imported object (useful if `prefixes` was used) - - """ - for prefix in prefixes: - try: - if prefix: - prefixed_name = '.'.join([prefix, name]) - else: - prefixed_name = name - return _import_by_name(prefixed_name), prefixed_name - except ImportError: - pass - raise ImportError - -def _import_by_name(name): - """Import a Python object given its full name""" - try: - # try first interpret `name` as MODNAME.OBJ - name_parts = name.split('.') - try: - modname = '.'.join(name_parts[:-1]) - __import__(modname) - return getattr(sys.modules[modname], name_parts[-1]) - except (ImportError, IndexError, AttributeError): - pass - - # ... then as MODNAME, MODNAME.OBJ1, MODNAME.OBJ1.OBJ2, ... 
- last_j = 0 - modname = None - for j in reversed(range(1, len(name_parts)+1)): - last_j = j - modname = '.'.join(name_parts[:j]) - try: - __import__(modname) - except ImportError: - continue - if modname in sys.modules: - break - - if last_j < len(name_parts): - obj = sys.modules[modname] - for obj_name in name_parts[last_j:]: - obj = getattr(obj, obj_name) - return obj - else: - return sys.modules[modname] - except (ValueError, ImportError, AttributeError, KeyError), e: - raise ImportError(e) - -#------------------------------------------------------------------------------ -# :autolink: (smart default role) -#------------------------------------------------------------------------------ - -def autolink_role(typ, rawtext, etext, lineno, inliner, - options={}, content=[]): - """ - Smart linking role. - - Expands to ":obj:`text`" if `text` is an object that can be imported; - otherwise expands to "*text*". - """ - r = sphinx.roles.xfileref_role('obj', rawtext, etext, lineno, inliner, - options, content) - pnode = r[0][0] - - prefixes = [None] - #prefixes.insert(0, inliner.document.settings.env.currmodule) - try: - obj, name = import_by_name(pnode['reftarget'], prefixes) - except ImportError: - content = pnode[0] - r[0][0] = nodes.emphasis(rawtext, content[0].astext(), - classes=content['classes']) - return r diff --git a/doc/sphinxext/autosummary_generate.py b/doc/sphinxext/autosummary_generate.py deleted file mode 100755 index a327067488a7c..0000000000000 --- a/doc/sphinxext/autosummary_generate.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python -r""" -autosummary_generate.py OPTIONS FILES - -Generate automatic RST source files for items referred to in -autosummary:: directives. - -Each generated RST file contains a single auto*:: directive which -extracts the docstring of the referred item. 
- -Example Makefile rule:: - - generate: - ./ext/autosummary_generate.py -o source/generated source/*.rst - -""" -import glob, re, inspect, os, optparse, pydoc -from autosummary import import_by_name - -try: - from phantom_import import import_phantom_module -except ImportError: - import_phantom_module = lambda x: x - -def main(): - p = optparse.OptionParser(__doc__.strip()) - p.add_option("-p", "--phantom", action="store", type="string", - dest="phantom", default=None, - help="Phantom import modules from a file") - p.add_option("-o", "--output-dir", action="store", type="string", - dest="output_dir", default=None, - help=("Write all output files to the given directory (instead " - "of writing them as specified in the autosummary:: " - "directives)")) - options, args = p.parse_args() - - if len(args) == 0: - p.error("wrong number of arguments") - - if options.phantom and os.path.isfile(options.phantom): - import_phantom_module(options.phantom) - - # read - names = {} - for name, loc in get_documented(args).items(): - for (filename, sec_title, keyword, toctree) in loc: - if toctree is not None: - path = os.path.join(os.path.dirname(filename), toctree) - names[name] = os.path.abspath(path) - - # write - for name, path in sorted(names.items()): - if options.output_dir is not None: - path = options.output_dir - - if not os.path.isdir(path): - os.makedirs(path) - - try: - obj, name = import_by_name(name) - except ImportError, e: - print "Failed to import '%s': %s" % (name, e) - continue - - fn = os.path.join(path, '%s.rst' % name) - - if os.path.exists(fn): - # skip - continue - - f = open(fn, 'w') - - try: - f.write('%s\n%s\n\n' % (name, '='*len(name))) - - if inspect.isclass(obj): - if issubclass(obj, Exception): - f.write(format_modulemember(name, 'autoexception')) - else: - f.write(format_modulemember(name, 'autoclass')) - elif inspect.ismodule(obj): - f.write(format_modulemember(name, 'automodule')) - elif inspect.ismethod(obj) or inspect.ismethoddescriptor(obj): - f.write(format_classmember(name, 'automethod')) - elif callable(obj): - f.write(format_modulemember(name, 'autofunction')) - elif hasattr(obj, '__get__'): - f.write(format_classmember(name, 'autoattribute')) - else: - f.write(format_modulemember(name, 'autofunction')) - finally: - f.close() - -def format_modulemember(name, directive): - parts = name.split('.') - mod, name = '.'.join(parts[:-1]), parts[-1] - return ".. currentmodule:: %s\n\n.. %s:: %s\n" % (mod, directive, name) - -def format_classmember(name, directive): - parts = name.split('.') - mod, name = '.'.join(parts[:-2]), '.'.join(parts[-2:]) - return ".. currentmodule:: %s\n\n.. %s:: %s\n" % (mod, directive, name) - -def get_documented(filenames): - """ - Find out what items are documented in source/*.rst - See `get_documented_in_lines`. - - """ - documented = {} - for filename in filenames: - f = open(filename, 'r') - lines = f.read().splitlines() - documented.update(get_documented_in_lines(lines, filename=filename)) - f.close() - return documented - -def get_documented_in_docstring(name, module=None, filename=None): - """ - Find out what items are documented in the given object's docstring. - See `get_documented_in_lines`. 
- - """ - try: - obj, real_name = import_by_name(name) - lines = pydoc.getdoc(obj).splitlines() - return get_documented_in_lines(lines, module=name, filename=filename) - except AttributeError: - pass - except ImportError, e: - print "Failed to import '%s': %s" % (name, e) - return {} - -def get_documented_in_lines(lines, module=None, filename=None): - """ - Find out what items are documented in the given lines - - Returns - ------- - documented : dict of list of (filename, title, keyword, toctree) - Dictionary whose keys are documented names of objects. - The value is a list of locations where the object was documented. - Each location is a tuple of filename, the current section title, - the name of the directive, and the value of the :toctree: argument - (if present) of the directive. - - """ - title_underline_re = re.compile("^[-=*_^#]{3,}\s*$") - autodoc_re = re.compile(".. auto(function|method|attribute|class|exception|module)::\s*([A-Za-z0-9_.]+)\s*$") - autosummary_re = re.compile(r'^\.\.\s+autosummary::\s*') - module_re = re.compile(r'^\.\.\s+(current)?module::\s*([a-zA-Z0-9_.]+)\s*$') - autosummary_item_re = re.compile(r'^\s+([_a-zA-Z][a-zA-Z0-9_.]*)\s*.*?') - toctree_arg_re = re.compile(r'^\s+:toctree:\s*(.*?)\s*$') - - documented = {} - - current_title = [] - last_line = None - toctree = None - current_module = module - in_autosummary = False - - for line in lines: - try: - if in_autosummary: - m = toctree_arg_re.match(line) - if m: - toctree = m.group(1) - continue - - if line.strip().startswith(':'): - continue # skip options - - m = autosummary_item_re.match(line) - if m: - name = m.group(1).strip() - if current_module and not name.startswith(current_module + '.'): - name = "%s.%s" % (current_module, name) - documented.setdefault(name, []).append( - (filename, current_title, 'autosummary', toctree)) - continue - if line.strip() == '': - continue - in_autosummary = False - - m = autosummary_re.match(line) - if m: - in_autosummary = True - continue - - m = autodoc_re.search(line) - if m: - name = m.group(2).strip() - if m.group(1) == "module": - current_module = name - documented.update(get_documented_in_docstring( - name, filename=filename)) - elif current_module and not name.startswith(current_module+'.'): - name = "%s.%s" % (current_module, name) - documented.setdefault(name, []).append( - (filename, current_title, "auto" + m.group(1), None)) - continue - - m = title_underline_re.match(line) - if m and last_line: - current_title = last_line.strip() - continue - - m = module_re.match(line) - if m: - current_module = m.group(2) - continue - finally: - last_line = line - - return documented - -if __name__ == "__main__": - main() From 35e278470356135a2050c8888e958ebc191aacc6 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Wed, 12 Oct 2011 22:26:24 +0100 Subject: [PATCH 020/161] Tweak wording in io documentation. --- doc/source/io.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6d52a7e1325a9..f0b58af32e026 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -24,10 +24,10 @@ CSV & Text files The two workhorse functions for reading text files (a.k.a. flat files) are :func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table`. -They both utilize the same parsing code for intelligently converting tabular -data into a DataFrame object. They take a number of different arguments: +They both use the same parsing code to intelligently convert tabular +data into a DataFrame object. 
They can take a number of arguments: - - ``path_or_buffer``: Either a string path to a file or any object with a + - ``path_or_buffer``: Either a string path to a file, or any object with a ``read`` method (such as an open file or ``StringIO``). - ``delimiter``: For ``read_table`` only, a regular expression to split fields on. ``read_csv`` uses the ``csv`` module to do this and hence only From b5435f3a5372d73348a321cc8c0d7eff8ef58116 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Wed, 12 Oct 2011 22:38:17 +0100 Subject: [PATCH 021/161] Minor updates to docs on missing values. --- doc/source/missing_data.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index c4df50b169f2b..272ad9ec64397 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -64,13 +64,15 @@ not ``NaN``, I think you will find this is a worthwhile trade-off (Zen of Python: "practicality beats purity"). To make detecting missing values easier (and across different array dtypes), -pandas provides the ``isnull`` and ``notnull`` functions: +pandas provides the :func:`~pandas.core.common.isnull` and +:func:`~pandas.core.common.notnull` functions, which are also methods on +``Series`` objects: .. ipython:: python df2['one'] isnull(df2['one']) - notnull(df2['four']) + df2['four'].notnull() **Summary:** ``NaN``, ``inf``, ``-inf``, and ``None`` (in object arrays) are all considered missing by the ``isnull`` and ``notnull`` functions. @@ -125,8 +127,6 @@ Cleaning / filling missing data pandas objects are equipped with various data manipulation methods for dealing with missing data. -dropna: - .. _missing_data.fillna: Filling missing values: fillna @@ -165,6 +165,8 @@ To remind you, these are the available filling methods: With time series data, using pad/ffill is extremely common so that the "last known value" is available at every time point. +.. _missing_data.dropna: + Dropping axis labels with missing data: dropna ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 75310383650e04241172f735ba11e00455fae51a Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 10 Oct 2011 18:37:05 +0100 Subject: [PATCH 022/161] Add simple attribute access to DataFrame columns. --- pandas/core/frame.py | 8 ++++++++ pandas/tests/test_frame.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index abf7b2f3ea321..b92d6c445b312 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -861,6 +861,14 @@ def _getitem_single(self, key): res = Series(values, index=self.index, name=key) self._series_cache[key] = res return res + + def __getattr__(self, name): + """After regular attribute access, try looking up the name of a column. + This allows simpler access to columns for interactive use.""" + if name in self.columns: + return self[name] + raise AttributeError("'%s' object has no attribute '%s'" % \ + (type(self).__name__, name)) def __setitem__(self, key, value): # support boolean setting with DataFrame input, e.g. 
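[Editor's note: a minimal sketch, not part of the patch, of the behavior the
frame.py hunk above enables. The frame and column names here are invented for
illustration; only the attribute fallback itself comes from the diff::

    import numpy as np
    from pandas import DataFrame

    # toy frame; 'A' and 'B' are ordinary columns
    df = DataFrame({'A': [1., 2., 3.], 'B': [4., 5., 6.]})

    # __getattr__ falls back to column lookup, so df.A is df['A']
    assert (df.A == df['A']).all()

    # names that are not columns still raise AttributeError, as before
    try:
        df.NONEXISTENT_NAME
    except AttributeError:
        pass

Attribute access is read-only sugar here; assigning df.A = ... is not wired
up by this patch, so item-style df['A'] = ... remains the way to set a
column.]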
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 79fa30e22885c..67d21b180edf2 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -67,6 +67,10 @@ def test_getitem_boolean(self): subframe_obj = self.tsframe[indexer_obj] assert_frame_equal(subframe_obj, subframe) + + def test_getattr(self): + tm.assert_series_equal(self.frame.A, self.frame['A']) + self.assertRaises(AttributeError, getattr, self.frame, 'NONEXISTENT_NAME') def test_setitem(self): # not sure what else to do here From 4d60df8b6064c61393ca1b96800e8b159262703d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Oct 2011 00:16:48 -0400 Subject: [PATCH 023/161] ENH: speed up DataFrame constructor with nested dict, GH #212 --- RELEASE.rst | 5 +++ pandas/core/frame.py | 43 +++++++++++++++++------- pandas/core/index.py | 7 +--- pandas/src/groupby.pyx | 47 --------------------------- pandas/src/tseries.pyx | 74 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 65 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 052400ea5db6d..71d11801a559c 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -25,9 +25,13 @@ pandas 0.5.0 - Added ability to join on multiple columns in `DataFrame.join` (GH #214) - Added private `_get_duplicates` function to `Index` for identifying duplicate values more easily + - Added column attribute access to DataFrame, e.g. df.A equivalent to df['A'] + if 'A' is a column in the DataFrame (PR #213) **Improvements to existing features** + - Added Cython function for converting tuples to ndarray very fast. Speeds up + many MultiIndex-related operations - File parsing functions like `read_csv` and `read_table` will explicitly check if a parsed index has duplicates and raise a more helpful exception rather than deferring the check until later @@ -38,6 +42,7 @@ pandas 0.5.0 5x, regression from 0.3.0 - With new `DataFrame.align` method, speeding up binary operations between differently-indexed DataFrame objects by 10-25%. + - Significantly sped up conversion of nested dict into DataFrame **Bug fixes** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b92d6c445b312..0efdcd838bf63 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -33,7 +33,7 @@ from pandas.util import py3compat import pandas.core.common as common import pandas.core.datetools as datetools -import pandas._tseries as _tseries +import pandas._tseries as lib #---------------------------------------------------------------------- # Factory helper methods @@ -861,7 +861,7 @@ def _getitem_single(self, key): res = Series(values, index=self.index, name=key) self._series_cache[key] = res return res - + def __getattr__(self, name): """After regular attribute access, try looking up the name of a column. 
This allows simpler access to columns for interactive use.""" @@ -3125,7 +3125,7 @@ def _get_index(v): if isinstance(v, Series): return v.index elif isinstance(v, dict): - return Index(_try_sort(v)) + return v.keys() index = None if len(data) == 0: @@ -3155,26 +3155,45 @@ def _get_index(v): def _union_indexes(indexes): if len(indexes) == 1: - index = indexes[0] - if _any_special_indexes(indexes): + result = indexes[0] + if isinstance(result, list): + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + if kind == 'special': result = indexes[0] for other in indexes[1:]: result = result.union(other) return result - else: + elif kind == 'array': index = indexes[0] for other in indexes[1:]: if not index.equals(other): - return Index(_tseries.fast_unique_multiple(indexes)) + return Index(lib.fast_unique_multiple(indexes)) return index + else: + return Index(lib.fast_unique_multiple_list(indexes)) + + +def _sanitize_and_check(indexes): + kinds = list(set([type(index) for index in indexes])) + + if list in kinds: + if len(kinds) > 1: + indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x + for x in indexes] + kinds.remove(list) + else: + return indexes, 'list' -def _any_special_indexes(indexes): - for index in indexes: - if type(index) != Index: - return True - return False + if len(kinds) > 1 or Index not in kinds: + return indexes, 'special' + else: + return indexes, 'array' def _check_data_types(data): diff --git a/pandas/core/index.py b/pandas/core/index.py index 61bd1ada54b59..7582e06ca1c98 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -39,6 +39,7 @@ class Index(np.ndarray): ---- An Index instance can **only** contain hashable objects """ + name = None def __new__(cls, data, dtype=None, copy=False, name=None): if isinstance(data, np.ndarray): if dtype is None and issubclass(data.dtype.type, np.integer): @@ -51,12 +52,6 @@ def __new__(cls, data, dtype=None, copy=False, name=None): # other iterable of some kind subarr = _asarray_tuplesafe(data, dtype=object) - # if not isinstance(data, (list, tuple)): - # data = list(data) - - # subarr = np.empty(len(data), dtype=object) - # subarr[:] = data - subarr = subarr.view(cls) subarr.name = name return subarr diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index cffac35edc48c..7f56e11c37fc0 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -213,27 +213,6 @@ def group_labels2(ndarray[object] values): return reverse, labels -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique(ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < n: - val = values[i] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - @cython.wraparound(False) @cython.boundscheck(False) def get_unique_labels(ndarray[object] values, dict idMap): @@ -248,32 +227,6 @@ def get_unique_labels(ndarray[object] values, dict idMap): return fillVec -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple(list arrays): - cdef: - ndarray[object] buf - Py_ssize_t k = len(arrays) - Py_ssize_t i, j, n - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < k: - buf = arrays[i] - n = len(buf) - for j from 0 <= j < n: - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - 
return uniques - # from libcpp.set cimport set as stlset # cdef fast_unique_int32(ndarray arr): diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index 50a8a8cf1e752..ceccc3c1536a9 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -274,6 +274,80 @@ def list_to_object_array(list obj): return arr + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple(list arrays): + cdef: + ndarray[object] buf + Py_ssize_t k = len(arrays) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = arrays[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list(list lists): + cdef: + list buf + Py_ssize_t k = len(lists) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = lists[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + include "skiplist.pyx" include "groupby.pyx" include "moments.pyx" From fc78e34312afd3007242366f1ba7d15509b79d8d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 00:08:20 -0400 Subject: [PATCH 024/161] DOC: minor release note update --- RELEASE.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 71d11801a559c..af896d4b54483 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -15,7 +15,8 @@ pandas 0.5.0 - `read_table`, `read_csv`, and `ExcelFile.parse` default arguments for `index_col` is now None. To use one or more of the columns as the resulting DataFrame's index, these must be explicitly specified now - - Parsing functions no longer parse dates by default (GH #225) + - Parsing functions like `read_csv` no longer parse dates by default (GH + #225) **New features / modules** From 9fe43b7d11ad4b7fd88e39f3292c18f53e0327d2 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 13 Oct 2011 16:56:48 +0100 Subject: [PATCH 025/161] Add columns to IPython tab completions of DataFrame attributes. 
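[Editor's note: for readers unfamiliar with the hook used in the diff below,
a hedged sketch of IPython's type-based completion mechanism. It assumes
IPython >= 0.11, where `IPython.utils.generics.complete_object` exists;
`MyType` and `names` are invented for illustration::

    from IPython.utils.generics import complete_object

    class MyType(object):
        def __init__(self, names):
            self.names = names  # dynamic attribute names to expose

    # register a completer that runs whenever the object being completed
    # is an instance of MyType; IPython dispatches on the object's type
    @complete_object.when_type(MyType)
    def complete_mytype(obj, prev_completions):
        # extend the regular attribute completions with the dynamic names
        return prev_completions + list(obj.names)

The patch applies the same pattern to DataFrame, filtering the columns
through py3compat.isidentifier so that only attribute-safe names are
offered.]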
--- pandas/core/frame.py | 19 +++++++++++++++++-- pandas/util/py3compat.py | 11 +++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0efdcd838bf63..934dfb04b8ec1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,6 +15,7 @@ from StringIO import StringIO import csv import operator +import sys import warnings from numpy import nan @@ -550,7 +551,6 @@ def to_string(self, buf=None, columns=None, colSpace=None, nanRep='NaN', formatters=None, float_format=None, sparsify=True): from pandas.core.common import _format, adjoin - import sys if buf is None: # pragma: no cover buf = sys.stdout @@ -629,7 +629,6 @@ def info(self, verbose=True, buf=None): If False, don't print column count summary buf : writable buffer, defaults to sys.stdout """ - import sys if buf is None: # pragma: no cover buf = sys.stdout @@ -3304,6 +3303,22 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) +def install_ipython_completers(): + """Register the DataFrame type with IPython's tab completion machinery, so + that it knows about accessing column names as attributes.""" + from IPython.utils.generics import complete_object + + @complete_object.when_type(DataFrame) + def complete_dataframe(obj, prev_completions): + return prev_completions + [c for c in obj.columns \ + if py3compat.isidentifier(c)] + +# Importing IPython brings in about 200 modules, so we want to avoid it unless +# we're in IPython (when those modules are loaded anyway). +if "IPython" in sys.modules: + install_ipython_completers() + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/util/py3compat.py b/pandas/util/py3compat.py index e8bb212e215f2..afb48ef41cc95 100644 --- a/pandas/util/py3compat.py +++ b/pandas/util/py3compat.py @@ -1,3 +1,14 @@ import sys PY3 = (sys.version_info[0] >= 3) + +if PY3: + def isidentifier(s): + return s.isidentifier() + +else: + # Python 2 + import re + _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + def isidentifier(s, dotted=False): + return bool(_name_re.match(s)) From 600c36a1a2ced2bbe12b17982b995e6365ce4ca1 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 13 Oct 2011 21:22:44 +0100 Subject: [PATCH 026/161] Catch and silence errors in installing IPython completers. Avoid problems with older versions of IPython (pre 0.11) Closes gh-230 --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 934dfb04b8ec1..90ae7feb15d83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3316,7 +3316,10 @@ def complete_dataframe(obj, prev_completions): # Importing IPython brings in about 200 modules, so we want to avoid it unless # we're in IPython (when those modules are loaded anyway). if "IPython" in sys.modules: - install_ipython_completers() + try: + install_ipython_completers() + except Exception: + pass if __name__ == '__main__': From 66d5ea18efcd903cf67030e91c29580048a07994 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 00:10:58 -0400 Subject: [PATCH 027/161] DOC: add release note --- RELEASE.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index af896d4b54483..d043a9022f5f3 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -28,6 +28,7 @@ pandas 0.5.0 duplicate values more easily - Added column attribute access to DataFrame, e.g. 
df.A equivalent to df['A'] if 'A' is a column in the DataFrame (PR #213) + - Added IPython tab completion hook for DataFrame columns. (PR #233, GH #230) **Improvements to existing features** @@ -50,6 +51,10 @@ pandas 0.5.0 - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should be reported upstream to matplotlib (GH #224) +Thanks +------ + + - Thomas Kluyver pandas 0.4.3 ============ From 7ecd163a99c351da5bead1ebae2bf850f5b58423 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 00:31:39 -0400 Subject: [PATCH 028/161] ENH: remove panel weights option that was not doing anything principled --- pandas/stats/fama_macbeth.py | 8 +++---- pandas/stats/interface.py | 3 --- pandas/stats/plm.py | 40 ++++--------------------------- pandas/stats/tests/test_ols.py | 44 ---------------------------------- 4 files changed, 9 insertions(+), 86 deletions(-) diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index 1e1839e24f92c..e632b9b001b07 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -25,7 +25,7 @@ def fama_macbeth(**kwargs): return klass(**kwargs) class FamaMacBeth(object): - def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, + def __init__(self, y, x, intercept=True, nw_lags=None, nw_lags_beta=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies={}, verbose=False): @@ -33,7 +33,7 @@ def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, from pandas.stats.plm import MovingPanelOLS self._ols_result = MovingPanelOLS( - y=y, x=x, weights=weights, window_type='rolling', window=1, + y=y, x=x, window_type='rolling', window=1, intercept=intercept, nw_lags=nw_lags, entity_effects=entity_effects, time_effects=time_effects, x_effects=x_effects, cluster=cluster, @@ -141,7 +141,7 @@ def summary(self): return template % params class MovingFamaMacBeth(FamaMacBeth): - def __init__(self, y, x, weights=None, window_type='rolling', window=10, + def __init__(self, y, x, window_type='rolling', window=10, intercept=True, nw_lags=None, nw_lags_beta=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies={}, verbose=False): @@ -149,7 +149,7 @@ def __init__(self, y, x, weights=None, window_type='rolling', window=10, self._window = window FamaMacBeth.__init__( - self, y=y, x=x, weights=weights, intercept=intercept, + self, y=y, x=x, intercept=intercept, nw_lags=nw_lags, nw_lags_beta=nw_lags_beta, entity_effects=entity_effects, time_effects=time_effects, x_effects=x_effects, cluster=cluster, diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py index b3cc9eb41ed54..89da36c98eea7 100644 --- a/pandas/stats/interface.py +++ b/pandas/stats/interface.py @@ -39,9 +39,6 @@ def ols(**kwargs): Panel OLS options: pool: bool Whether to run pooled panel regression. Defaults to true. - weights: DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. entity_effects: bool Whether to account for entity fixed effects. Defaults to false. time_effects: bool diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index 43d6322f807d2..f477d8aeb95df 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -36,9 +36,6 @@ class PanelOLS(OLS): FULL_SAMPLE, ROLLING, EXPANDING. FULL_SAMPLE by default. window : int size of window (for rolling/expanding OLS) - weights : DataFrame - Weight for each observation. 
The weights are not normalized; - they're multiplied directly by each observation. pool : bool, default True Whether to run pooled panel regression entity_effects : bool, deafult False @@ -65,14 +62,12 @@ class PanelOLS(OLS): 2. There is autocorrelation - use 'entity' """ - def __init__(self, y, x, weights=None, - intercept=True, nw_lags=None, entity_effects=False, + def __init__(self, y, x, intercept=True, nw_lags=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies=None, verbose=False, nw_overlap=False): self._x_orig = x self._y_orig = y - self._weights = weights self._intercept = intercept self._nw_lags = nw_lags self._nw_overlap = nw_overlap @@ -110,8 +105,7 @@ def _prepare_data(self): The categorical variables will get dropped from x. """ - (x, x_filtered, y, weights, - weights_filt, cat_mapping) = self._filter_data() + (x, x_filtered, y, cat_mapping) = self._filter_data() self.log('Adding dummies to X variables') x = self._add_dummies(x, cat_mapping) @@ -141,12 +135,6 @@ def _prepare_data(self): x_regressor = x y_regressor = y - if weights is not None: - assert(y_regressor.index is weights.index) - assert(x_regressor.index is weights.index) - y_regressor = y_regressor * weights - x_regressor = x_regressor.mul(weights, axis=0) - return x, x_regressor, x_filtered, y, y_regressor def _filter_data(self): @@ -170,9 +158,6 @@ def _filter_data(self): x_names = data.items - if self._weights is not None: - data['__weights__'] = self._weights - # Filter x's without y (so we can make a prediction) filtered = data.to_long() @@ -187,21 +172,10 @@ def _filter_data(self): data_long = data.to_long() x_filt = filtered.filter(x_names) - - if self._weights: - weights_filt = filtered['__weights__'] - else: - weights_filt = None - x = data_long.filter(x_names) y = data_long['__y__'] - if self._weights: - weights = data_long['__weights__'] - else: - weights = None - - return x, x_filt, y, weights, weights_filt, cat_mapping + return x, x_filt, y, cat_mapping def _convert_x(self, x): # Converts non-numeric data in x to floats. x_converted is the @@ -527,9 +501,6 @@ class MovingPanelOLS(MovingOLS, PanelOLS): Minimum number of total observations to require. Default is rank(X matrix) + 1. In some cases we might want to be able to relax this number. - weights : DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. pool : bool Whether to run pooled panel regression. Defaults to true. entity_effects : bool @@ -554,7 +525,7 @@ class MovingPanelOLS(MovingOLS, PanelOLS): 1. Countries are correlated - use 'time' 2. 
There is autocorrelation - use 'entity' """ - def __init__(self, y, x, weights=None, + def __init__(self, y, x, window_type='expanding', window=None, min_periods=None, min_obs=None, @@ -567,8 +538,7 @@ def __init__(self, y, x, weights=None, dropped_dummies=None, verbose=False): - self._args = dict(weights=weights, - intercept=intercept, + self._args = dict(intercept=intercept, nw_lags=nw_lags, nw_overlap=nw_overlap, entity_effects=entity_effects, diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index dd53183e38b9a..79286c66f9283 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -381,41 +381,6 @@ def testFiltering(self): self.assertTrue(result._x_filtered.major_axis.equals( result.y_fitted.index)) - def testWithWeights(self): - data = np.arange(10).reshape((5, 2)) - index = [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5)] - cols = ['A', 'B'] - weights = DataFrame(data, index=index, columns=cols) - - result = ols(y=self.panel_y2, x=self.panel_x2, weights=weights) - - assert_almost_equal(result._y_trans.values.flat, [0, 16, 25]) - - exp_x = [[0, 0, 0], - [36, 68, 4], - [150, 240, 5]] - assert_almost_equal(result._x_trans.values, exp_x) - - - exp_x_filtered = [[6, 14, 1], - [9, 17, 1], - [30, 48, 1], - [11, 20, 1], - [12, 21, 1]] -# exp_x_filtered = [[0, 0, 0], -# [36, 68, 4], -# [150, 240, 5], -# [66, 120, 6], -# [84, 147, 7]] - - assert_almost_equal(result._x_filtered.values, exp_x_filtered) - - # _check_non_raw_results(result) - def testWithTimeEffects(self): result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) @@ -513,15 +478,6 @@ def testForSeries(self): self.series_x, self.series_y, nw_lags=1, nw_overlap=True) - def testRollingWithWeights(self): - idx = self.panel_y.index - cols = self.panel_y.columns - - - weights = DataFrame(np.random.standard_normal((len(idx), len(cols))), - index=idx, columns=cols) - self.checkMovingOLS(self.panel_x, - self.panel_y, weights=weights) def testRolling(self): self.checkMovingOLS(self.panel_x, self.panel_y) From 52723fae6481c807a0e9720d4b1cd56730cd30f3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 00:32:59 -0400 Subject: [PATCH 029/161] DOC: release notes --- RELEASE.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index d043a9022f5f3..757d81e3d6f88 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -17,6 +17,8 @@ pandas 0.5.0 DataFrame's index, these must be explicitly specified now - Parsing functions like `read_csv` no longer parse dates by default (GH #225) + - Removed `weights` option in panel regression which was not doing anything + principled **New features / modules** From bc4fb0d2b4c2cf892392a809663e3b384099948b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 09:40:45 -0400 Subject: [PATCH 030/161] ENH: groupby level name instead of number, GH #223 --- pandas/core/groupby.py | 4 ++++ pandas/tests/test_groupby.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 02b572f97302c..fc6930a18a46c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -503,6 +503,10 @@ def __init__(self, index, grouper=None, name=None, level=None): self.index = index if level is not None: + if not isinstance(level, int): + assert(level in index.names) + level = index.names.index(level) + inds = index.labels[level] labels = index.levels[level].take(inds) 
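[Editor's note: a small illustration, with invented toy data, of what the
groupby.py hunk above permits -- passing a level *name* where an integer
level number was previously required. Constructor details match pandas of
this vintage (MultiIndex takes `labels`)::

    import numpy as np
    from pandas import DataFrame, MultiIndex

    index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two']],
                       labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                       names=['first', 'second'])
    df = DataFrame(np.arange(8.).reshape((4, 2)), index=index,
                   columns=['A', 'B'])

    # grouping by name resolves through index.names.index(level),
    # exactly as the hunk above does, so the two spellings agree
    by_name = df.groupby(level='second').sum()
    by_number = df.groupby(level=1).sum()
    assert (by_name == by_number).all().all()]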
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8caae56dd335c..904bcb6efdc6a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -56,7 +56,8 @@ def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]) + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) @@ -655,7 +656,7 @@ def _testit(op): def test_grouping_attrs(self): deleveled = self.mframe.delevel() - grouped = deleveled.groupby(['level_0', 'level_1']) + grouped = deleveled.groupby(['first', 'second']) for i, ping in enumerate(grouped.groupings): the_counts = self.mframe.groupby(level=i).count()['A'] @@ -668,12 +669,20 @@ def test_groupby_level(self): result0 = frame.groupby(level=0).sum() result1 = frame.groupby(level=1).sum() - expected0 = frame.groupby(deleveled['level_0']).sum() - expected1 = frame.groupby(deleveled['level_1']).sum() + expected0 = frame.groupby(deleveled['first']).sum() + expected1 = frame.groupby(deleveled['second']).sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) + # groupby level name + result0 = frame.groupby(level='first').sum() + result1 = frame.groupby(level='second').sum() + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + # axis=1 + result0 = frame.T.groupby(level=0, axis=1).sum() result1 = frame.T.groupby(level=1, axis=1).sum() assert_frame_equal(result0, expected0.T) @@ -693,8 +702,8 @@ def test_groupby_level_mapper(self): result0 = frame.groupby(mapper0, level=0).sum() result1 = frame.groupby(mapper1, level=1).sum() - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['level_0']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['level_1']]) + mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) expected0 = frame.groupby(mapped_level0).sum() expected1 = frame.groupby(mapped_level1).sum() From 0113ed44f0b328e4f650ef15796d23c8bed36d31 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 16:55:32 -0400 Subject: [PATCH 031/161] BUG: test failure in groupby transform --- RELEASE.rst | 2 ++ pandas/tests/test_groupby.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 757d81e3d6f88..d111a73572afc 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -47,6 +47,8 @@ pandas 0.5.0 - With new `DataFrame.align` method, speeding up binary operations between differently-indexed DataFrame objects by 10-25%. 
- Significantly sped up conversion of nested dict into DataFrame + - Can pass hierarchical index level name to `groupby` instead of the level + number if desired (GH #223) **Bug fixes** diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 904bcb6efdc6a..c863e9edf1a4e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -872,6 +872,24 @@ def test_grouping_ndarray(self): expected = self.df.groupby('A').sum() assert_frame_equal(result, expected) + def test_apply_example(self): + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + class TestPanelGroupBy(unittest.TestCase): def setUp(self): From 82d73a9d672b0da098bb665c2d859bcf6b775f92 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 17:14:57 -0400 Subject: [PATCH 032/161] BUG: fix GroupBy.apply bug, GH #237 --- pandas/core/groupby.py | 39 +++++++++++++++++++++++++++--------- pandas/tests/test_groupby.py | 20 +++++++++++++++++- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fc6930a18a46c..eb32db4c37701 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1069,22 +1069,35 @@ def transform(self, func, *args, **kwargs): axis=self.axis) def _concat_frames(frames, index, columns=None, axis=0): - if axis == 0: - all_index = [np.asarray(x.index) for x in frames] - new_index = Index(np.concatenate(all_index)) + if len(frames) == 1: + return frames[0] + if axis == 0: + new_index = _concat_indexes([x.index for x in frames]) if columns is None: new_columns = frames[0].columns else: new_columns = columns else: - all_columns = [np.asarray(x.columns) for x in frames] - new_columns = Index(np.concatenate(all_columns)) + new_columns = _concat_indexes([x.columns for x in frames]) new_index = index - new_values = np.concatenate([x.values for x in frames], axis=axis) - result = DataFrame(new_values, index=new_index, columns=new_columns) - return result.reindex(index=index, columns=columns) + if frames[0]._is_mixed_type: + new_data = {} + for col in new_columns: + new_data[col] = np.concatenate([x[col].values for x in frames]) + return DataFrame(new_data, index=new_index, columns=new_columns) + else: + new_values = np.concatenate([x.values for x in frames], axis=axis) + result = DataFrame(new_values, index=new_index, columns=new_columns) + return result.reindex(index=index, columns=columns) + +def _concat_indexes(indexes): + if len(indexes) == 1: + new_index = indexes[0] + else: + new_index = indexes[0].append(indexes[1:]) + return new_index def _concat_frames_hierarchical(frames, keys, groupings, axis=0): if axis == 0: @@ -1096,8 +1109,14 @@ def _concat_frames_hierarchical(frames, keys, groupings, axis=0): new_columns = _make_concat_multiindex(all_columns, keys, groupings) new_index = frames[0].index - new_values = np.concatenate([x.values for x in frames], axis=axis) - return DataFrame(new_values, index=new_index, columns=new_columns) + if frames[0]._is_mixed_type: + new_data = {} + for col in new_columns: + new_data[col] = np.concatenate([x[col].values for x in frames]) + return DataFrame(new_data, index=new_index, columns=new_columns) + else: + new_values = np.concatenate([x.values for x in frames], 
axis=axis) + return DataFrame(new_values, index=new_index, columns=new_columns) def _make_concat_multiindex(indexes, keys, groupings): if not _all_indexes_same(indexes): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c863e9edf1a4e..d7c8b679a2105 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -872,7 +872,7 @@ def test_grouping_ndarray(self): expected = self.df.groupby('A').sum() assert_frame_equal(result, expected) - def test_apply_example(self): + def test_apply_typecast_fail(self): df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], 'c' : np.tile(['a','b','c'], 2), 'v' : np.arange(1., 7.)}) @@ -889,6 +889,24 @@ def f(group): assert_frame_equal(result, expected) + def test_apply_multiindex_fail(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) class TestPanelGroupBy(unittest.TestCase): From 36345fd2797cdce3d2dcf8174d62117597715c0e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 17:47:33 -0400 Subject: [PATCH 033/161] API: Series/DataFrame.to_string return string by default. Series.to_string buffer keyword namechange. addresses GH #232 --- RELEASE.rst | 3 +++ pandas/core/frame.py | 9 +++++++-- pandas/core/series.py | 8 ++++++-- pandas/tests/test_frame.py | 28 +++++++++++++++++----------- pandas/tests/test_series.py | 8 +++++++- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index d111a73572afc..33a6e46c77430 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -19,6 +19,9 @@ pandas 0.5.0 #225) - Removed `weights` option in panel regression which was not doing anything principled + - Changed `buffer` argument name in `Series.to_string` to `buf` + - `Series.to_string` and `DataFrame.to_string` now return strings by default + instead of printing to sys.stdout **New features / modules** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 90ae7feb15d83..02481123e7f3d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -552,8 +552,10 @@ def to_string(self, buf=None, columns=None, colSpace=None, sparsify=True): from pandas.core.common import _format, adjoin + return_ = False if buf is None: # pragma: no cover - buf = sys.stdout + buf = StringIO() + return_ = True if colSpace is None: def _myformat(v): @@ -596,6 +598,9 @@ def _format_col(col): for s in to_write: print >> buf, s + if return_: + return buf.getvalue() + def _get_formatted_labels(self, sparsify=True): from pandas.core.index import _sparsify @@ -3307,7 +3312,7 @@ def install_ipython_completers(): """Register the DataFrame type with IPython's tab completion machinery, so that it knows about accessing column names as attributes.""" from IPython.utils.generics import complete_object - + @complete_object.when_type(DataFrame) def complete_dataframe(obj, prev_completions): return prev_completions + [c for c in obj.columns \ diff --git a/pandas/core/series.py b/pandas/core/series.py index 15e846d7d3466..d6db7bbdb6c7b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -378,8 +378,12 @@ def _tidy_repr(self, max_vals=20): result = '%s\nName: %s, Length: %d' % (result, self.name, len(self)) return result 
- def to_string(self, buffer=sys.stdout, nanRep='NaN'): - print >> buffer, self._get_repr(nanRep=nanRep) + def to_string(self, buf=None, nanRep='NaN'): + the_repr = self._get_repr(nanRep=nanRep) + if buf is None: + return the_repr + else: + print >> buf, the_repr def _get_repr(self, name=False, nanRep='NaN'): vals = self.values diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 67d21b180edf2..e509c455a0d41 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -67,10 +67,11 @@ def test_getitem_boolean(self): subframe_obj = self.tsframe[indexer_obj] assert_frame_equal(subframe_obj, subframe) - + def test_getattr(self): tm.assert_series_equal(self.frame.A, self.frame['A']) - self.assertRaises(AttributeError, getattr, self.frame, 'NONEXISTENT_NAME') + self.assertRaises(AttributeError, getattr, self.frame, + 'NONEXISTENT_NAME') def test_setitem(self): # not sure what else to do here @@ -1268,20 +1269,25 @@ def test_to_string(self): biggie['A'][:20] = nan biggie['B'][:20] = nan + s = biggie.to_string() + buf = StringIO() - biggie.to_string(buf=buf) + retval = biggie.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue(), s) + + self.assert_(isinstance(s, basestring)) - biggie.to_string(buf=buf, columns=['B', 'A'], colSpace=17) - biggie.to_string(buf=buf, columns=['B', 'A'], - formatters={'A' : lambda x: '%.1f' % x}) + biggie.to_string(columns=['B', 'A'], colSpace=17) + biggie.to_string(columns=['B', 'A'], + formatters={'A' : lambda x: '%.1f' % x}) - biggie.to_string(buf=buf, columns=['B', 'A'], - float_format=str) - biggie.to_string(buf=buf, columns=['B', 'A'], colSpace=12, - float_format=str) + biggie.to_string(columns=['B', 'A'], float_format=str) + biggie.to_string(columns=['B', 'A'], colSpace=12, + float_format=str) frame = DataFrame(index=np.arange(1000)) - frame.to_string(buf=buf) + frame.to_string() def test_insert(self): df = DataFrame(np.random.randn(5, 3), index=np.arange(5), diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index ac24817017f2d..817721dbcfeb8 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -426,7 +426,13 @@ def test_repr(self): def test_to_string(self): from cStringIO import StringIO - self.ts.to_string(buffer=StringIO()) + buf = StringIO() + + s = self.ts.to_string() + + retval = self.ts.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue().strip(), s) def test_iter(self): for i, val in enumerate(self.series): From b72cfb5220589f61cca67ecfcfaef8d042949273 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 18:12:27 -0400 Subject: [PATCH 034/161] API: removed functions and methods deprecated in 0.4 Series, address GH #229 --- RELEASE.rst | 46 +++++++++++++++ pandas/core/frame.py | 128 +----------------------------------------- pandas/core/index.py | 7 +-- pandas/core/panel.py | 9 --- pandas/core/series.py | 28 +-------- pandas/io/parsers.py | 40 ------------- 6 files changed, 49 insertions(+), 209 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 33a6e46c77430..e12bdab36f0d8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,6 +22,52 @@ pandas 0.5.0 - Changed `buffer` argument name in `Series.to_string` to `buf` - `Series.to_string` and `DataFrame.to_string` now return strings by default instead of printing to sys.stdout + - Series functions renamed (and thus deprecated) in 0.4 series have been + removed: + + * `asOf`, use `asof` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * 
`toCSV`, use `to_csv` + * `merge`, use `map` + * `applymap`, use `apply` + * `combineFirst`, use `combine_first` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + + - DataFrame functions renamed / deprecated in 0.4 series have been removed: + + * `asMatrix` method, use `as_matrix` or `values` attribute + * `combineFirst`, use `combine_first` + * `getXS`, use `xs` + * `merge`, use `join` + * `fromRecords`, use `from_records` + * `fromcsv`, use `from_csv` + * `toRecords`, use `to_records` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * `toCSV`, use `to_csv` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + * `toDataMatrix` is no longer needed + * `rows()` method, use `index` attribute + * `cols()` method, use `columns` attribute + * `dropEmptyRows()`, use `dropna(how='all')` + * `dropIncompleteRows()`, use `dropna()` + * `tapply(f)`, use `apply(f, axis=1)` + * `tgroupby(keyfunc, aggfunc)`, use `groupby` with `axis=1` + + - Other outstanding deprecations have been removed: + + * `indexField` argument in `DataFrame.from_records` + * `missingAtEnd` argument in `Series.order`. Use `na_last` instead + * `Series.fromValue` classmethod, use regular `Series` constructor instead + * Functions `parseCSV`, `parseText`, and `parseExcel` methods in + `pandas.io.parsers` have been removed + * `Index.asOfDate` function + * `Panel.getMinorXS` (use `minor_xs`) and `Panel.getMajorXS` (use + `major_xs`) + * `Panel.toWide`, use `Panel.to_wide` instead **New features / modules** diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02481123e7f3d..4c17016fed808 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,7 +16,6 @@ import csv import operator import sys -import warnings from numpy import nan import numpy as np @@ -30,7 +29,6 @@ from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _is_bool_indexer -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas.core.datetools as datetools @@ -355,8 +353,7 @@ def to_dict(self): return dict((k, v.to_dict()) for k, v in self.iteritems()) @classmethod - def from_records(cls, data, index=None, indexField=None, - exclude=None): + def from_records(cls, data, index=None, exclude=None): """ Convert structured or record ndarray to DataFrame @@ -371,11 +368,6 @@ def from_records(cls, data, index=None, indexField=None, ------- df : DataFrame """ - if indexField is not None: # pragma: no cover - warnings.warn("indexField argument is deprecated. Use index " - "instead", FutureWarning) - index = indexField - columns, sdict = _rec_to_dict(data) if exclude is None: @@ -2943,124 +2935,6 @@ def combineMult(self, other): """ return self.mul(other, fill_value=1.) 
- def toDataMatrix(self): # pragma: no cover - warnings.warn("toDataMatrix will disappear in next release " - "as there is no longer a DataMatrix class", - FutureWarning) - return self.copy() - - def rows(self): # pragma: no cover - """Alias for the frame's index""" - warnings.warn("Replace usage of .rows() with .index, will be removed " - "in next release", FutureWarning) - return self.index - - def cols(self): # pragma: no cover - """Return sorted list of frame's columns""" - warnings.warn("Replace usage of .cols() with .columns, will be " - "removed in next release", FutureWarning) - return list(self.columns) - - def asMatrix(self, *args, **kwargs): # pragma: no cover - warnings.warn("asMatrix is deprecated. Use 'as_matrix' or .values " - "instead", FutureWarning) - return self.as_matrix(*args, **kwargs) - - @classmethod - def fromRecords(cls, *args, **kwargs): # pragma: no cover - warnings.warn("fromRecords is deprecated. Use 'from_records' " - "instead", FutureWarning) - return cls.from_records(*args, **kwargs) - - @classmethod - def fromcsv(cls, *args, **kwargs): # pragma: no cover - warnings.warn("fromcsv is deprecated. Use 'from_csv' " - "instead", FutureWarning) - return cls.from_csv(*args, **kwargs) - - combineFirst = deprecate('combineFirst', combine_first) - getXS = deprecate('getXS', xs) - merge = deprecate('merge', join) - toRecords = deprecate('toRecords', to_records) - toDict = deprecate('toDict', to_dict) - toString = deprecate('toString', to_string) - _firstTimeWithValue = deprecate('_firstTimeWithValue', first_valid_index) - _lastTimeWithValue = deprecate('_lastTimeWithValue', last_valid_index) - toCSV = deprecate('toCSV', to_csv) - - def dropEmptyRows(self, specificColumns=None): # pragma: no cover - """ - Return DataFrame with rows omitted containing ALL NaN values - for optionally specified set of columns. - - Parameters - ---------- - specificColumns : list-like, optional keyword - Columns to consider in removing NaN values. As a typical - application, you might provide the list of the columns involved in - a regression to exlude all the missing data in one shot. - - Returns - ------- - This DataFrame with rows containing any NaN values deleted - """ - warnings.warn("dropEmptyRows is deprecated. Use dropna(how='all')", - FutureWarning) - return self.dropna(axis=0, subset=specificColumns, how='all') - - def dropIncompleteRows(self, specificColumns=None, - minObs=None): # pragma: no cover - """ - Return DataFrame with rows omitted containing ANY NaN values for - optionally specified set of columns. - - Parameters - ---------- - minObs : int or None (default) - Instead of requiring all the columns to have observations, require - only minObs observations - specificColumns : list-like, optional keyword - Columns to consider in removing NaN values. As a typical - application, you might provide the list of the columns involved in - a regression to exlude all the missing data in one shot. - - Returns - ------- - This DataFrame with rows containing any NaN values deleted - - """ - warnings.warn("dropEmptyRows is deprecated. Use dropna()", - FutureWarning) - if minObs is None: - return self.dropna(axis=0, subset=specificColumns, how='any') - else: - return self.dropna(axis=0, subset=specificColumns, thresh=minObs) - - def tapply(self, func): # pragma: no cover - """ - Apply func to the transposed DataFrame, results as per apply - """ - warnings.warn("tapply is deprecated. 
Use apply(f, axis=1)", - FutureWarning) - return self.apply(func, axis=1) - - def tgroupby(self, keyfunc, applyfunc): # pragma: no cover - """ - Aggregate columns based on passed function - - Parameters - ---------- - keyfunc : function - applyfunc : function - - Returns - ------- - y : DataFrame - """ - warnings.warn("tgroupby is deprecated. Use groupby with axis=1", - FutureWarning) - return self.T.groupby(keyfunc).aggregate(applyfunc).T - def group_agg(values, bounds, f): """ R-style aggregator diff --git a/pandas/core/index.py b/pandas/core/index.py index 7582e06ca1c98..418b0940a136e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,7 +7,7 @@ from pandas.core.common import (adjoin as _adjoin, _stringify, _is_bool_indexer, _asarray_tuplesafe) -from pandas.util.decorators import deprecate, cache_readonly +from pandas.util.decorators import cache_readonly import pandas._tseries as lib __all__ = ['Index'] @@ -595,11 +595,6 @@ def copy(self, order='C'): cp.__dict__.update(self.__dict__) return cp - #---------------------------------------------------------------------- - # deprecated stuff - - asOfDate = deprecate('asOfDate', asof) - class Int64Index(Index): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index d0378821cac14..64f8e449d151e 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -14,7 +14,6 @@ from pandas.core.frame import DataFrame, _union_indexes from pandas.core.generic import AxisProperty, NDFrame from pandas.core.series import Series -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas._tseries as _tseries @@ -1080,12 +1079,6 @@ def _get_join_index(self, other, how): join_minor = self.minor_axis.union(other.minor_axis) return join_major, join_minor - #---------------------------------------------------------------------- - # Deprecated stuff - - getMinorXS = deprecate('getMinorXS', minor_xs) - getMajorXS = deprecate('getMajorXS', major_xs) - WidePanel = Panel #------------------------------------------------------------------------------- @@ -1277,8 +1270,6 @@ def _to_wide_mixed(self, mask): columns=self.minor_axis) return Panel.from_dict(data) - toWide = deprecate('toWide', to_wide) - def toCSV(self, path): def format_cols(items): cols = ['Major', 'Minor'] + list(items) diff --git a/pandas/core/series.py b/pandas/core/series.py index d6db7bbdb6c7b..bc921dfd31141 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -8,8 +8,6 @@ import csv import itertools import operator -import sys -import warnings from numpy import nan, ndarray import numpy as np @@ -20,7 +18,6 @@ from pandas.core.generic import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import _SeriesIndexer, _maybe_droplevels -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas.core.datetools as datetools @@ -1156,7 +1153,7 @@ def argsort(self, axis=0, kind='quicksort', order=None): else: return Series(np.argsort(values), index=self.index, name=self.name) - def order(self, na_last=True, ascending=True, **kwds): + def order(self, na_last=True, ascending=True): """ Sorts Series object, by value, maintaining index-value link @@ -1179,11 +1176,6 @@ def _try_mergesort(arr): # stable sort not available for object dtype return arr.argsort() - if 'missingAtEnd' in kwds: # pragma: no cover - warnings.warn("missingAtEnd is deprecated, use na_last", - FutureWarning) - 
na_last = kwds['missingAtEnd'] - arr = self.values sortedIdx = np.empty(len(self), dtype=np.int32) @@ -1852,24 +1844,6 @@ def mapper_f(x): def weekday(self): return Series([d.weekday() for d in self.index], index=self.index) - #---------------------------------------------------------------------- - # Deprecated stuff - - @classmethod - def fromValue(cls, value=nan, index=None, dtype=None): # pragma: no cover - warnings.warn("'fromValue', can call Series(value, index=index) now", - FutureWarning) - return Series(value, index=index, dtype=dtype) - - asOf = deprecate('asOf', asof) - toDict = deprecate('toDict', to_dict) - toString = deprecate('toString', to_string) - merge = deprecate('merge', map) - applymap = deprecate('applymap', apply) - combineFirst = deprecate('combineFirst', combine_first) - _firstTimeWithValue = deprecate('_firstTimeWithValue', first_valid_index) - _lastTimeWithValue = deprecate('_lastTimeWithValue', last_valid_index) - toCSV = deprecate('toCSV', to_csv) class TimeSeries(Series): pass diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4ec49ce188d75..4124cfc740c49 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -379,43 +379,3 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None, return _simple_parser(data, header=header, index_col=index_col, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values) - -#------------------------------------------------------------------------------- -# Deprecated stuff - -import warnings - -def parseCSV(filepath, header=0, skiprows=None, indexCol=0, - na_values=None): # pragma: no cover - """ - Parse CSV file into a DataFrame object. Try to parse dates if possible. - """ - warnings.warn("parseCSV is deprecated. Use read_csv instead", FutureWarning) - return read_csv(filepath, header=header, skiprows=skiprows, - index_col=indexCol, na_values=na_values, - parse_dates=True) - -def parseText(filepath, sep='\t', header=0, - indexCol=0, colNames=None): # pragma: no cover - """ - Parse whitespace separated file into a DataFrame object. - Try to parse dates if possible. - """ - warnings.warn("parseText is deprecated. Use read_table instead", - FutureWarning) - return read_table(filepath, sep=sep, header=header, index_col=indexCol, - names=colNames, parse_dates=True) - - -def parseExcel(filepath, header=None, indexCol=0, - sheetname=None, **kwds): # pragma: no cover - """ - - """ - warnings.warn("parseExcel is deprecated. Use the ExcelFile class instead", - FutureWarning) - excel_file = ExcelFile(filepath) - return excel_file.parse(sheetname, header=header, index_col=indexCol, - parse_dates=True) - - From 02b522a8d6601560f3b851e5e66c03979f7183f2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 18:18:04 -0400 Subject: [PATCH 035/161] DOC: release notes --- RELEASE.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index e12bdab36f0d8..ae36de25af9b4 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -103,6 +103,8 @@ pandas 0.5.0 - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. 
Should be reported upstream to matplotlib (GH #224) + - Fixed problem in which data would get upcasted to object dtype in + GroupBy.apply operations (GH #237) Thanks ------ From 088e20fd6dc2c7f6ed8b0bf25b313f4760e880e0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 18:30:25 -0400 Subject: [PATCH 036/161] DOC: release note --- RELEASE.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index ae36de25af9b4..2d50038780160 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -10,6 +10,11 @@ pandas 0.5.0 **Release date:** not yet released + + +Thanks to Thomas Kluyver and others for contributing patches and feedback +leading to some of the minor API changes listed below. + **API Changes** - `read_table`, `read_csv`, and `ExcelFile.parse` default arguments for @@ -106,11 +111,6 @@ pandas 0.5.0 - Fixed problem in which data would get upcasted to object dtype in GroupBy.apply operations (GH #237) -Thanks ------- - - - Thomas Kluyver - pandas 0.4.3 ============ From 857038a31a032b9edac445a6cceff747b56c4cc7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 18:30:32 -0400 Subject: [PATCH 037/161] ENH: speed enhancement in multi-key joining --- pandas/core/frame.py | 2 -- pandas/core/index.py | 18 +++++++++++++----- pandas/core/internals.py | 31 ------------------------------- pandas/src/reindex.pyx | 17 +++++++++++++++++ 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c17016fed808..2001c7b0a6567 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2225,8 +2225,6 @@ def _join_on(self, other, on, lsuffix, rsuffix): if isinstance(on, (list, tuple)): join_key = zip(*[self[k] for k in on]) - join_key = common._asarray_tuplesafe(join_key, - dtype=np.object_) else: join_key = np.asarray(self[on]) diff --git a/pandas/core/index.py b/pandas/core/index.py index 418b0940a136e..9894c2a8c987e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -403,21 +403,24 @@ def get_indexer(self, target, method=None): ------- (indexer, mask) : (ndarray, ndarray) """ - target = _ensure_index(target) - method = self._get_method(method) if self.dtype != target.dtype: target = Index(target, dtype=object) if method == 'pad': + target = _ensure_index(target) indexer = lib.pad_object(self, target, self.indexMap, target.indexMap) elif method == 'backfill': + target = _ensure_index(target) indexer = lib.backfill_object(self, target, self.indexMap, target.indexMap) elif method is None: - indexer = lib.merge_indexer_object(target, self.indexMap) + if isinstance(target, list): + indexer = lib.merge_indexer_list(target, self.indexMap) + else: + indexer = lib.merge_indexer_object(target, self.indexMap) else: raise ValueError('unrecognized method: %s' % method) return indexer @@ -1256,8 +1259,13 @@ def get_indexer(self, target, method=None): indexer = lib.backfill_object(self_index, target_index, self_index.indexMap, target.indexMap) else: - indexer = lib.merge_indexer_object(target_index, - self_index.indexMap) + if isinstance(target_index, list): + indexer = lib.merge_indexer_list(target_index, + self_index.indexMap) + else: + indexer = lib.merge_indexer_object(target_index, + self_index.indexMap) + return indexer def reindex(self, target, method=None): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6d29ae56dba56..55f316ae18cda 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -498,37 +498,6 @@ def fast_2d_xs(self, loc, 
copy=False): return result - # def fast_2d_xs2(self, loc, copy=False): - # """ - - # """ - # if len(self.blocks) == 1: - # result = self.blocks[0].values[:, loc] - # if copy: - # result = result.copy() - # return result - - # if not copy: - # raise Exception('cannot get view of mixed-type or ' - # 'non-consolidated DataFrame') - - # def _get_put_function(source_dtype, out_dtype): - # src = source_dtype.name - # dst = out_dtype.name - # return getattr(lib, 'put2d_%s_%s' % (src, dst)) - - # out_dtype = np.dtype(_interleaved_dtype(self.blocks)) - - # items = self.items - # n = len(items) - # out = np.empty(n, dtype=out_dtype) - # for blk in self.blocks: - # values = blk.values - # indexer = lib.merge_indexer_object(blk.items, items.indexMap) - # putf = _get_put_function(values.dtype, out_dtype) - # putf(values, indexer, loc, out) - # return out - def consolidate(self): """ Join together blocks having same dtype diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx index 97e20f3911965..447e04d059cc9 100644 --- a/pandas/src/reindex.pyx +++ b/pandas/src/reindex.pyx @@ -225,3 +225,20 @@ def take_join_contiguous(ndarray[float64_t, ndim=2] lvalues, outbuf[0] = rvalues[ridx, j] outbuf = outbuf + 1 +@cython.wraparound(False) +@cython.boundscheck(False) +def merge_indexer_list(list values, dict oldMap): + cdef int i, j, length, newLength + cdef object idx + cdef ndarray[int32_t] fill_vec + + newLength = len(values) + fill_vec = np.empty(newLength, dtype=np.int32) + for i from 0 <= i < newLength: + idx = values[i] + if idx in oldMap: + fill_vec[i] = oldMap[idx] + else: + fill_vec[i] = -1 + + return fill_vec From bf639d11fd0e57f371950085765824968d2278c2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Oct 2011 18:34:36 -0400 Subject: [PATCH 038/161] ENH: revert multi-key join attempted speed enhancement, use object arrays always --- pandas/core/frame.py | 2 ++ pandas/core/index.py | 17 +++++------------ 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2001c7b0a6567..4c17016fed808 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2225,6 +2225,8 @@ def _join_on(self, other, on, lsuffix, rsuffix): if isinstance(on, (list, tuple)): join_key = zip(*[self[k] for k in on]) + join_key = common._asarray_tuplesafe(join_key, + dtype=np.object_) else: join_key = np.asarray(self[on]) diff --git a/pandas/core/index.py b/pandas/core/index.py index 9894c2a8c987e..d54e97fdd45dc 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -405,22 +405,19 @@ def get_indexer(self, target, method=None): """ method = self._get_method(method) + target = _ensure_index(target) + if self.dtype != target.dtype: target = Index(target, dtype=object) if method == 'pad': - target = _ensure_index(target) indexer = lib.pad_object(self, target, self.indexMap, target.indexMap) elif method == 'backfill': - target = _ensure_index(target) indexer = lib.backfill_object(self, target, self.indexMap, target.indexMap) elif method is None: - if isinstance(target, list): - indexer = lib.merge_indexer_list(target, self.indexMap) - else: - indexer = lib.merge_indexer_object(target, self.indexMap) + indexer = lib.merge_indexer_object(target, self.indexMap) else: raise ValueError('unrecognized method: %s' % method) return indexer @@ -1259,12 +1256,8 @@ def get_indexer(self, target, method=None): indexer = lib.backfill_object(self_index, target_index, self_index.indexMap, target.indexMap) else: - if isinstance(target_index, list): - indexer = 
lib.merge_indexer_list(target_index,
-                                                   self_index.indexMap)
-        else:
-            indexer = lib.merge_indexer_object(target_index,
-                                               self_index.indexMap)
+        indexer = lib.merge_indexer_object(target_index,
+                                           self_index.indexMap)

         return indexer

From 35253aaa36462a17ba46a7fa7a7a0f655654821e Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Fri, 14 Oct 2011 19:52:43 -0400
Subject: [PATCH 039/161] ENH: use level name when calling unstack

---
 pandas/core/reshape.py          | 4 ++--
 pandas/tests/test_multilevel.py | 5 +++++
 pandas/tools/pivot.py           | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)
 create mode 100644 pandas/tools/pivot.py

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 3fe653819f690..ebaa82d061a35 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -61,8 +61,8 @@ def __init__(self, values, index, level=-1, value_columns=None):
         self.new_index_levels = list(index.levels)
         self.new_index_names = list(index.names)

-        self.removed_name = self.new_index_names.pop(level)
-        self.removed_level = self.new_index_levels.pop(level)
+        self.removed_name = self.new_index_names.pop(self.level)
+        self.removed_level = self.new_index_levels.pop(self.level)

         v = self.level
         lshape = self.index.levshape

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index b6cfac1a7b023..f4a93a5c0bb6f 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -377,6 +377,11 @@ def test_stack_unstack_preserve_names(self):
         restacked = unstacked.stack()
         self.assertEquals(restacked.index.names, self.frame.index.names)

+    def test_unstack_level_name(self):
+        result = self.frame.unstack('second')
+        expected = self.frame.unstack(level=1)
+        assert_frame_equal(result, expected)
+
     def test_groupby_transform(self):
         s = self.frame['A']
         grouper = s.index.get_level_values(0)

diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
new file mode 100644
index 0000000000000..8b137891791fe
--- /dev/null
+++ b/pandas/tools/pivot.py
@@ -0,0 +1 @@
+

From 01788630c1d696898161f8de0d969e2c4fc9746d Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 15 Oct 2011 14:31:31 -0400
Subject: [PATCH 040/161] ENH: pivot table function

---
 pandas/tools/pivot.py | 53 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
index 8b137891791fe..5d2eba93ea52e 100644
--- a/pandas/tools/pivot.py
+++ b/pandas/tools/pivot.py
@@ -1 +1,54 @@
+from pandas import DataFrame
+import numpy as np
+
+def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean,
+                fill_value=None):
+    """
+    Group data on xby + yby, aggregate with aggfunc, unstack yby into columns.
+    """
+
+    xby = [] if xby is None else list(xby)
+    yby = [] if yby is None else list(yby)
+
+    keys = xby + yby
+    grouped = data.groupby(keys)
+
+    if values is not None:
+        grouped = grouped[values]
+
+    agged = grouped.agg(aggfunc)
+
+    table = agged
+    for k in yby:
+        table = table.unstack(level=k)
+
+    if fill_value is not None:
+        table = table.fillna(value=fill_value)
+
+    return table
+
+def pprint_table(table):
+    pass
+
+if __name__ == '__main__':
+    def _sample(values, n):
+        indexer = np.random.randint(0, len(values), n)
+        return np.asarray(values).take(indexer)
+
+    levels = [['a', 'b', 'c', 'd'],
+              ['foo', 'bar', 'baz'],
+              ['one', 'two'],
+              ['US', 'JP', 'UK']]
+    names = ['k1', 'k2', 'k3', 'k4']
+
+    n = 100000
+
+    data = {}
+    for name, level in zip(names, levels):
+        data[name] = _sample(level, n)
+
+    data['values'] = np.random.randn(n)
+    data = DataFrame(data)
+
+    table = pivot_table(data, values='values', xby=['k1', 'k2'],
+                        yby=['k3', 'k4'])
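A quick usage sketch of the pivot_table function as it lands in patch 040 above: group on all of the xby + yby keys, aggregate, then pivot each yby key into the columns. The frame and key names here are illustrative, not taken from the patch::

    import numpy as np
    from pandas import DataFrame
    from pandas.tools.pivot import pivot_table

    df = DataFrame({'k1': ['a', 'a', 'b', 'b'],
                    'k2': ['one', 'two', 'one', 'two'],
                    'values': [1., 2., 3., 4.]})

    # rows keyed by k1, columns keyed by k2, cells = mean of 'values';
    # missing combinations are filled with 0 via fill_value
    table = pivot_table(df, values='values', xby=['k1'], yby=['k2'],
                        aggfunc=np.mean, fill_value=0)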
From 8b86400a77d9732f1ff13b052d66d072b83f7978 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 15 Oct 2011 14:51:42 -0400
Subject: [PATCH 041/161] BUG: test for join bug fixed since 0.4.3, GH #238

---
 pandas/tests/test_frame.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index e509c455a0d41..2250be415e96c 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -2638,6 +2638,11 @@ def test_join_index_mixed(self):
     def test_join_on_series(self):
         pass

+    def test_join_empty_bug(self):
+        # generated an exception in 0.4.3
+        x = DataFrame()
+        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
+
     def test_clip(self):
         median = self.frame.median().median()

From 78383098ea628c0a54216a5751a08a55ac79ec22 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 15 Oct 2011 15:05:20 -0400
Subject: [PATCH 042/161] BUG: iteritems and _series not assigning Series.name

---
 RELEASE.rst              | 3 +++
 pandas/core/frame.py     | 3 +--
 pandas/core/internals.py | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 2d50038780160..034507ef3cf2a 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -110,6 +110,9 @@ leading to some of the minor API changes listed below.
     be reported upstream to matplotlib (GH #224)
   - Fixed problem in which data would get upcasted to object dtype in
     GroupBy.apply operations (GH #237)
+  - Fixed outer join bug with empty DataFrame (GH #238)
+  - DataFrame.iteritems was not returning Series with the name attribute
+    set. Also neither was DataFrame._series

 pandas 0.4.3
 ============

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4c17016fed808..5f777813364c6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -276,8 +276,7 @@ def __iter__(self):

     def iteritems(self):
         """Iterator over (column, series) pairs"""
-        series = self._series
-        return ((k, series[k]) for k in self.columns)
+        return ((k, self[k]) for k in self.columns)

     iterkv = iteritems
     if py3compat.PY3:

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 55f316ae18cda..5eb0212450716 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -875,7 +875,7 @@ def _blocks_to_series_dict(blocks, index=None):

     for block in blocks:
         for item, vec in zip(block.items, block.values):
-            series_dict[item] = Series(vec, index=index)
+            series_dict[item] = Series(vec, index=index, name=item)

     return series_dict

 def _interleaved_dtype(blocks):

From defa04059e280d1a98707e4f43a82ed8732c4db6 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 15 Oct 2011 15:12:06 -0400
Subject: [PATCH 043/161] BUG: can now store datetime.date objects in PyTables,
 address GH #231

---
 pandas/io/pytables.py            | 20 +++++++++++++-------
 pandas/io/tests/test_pytables.py |  5 +++++
 pandas/tests/test_frame.py       |  9 +++++++++
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 436e61d70b949..e0e3c3d5f6082 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -5,7 +5,7 @@

 # pylint: disable-msg=E1101,W0613,W0603

-from datetime import datetime
+from datetime import datetime, date
 import time

 import numpy as np
@@ -706,11 +706,14 @@ def _convert_index(index):
     # Let's assume the index is homogeneous
     values = np.asarray(index)

-    import time
-    if isinstance(values[0], datetime):
-        converted = np.array([time.mktime(v.timetuple())
-                              for v in values], dtype=np.int64)
-        return converted, 'datetime',
_tables().Time64Col() + if isinstance(values[0], (datetime, date)): + if isinstance(values[0], datetime): + kind = 'datetime' + else: + kind = 'date' + converted = np.array([time.mktime(v.timetuple()) for v in values], + dtype=np.int64) + return converted, kind, _tables().Time64Col() elif isinstance(values[0], basestring): converted = np.array(list(values), dtype=np.str_) itemsize = converted.dtype.itemsize @@ -722,7 +725,6 @@ def _convert_index(index): else: # pragma: no cover raise ValueError('unrecognized index type %s' % type(values[0])) - def _read_array(group, key): import tables node = getattr(group, key) @@ -737,6 +739,10 @@ def _unconvert_index(data, kind): if kind == 'datetime': index = np.array([datetime.fromtimestamp(v) for v in data], dtype=object) + elif kind == 'date': + index = np.array([date.fromtimestamp(v) for v in data], + dtype=object) + elif kind in ('string', 'integer'): index = np.array(data, dtype=object) else: # pragma: no cover diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ee470d593733b..3c76ae6067a23 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -214,6 +214,11 @@ def test_frame(self): self.assertRaises(ValueError, self._check_roundtrip, df[:0], tm.assert_frame_equal) + def test_can_serialize_dates(self): + rng = [x.date() for x in DateRange('1/1/2000', '1/30/2000')] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + self._check_roundtrip(frame, tm.assert_frame_equal) + def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2250be415e96c..70d1bc05eedc6 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3165,6 +3165,15 @@ def test_take(self): expected = self.mixed_frame.ix[:, ['foo', 'B', 'C', 'A', 'D']] assert_frame_equal(result, expected) + def test_iterkv_names(self): + for k, v in self.mixed_frame.iterkv(): + self.assertEqual(v.name, k) + + def test_series_put_names(self): + series = self.mixed_frame._series + for k, v in series.iteritems(): + self.assertEqual(v.name, k) + def _join_by_hand(a, b, how='left'): join_index = a.index.join(b.index, how=how) From a84afb96729bc103148c6f3a92f8d4993dde41ce Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Oct 2011 15:20:49 -0400 Subject: [PATCH 044/161] BUG: store Index and Series names in HDFStore --- RELEASE.rst | 2 ++ pandas/io/pytables.py | 12 +++++++++--- pandas/io/tests/test_pytables.py | 27 ++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 034507ef3cf2a..d2edf1b2ae36f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -113,6 +113,8 @@ leading to some of the minor API changes listed below. - Fixed outer join bug with empty DataFrame (GH #238) - DataFrame.iteritems was not returning Series with the name attribute set. 
Also neither was DataFrame._series + - Can store datetime.date objects in HDFStore (GH #231) + - Index and Series names are now stored in HDFStore pandas 0.4.3 ============ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e0e3c3d5f6082..8b8ac500c775a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -10,7 +10,7 @@ import numpy as np from pandas import (Series, TimeSeries, DataFrame, Panel, LongPanel, - MultiIndex) + Index, MultiIndex) from pandas.core.common import adjoin import pandas._tseries as lib @@ -331,6 +331,7 @@ def _write_to_group(self, key, value, table=False, append=False, def _write_series(self, group, series): self._write_index(group, 'index', series.index) self._write_array(group, 'values', series.values) + group._v_attrs.name = series.name def _write_frame(self, group, df): self._write_block_manager(group, df._data) @@ -440,6 +441,7 @@ def _write_index(self, group, key, index): self._write_array(group, key, converted) node = getattr(group, key) node._v_attrs.kind = kind + node._v_attrs.name = index.name def _read_index(self, group, key): try: @@ -503,7 +505,10 @@ def _read_index_node(self, node): except Exception: name = None - return name, _unconvert_index(data, kind) + index = Index(_unconvert_index(data, kind)) + index.name = name + + return name, index def _write_array(self, group, key, value): if key in group: @@ -617,7 +622,8 @@ def _read_group(self, group, where=None): def _read_series(self, group, where=None): index = self._read_index(group, 'index') values = _read_array(group, 'values') - return Series(values, index=index) + name = getattr(group._v_attrs, 'name', None) + return Series(values, index=index, name=name) def _read_legacy_series(self, group, where=None): index = self._read_index_legacy(group, 'index') diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3c76ae6067a23..6d980d8b13e70 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -232,7 +232,7 @@ def test_store_hierarchical(self): self._check_roundtrip(frame.T, tm.assert_frame_equal) self._check_roundtrip(frame['A'], tm.assert_series_equal) - # check that the + # check that the names are stored try: store = HDFStore(self.scratchpath) store['frame'] = frame @@ -242,6 +242,31 @@ def test_store_hierarchical(self): store.close() os.remove(self.scratchpath) + def test_store_index_name(self): + df = tm.makeDataFrame() + df.index.name = 'foo' + try: + store = HDFStore(self.scratchpath) + store['frame'] = df + recons = store['frame'] + assert(recons.index.name == 'foo') + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_series_name(self): + df = tm.makeDataFrame() + series = df['A'] + + try: + store = HDFStore(self.scratchpath) + store['series'] = series + recons = store['series'] + assert(recons.name == 'A') + finally: + store.close() + os.remove(self.scratchpath) + def test_store_mixed(self): def _make_one(): df = tm.makeDataFrame() From cc2241d966db20f66fc9d6a1b659471f29555a25 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Oct 2011 15:51:47 -0400 Subject: [PATCH 045/161] BUG: can create empty Panel, address GH #239 --- RELEASE.rst | 1 + pandas/core/frame.py | 3 +++ pandas/core/panel.py | 5 ++++- pandas/tests/test_panel.py | 6 ++++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index d2edf1b2ae36f..55a3ad0e47e1f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -115,6 +115,7 @@ leading to some of the minor API changes listed below. 
set. Also neither was DataFrame._series - Can store datetime.date objects in HDFStore (GH #231) - Index and Series names are now stored in HDFStore + - Can create empty Panel (GH #239) pandas 0.4.3 ============ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f777813364c6..212bd6331fb04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3031,6 +3031,9 @@ def _get_index(v): def _union_indexes(indexes): + if len(indexes) == 0: + return Index([]) + if len(indexes) == 1: result = indexes[0] if isinstance(result, list): diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 64f8e449d151e..836c4eefcc427 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -186,7 +186,7 @@ class Panel(NDFrame): __div__ = _arith_method(operator.div, '__div__') __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__') - def __init__(self, data, items=None, major_axis=None, minor_axis=None, + def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None): """ Represents wide format panel data, stored as 3-dimensional array @@ -205,6 +205,9 @@ def __init__(self, data, items=None, major_axis=None, minor_axis=None, copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input """ + if data is None: + data = {} + passed_axes = [items, major_axis, minor_axis] if isinstance(data, BlockManager): mgr = data diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index fc5cdbcac1603..8defc8c7e1c7a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -560,6 +560,12 @@ def test_constructor_cast(self): data = [[['foo', 'bar', 'baz']]] self.assertRaises(ValueError, Panel, data, dtype=float) + def test_constructor_empty_panel(self): + empty = Panel() + self.assert_(len(empty.items) == 0) + self.assert_(len(empty.major_axis) == 0) + self.assert_(len(empty.minor_axis) == 0) + def test_consolidate(self): self.assert_(self.panel._data.is_consolidated()) From 579956a9c59ad5e885463eb437e5bd072b1a3411 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Oct 2011 15:54:53 -0400 Subject: [PATCH 046/161] BUG: Panel.__repr__ works with len-0 major/minor axes --- RELEASE.rst | 1 + pandas/core/panel.py | 14 ++++++++++---- pandas/tests/test_panel.py | 6 ++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 55a3ad0e47e1f..970707e74c255 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -116,6 +116,7 @@ leading to some of the minor API changes listed below. 
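The heart of patch 045: the Panel constructor's data argument now defaults to an empty dict, and _union_indexes tolerates zero inputs, so a bare constructor call works. The new test amounts to::

    from pandas import Panel

    empty = Panel()  # previously failed: data was a required argument (GH #239)
    assert len(empty.items) == 0
    assert len(empty.major_axis) == 0
    assert len(empty.minor_axis) == 0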
- Can store datetime.date objects in HDFStore (GH #231) - Index and Series names are now stored in HDFStore - Can create empty Panel (GH #239) + - Panel.__repr__ raised exception on length-0 major/minor axes pandas 0.4.3 ============ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 836c4eefcc427..bc591c530f3ca 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -317,11 +317,17 @@ def __repr__(self): dims = 'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) - major = 'Major axis: %s to %s' % (self.major_axis[0], - self.major_axis[-1]) + if len(self.major_axis) > 0: + major = 'Major axis: %s to %s' % (self.major_axis[0], + self.major_axis[-1]) + else: + major = 'Major axis: None' - minor = 'Minor axis: %s to %s' % (self.minor_axis[0], - self.minor_axis[-1]) + if len(self.minor_axis) > 0: + minor = 'Minor axis: %s to %s' % (self.minor_axis[0], + self.minor_axis[-1]) + else: + minor = 'Minor axis: None' if len(self.items) > 0: items = 'Items: %s to %s' % (self.items[0], self.items[-1]) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 8defc8c7e1c7a..cb3e7b4a993b3 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -826,6 +826,10 @@ def test_join_overlap(self): expected = p1_suf.join(p2_suf).join(no_overlap) assert_panel_equal(joined, expected) + def test_repr_empty(self): + empty = Panel() + repr(empty) + class TestLongPanel(unittest.TestCase): def setUp(self): @@ -1144,6 +1148,8 @@ def test_pivot(self): # corner case, empty df = pivot(np.array([]), np.array([]), np.array([])) + + def test_group_agg(): values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) bounds = np.arange(5) * 2 From 4db8b72e528eccd97043dddf8361511c70af86ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Oct 2011 16:25:48 -0400 Subject: [PATCH 047/161] DOC: release notes --- RELEASE.rst | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 970707e74c255..1a3a1db2f8978 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -10,10 +10,18 @@ pandas 0.5.0 **Release date:** not yet released - - -Thanks to Thomas Kluyver and others for contributing patches and feedback -leading to some of the minor API changes listed below. +This major release of pandas includes a number of API changes (see below) and +cleanup of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, +some new features, performance enhancements, and includes a new IPython +completer hook to enable tab completion of DataFrame columns accesses as +attributes (a new feature). + +In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases +0.4.1, 0.4.2, and 0.4.3 brought some significant new functionality and +performance improvements that are worth taking a look at. + +Thanks to Thomas Kluyver and others for contributing patches and providing +feedback on the library. **API Changes** From ea45212d065a7f3331fbf663cb8e60e5d300396d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Oct 2011 16:48:27 -0400 Subject: [PATCH 048/161] DOC: release notes --- RELEASE.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 1a3a1db2f8978..4f834dde2fec4 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -10,11 +10,11 @@ pandas 0.5.0 **Release date:** not yet released -This major release of pandas includes a number of API changes (see below) and -cleanup of deprecated APIs from pre-0.4.0 releases. 
There are also bug fixes,
-some new features, performance enhancements, and includes a new IPython
-completer hook to enable tab completion of DataFrame columns accesses as
-attributes (a new feature).
+This release of pandas includes a number of API changes (see below) and cleanup
+of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, some new
+features, performance enhancements, and a new IPython completer hook that
+enables tab completion of DataFrame column accesses as attributes.

 In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases
 0.4.1, 0.4.2, and 0.4.3 brought some significant new functionality and
 performance improvements that are worth taking a look at.

From d5213b0c872cb950c9315989ebfd2f358cbb59f8 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 15 Oct 2011 20:50:41 -0400
Subject: [PATCH 049/161] ENH: starting Cython parser infrastructure. add
 boolean handling. speed up read_csv by a lot

---
 pandas/io/parsers.py            |  84 +++++++--------
 pandas/io/tests/test_parsers.py |  23 +++++
 pandas/src/parsing.pyx          | 175 ++++++++++++++++++++++++++++++++
 pandas/src/tseries.pyx          |   1 +
 pandas/tests/test_frame.py      |   4 +-
 pandas/tools/pivot.py           |   3 +-
 6 files changed, 248 insertions(+), 42 deletions(-)
 create mode 100644 pandas/src/parsing.pyx

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 4124cfc740c49..d0f7b01334f11 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -9,6 +9,7 @@

 from pandas.core.index import Index, MultiIndex
 from pandas.core.frame import DataFrame
+import pandas._tseries as lib

 def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None,
              names=None, skiprows=None, na_values=None, parse_dates=False,
@@ -167,6 +168,19 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0,

         return DataFrame(index=index, columns=columns)

+
+    # common NA values
+    # no longer excluding inf representations
+    # '1.#INF','-1.#INF', '1.#INF000000',
+    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
+                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
+                     'nan', ''])
+    if na_values is None:
+        na_values = NA_VALUES
+    else:
+        na_values = set(list(na_values)) | NA_VALUES
+
+
     if index_col is None and len(content[0]) == len(columns) + 1:
         index_col = 0

@@ -194,7 +208,7 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0,
     if np.isscalar(index_col):
         if parse_dates:
             index = _try_parse_dates(index, parser=date_parser)
-        index = Index(_maybe_convert_int(np.array(index, dtype=object)))
+        index = Index(_convert_ndarray(index, na_values))
     else:
         arrays = _maybe_convert_int_mindex(index, parse_dates,
                                            date_parser)
@@ -211,39 +225,26 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0,
         raise Exception('wrong number of columns')

     data = dict(izip(columns, zipped_content))
-    data = _floatify(data, na_values=na_values)
-    data = _convert_to_ndarrays(data)
+    data = _convert_to_ndarrays(data, na_values)
+
     return DataFrame(data=data, columns=columns, index=index)

-def _floatify(data_dict, na_values=None):
-    """
+
+def _floatify(tup, na_values):
     """
-    # common NA values
-    # no longer excluding inf representations
-    # '1.#INF','-1.#INF', '1.#INF000000',
-    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
-                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
-                     'nan', ''])
-    if na_values is None:
-        na_values = NA_VALUES
-    else:
-        na_values = set(list(na_values)) | NA_VALUES

-    def _convert_float(val):
-        if val in na_values:
-            return np.nan
+    """
+    try:
+        if isinstance(tup, tuple):
-        else:
-            try:
-                return np.float64(val)
-            except Exception:
-                return val
-
-    result = {}
-    for col, values in data_dict.iteritems():
-        result[col] = [_convert_float(val) for val in values]
-
-    return result
+            return lib.maybe_convert_float_tuple(tup, na_values)
+        else:
+            return lib.maybe_convert_float_list(tup, na_values)
+    except Exception:
+        if isinstance(tup, tuple):
+            return lib.string_to_ndarray_tuple(tup)
+        else:
+            return lib.string_to_ndarray_list(tup)

 def _maybe_convert_int(arr):
     if len(arr) == 0: # pragma: no cover
@@ -251,17 +252,18 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0,

     try:
         if arr.dtype == np.object_:
-            return arr.astype(int)
-
-        if abs(arr[0] - int(arr[0])) < 1e-10:
-            casted = arr.astype(int)
-            if (np.abs(casted - arr) < 1e-10).all():
-                return casted
+            return lib.maybe_convert_int_object(arr)
+        return lib.maybe_convert_int(arr)
     except (TypeError, ValueError):
         pass

     return arr

+def _maybe_convert_bool(arr):
+    if arr.dtype == np.object_:
+        return lib.maybe_convert_bool_object(arr)
+    return arr
+
 def _maybe_convert_int_mindex(index, parse_dates, date_parser):
     for i in range(len(index)):
         try:
@@ -273,16 +275,18 @@ def _maybe_convert_int_mindex(index, parse_dates, date_parser):

     return index

-def _convert_to_ndarrays(dct):
+def _convert_to_ndarrays(dct, na_values):
     result = {}
     for c, values in dct.iteritems():
-        try:
-            values = np.array(values, dtype=float)
-        except Exception:
-            values = np.array(values, dtype=object)
-        result[c] = _maybe_convert_int(values)
+        result[c] = _convert_ndarray(values, na_values)
     return result

+def _convert_ndarray(tup, na_values):
+    values = _floatify(tup, na_values)
+    values = _maybe_convert_int(values)
+    values = _maybe_convert_bool(values)
+    return values
+
 def _try_parse_dates(values, parser=None):
     if parser is None:
         try:

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 5f71f793f2384..8dca1a7381486 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -56,6 +56,20 @@ def test_unnamed_columns(self):
                                     ['A', 'B', 'C', 'Unnamed: 3',
                                      'Unnamed: 4']))

+    def test_string_nas(self):
+        data = """A,B,C
+a,b,c
+d,,f
+,g,h
+"""
+        result = read_csv(StringIO(data))
+        expected = DataFrame([['a', 'b', 'c'],
+                              ['d', np.nan, 'f'],
+                              [np.nan, 'g', 'h']],
+                             columns=['A', 'B', 'C'])
+
+        assert_frame_equal(result, expected)
+
     def test_duplicate_columns(self):
         data = """A,A,B,B,B
 1,2,3,4,5
@@ -151,6 +165,15 @@ def test_read_table_duplicate_index(self):
 """
         self.assertRaises(Exception, read_csv, StringIO(data), index_col=0)

+    def test_parse_bools(self):
+        data = """A,B
+True,1
+False,2
+True,3
+"""
+        data = read_csv(StringIO(data))
+        self.assert_(data['A'].dtype == np.bool_)
+
 def curpath():
     pth, _ = os.path.split(os.path.abspath(__file__))
     return pth

diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx
new file mode 100644
index 0000000000000..715c7fc83628f
--- /dev/null
+++ b/pandas/src/parsing.pyx
@@ -0,0 +1,175 @@
+cimport cpython
+
+cdef extern from "math.h":
+    double fabs(double)
+
+def maybe_convert_float_list(tuple values):
+    cdef:
+        Py_ssize_t i, n
+        ndarray[float64_t] result
+        object val
+
+    n = len(values)
+    result = np.empty(n, dtype='f8')
+
+    for i from 0 <= i < n:
+        val = values[i]
+        result[i] = val
+
+    return result
+
+def maybe_convert_float_tuple(tuple values, set na_values):
+    cdef:
+        Py_ssize_t i, n
+        ndarray[float64_t] result
+        object val
+
+    n = len(values)
+    result = np.empty(n, dtype='f8')
+
+    for i from 0 <= i < n:
+        val = values[i]
+
+        if cpython.PyFloat_Check(val):
+            result[i] = val
+        elif val in na_values:
+            result[i] = nan
+        elif val is None:
+            result[i] = nan
+        elif
len(val) == 0: + result[i] = nan + else: + result[i] = float(val) + + return result + +def maybe_convert_float_list(list values, set na_values): + cdef: + Py_ssize_t i, n + ndarray[float64_t] result + object val + + n = len(values) + result = np.empty(n, dtype='f8') + + for i from 0 <= i < n: + val = values[i] + + if cpython.PyFloat_Check(val): + result[i] = val + elif val in na_values: + result[i] = nan + elif val is None: + result[i] = nan + elif len(val) == 0: + result[i] = nan + else: + result[i] = float(val) + + return result + +def string_to_ndarray_tuple(tuple values): + cdef: + Py_ssize_t i, n + ndarray[object] result + object val, onan + + n = len(values) + result = np.empty(n, dtype=object) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + + if val == '': + result[i] = onan + else: + result[i] = val + + return result + +def string_to_ndarray_list(list values): + cdef: + Py_ssize_t i, n + ndarray[object] result + object val, onan + + n = len(values) + result = np.empty(n, dtype=object) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + + if val == '': + result[i] = onan + else: + result[i] = val + + return result + +def maybe_convert_bool_object(ndarray[object] arr): + cdef: + Py_ssize_t i, n + ndarray[uint8_t, cast=True] result + object val + + n = len(arr) + result = np.empty(n, dtype=bool) + + for i from 0 <= i < n: + val = arr[i] + + if val == 'True': + result[i] = 1 + elif val == 'False': + result[i] = 0 + else: + return arr + + return result + +cdef float64_t FP_ERR = 1e-10 + +def maybe_convert_int(ndarray[float64_t] arr): + cdef: + Py_ssize_t i, n + ndarray[int64_t] result + float64_t val + + n = len(arr) + result = np.empty(n, dtype='i8') + for i from 0 <= i < n: + val = arr[i] + result[i] = val + + # NA + if val != val: + return arr + + if fabs(result[i] - val) > FP_ERR: + return arr + + return result + +def maybe_convert_int_object(ndarray[object] arr): + cdef: + Py_ssize_t i, n + ndarray[int64_t] result + object val + + n = len(arr) + result = np.empty(n, dtype='i8') + for i from 0 <= i < n: + val = arr[i] + result[i] = val + + # NA + if val != val: + return arr + + if fabs(result[i] - val) > FP_ERR: + return arr + + return result + diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index ceccc3c1536a9..765c5c7eb2dcb 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -353,3 +353,4 @@ include "groupby.pyx" include "moments.pyx" include "reindex.pyx" include "generated.pyx" +include "parsing.pyx" diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 70d1bc05eedc6..92ba3c343d279 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1616,8 +1616,10 @@ def test_to_csv_float32_nanrep(self): os.remove(pth) def test_to_csv_withcommas(self): - "Commas inside fields should be correctly escaped when saving as CSV." + path = '__tmp__' + # Commas inside fields should be correctly escaped when saving as CSV. 
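Taken together, the helpers added in patch 049 above give _convert_ndarray a three-step cascade: try floats (with NA tokens mapped to NaN), downcast to integers when that loses nothing, then try booleans for 'True'/'False' columns; each step returns its input unchanged the moment it sees a value it cannot handle. A simplified pure-Python sketch of that cascade, not the Cython code itself (the helper name convert_column is hypothetical)::

    import numpy as np

    def convert_column(values, na_values):
        # float pass: NA tokens (including '') become NaN, else float(val)
        try:
            floats = np.array([np.nan if v in na_values else float(v)
                               for v in values], dtype=np.float64)
        except (TypeError, ValueError):
            # fall back to objects; bool pass mirrors maybe_convert_bool_object
            if all(v in ('True', 'False') for v in values):
                return np.array(values, dtype=object) == 'True'
            return np.array([np.nan if v == '' else v for v in values],
                            dtype=object)
        # int pass: only downcast when every value is exactly integral
        if not np.isnan(floats).any() and \
                (floats == floats.astype(np.int64)).all():
            return floats.astype(np.int64)
        return floats

Patch 050, the next commit below, revisits the integer step to make the float->int downcast less aggressive.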
+ df = DataFrame({'A':[1,2,3], 'B':['5,6','7,8','9,0']}) df.to_csv(path) df2 = DataFrame.from_csv(path) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 5d2eba93ea52e..cb235f5b07341 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -50,5 +50,6 @@ def _sample(values, n): data['values'] = np.random.randn(n) data = DataFrame(data) - table = pivot_table(data, values='values', xby=['k1', 'k2'], yby=['k3', 'k4']) + table = pivot_table(data, values='values', + xby=['k1', 'k2'], yby=['k3', 'k4']) From 8927f0d3df2f50a98a4134e4145a53b12305676c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 16 Oct 2011 10:29:34 -0400 Subject: [PATCH 050/161] ENH: better about int conversions, consolidated numeric conversion function --- RELEASE.rst | 2 + pandas/io/parsers.py | 3 +- pandas/io/tests/test_parsers.py | 10 +++++ pandas/src/parsing.pyx | 78 ++++++++++++++++++++++++--------- 4 files changed, 70 insertions(+), 23 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 4f834dde2fec4..7ea238baecc49 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -125,6 +125,8 @@ feedback on the library. - Index and Series names are now stored in HDFStore - Can create empty Panel (GH #239) - Panel.__repr__ raised exception on length-0 major/minor axes + - Be less aggressive about converting float->int in `read_csv` and + `read_table` pandas 0.4.3 ============ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d0f7b01334f11..5208f30c07aca 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -237,7 +237,7 @@ def _floatify(tup, na_values): """ try: if isinstance(tup, tuple): - return lib.maybe_convert_float_tuple(tup, na_values) + return lib.maybe_convert_numeric(tup, na_values) else: return lib.maybe_convert_float_list(tup, na_values) except Exception: @@ -283,7 +283,6 @@ def _convert_to_ndarrays(dct, na_values): def _convert_ndarray(tup, na_values): values = _floatify(tup, na_values) - values = _maybe_convert_int(values) values = _maybe_convert_bool(values) return values diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 8dca1a7381486..cd1a578473306 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -174,6 +174,16 @@ def test_parse_bools(self): data = read_csv(StringIO(data)) self.assert_(data['A'].dtype == np.bool_) + def test_int_conversion(self): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + data = read_csv(StringIO(data)) + self.assert_(data['A'].dtype == np.float_) + self.assert_(data['B'].dtype == np.int_) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 715c7fc83628f..e792342d1130f 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -18,55 +18,91 @@ def maybe_convert_float_list(tuple values): return val -def maybe_convert_float_tuple(tuple values, set na_values): +def maybe_convert_numeric(tuple values, set na_values): cdef: Py_ssize_t i, n - ndarray[float64_t] result + ndarray[float64_t] floats + ndarray[int64_t] ints + bint seen_float = 0 object val + float64_t fval n = len(values) - result = np.empty(n, dtype='f8') + + floats = np.empty(n, dtype='f8') + ints = np.empty(n, dtype='i8') for i from 0 <= i < n: val = values[i] if cpython.PyFloat_Check(val): - result[i] = val + floats[i] = val + seen_float = 1 elif val in na_values: - result[i] = nan + floats[i] = nan + seen_float = 1 elif val is None: - result[i] = nan + floats[i] = nan + seen_float = 1 elif len(val) == 0: - result[i] = 
nan + floats[i] = nan + seen_float = 1 else: - result[i] = float(val) - - return result - -def maybe_convert_float_list(list values, set na_values): + fval = float(val) + floats[i] = fval + if not seen_float: + if '.' in val: + seen_float = 1 + else: + ints[i] = fval + + if seen_float: + return floats + else: + return ints + +def maybe_convert_numeric_list(list values, set na_values): cdef: Py_ssize_t i, n - ndarray[float64_t] result + ndarray[float64_t] floats + ndarray[int64_t] ints + bint seen_float = 0 object val + float64_t fval n = len(values) - result = np.empty(n, dtype='f8') + + floats = np.empty(n, dtype='f8') + ints = np.empty(n, dtype='i8') for i from 0 <= i < n: val = values[i] if cpython.PyFloat_Check(val): - result[i] = val + floats[i] = val + seen_float = 1 elif val in na_values: - result[i] = nan + floats[i] = nan + seen_float = 1 elif val is None: - result[i] = nan + floats[i] = nan + seen_float = 1 elif len(val) == 0: - result[i] = nan + floats[i] = nan + seen_float = 1 else: - result[i] = float(val) - - return result + fval = float(val) + floats[i] = fval + if not seen_float: + if '.' in val: + seen_float = 1 + else: + ints[i] = fval + + if seen_float: + return floats + else: + return ints def string_to_ndarray_tuple(tuple values): cdef: From fa5599322da184fa89e4f476fc75a61f04ffeab6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 16 Oct 2011 11:09:22 -0400 Subject: [PATCH 051/161] ENH: parsers: fast zip(*args), made everything ndarray-based, faster --- pandas/io/parsers.py | 78 ++++------------ pandas/io/tests/test_parsers.py | 16 +++- pandas/src/parsing.pyx | 157 +++++++++----------------------- 3 files changed, 75 insertions(+), 176 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5208f30c07aca..6d869a0f0eaa8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -148,8 +148,6 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, columns = colNames content = lines - zipped_content = zip(*content) - if len(content) == 0: # pragma: no cover if index_col is not None: if np.isscalar(index_col): @@ -181,18 +179,21 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, na_values = set(list(na_values)) | NA_VALUES + zipped_content = list(lib.to_object_array(content).T) + if index_col is None and len(content[0]) == len(columns) + 1: index_col = 0 # no index column specified, so infer that's what is wanted if index_col is not None: if np.isscalar(index_col): - if index_col == 0 and len(content[0]) == len(columns) + 1: - index = zipped_content[0] - zipped_content = zipped_content[1:] + index = zipped_content.pop(index_col) + + if len(content[0]) == len(columns) + 1: + name = None else: - index = zipped_content.pop(index_col) - columns.pop(index_col) + name = columns.pop(index_col) + else: # given a list of index idx_names = [] index = [] @@ -204,11 +205,10 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, columns.remove(idx_names[i]) zipped_content.remove(index[i]) - if np.isscalar(index_col): if parse_dates: - index = _try_parse_dates(index, parser=date_parser) - index = Index(_convert_ndarray(index, na_values)) + index = lib.try_parse_dates(index, parser=date_parser) + index = Index(_convert_types(index, na_values), name=name) else: arrays = _maybe_convert_int_mindex(index, parse_dates, date_parser) @@ -224,28 +224,11 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, if len(columns) != len(zipped_content): raise Exception('wrong number of columns') - data = dict(izip(columns, 
zipped_content)) + data = dict((k, v) for k, v in zip(columns, zipped_content)) data = _convert_to_ndarrays(data, na_values) - return DataFrame(data=data, columns=columns, index=index) - -def _floatify(tup, na_values): - """ - - """ - try: - if isinstance(tup, tuple): - return lib.maybe_convert_numeric(tup, na_values) - else: - return lib.maybe_convert_float_list(tup, na_values) - except Exception: - if isinstance(tup, tuple): - return lib.string_to_ndarray_tuple(tup) - else: - return lib.string_to_ndarray_list(tup) - def _maybe_convert_int(arr): if len(arr) == 0: # pragma: no cover return arr @@ -259,11 +242,6 @@ def _maybe_convert_int(arr): return arr -def _maybe_convert_bool(arr): - if arr.dtype == np.object_: - return lib.maybe_convert_bool_object(arr) - return arr - def _maybe_convert_int_mindex(index, parse_dates, date_parser): for i in range(len(index)): try: @@ -271,41 +249,25 @@ def _maybe_convert_int_mindex(index, parse_dates, date_parser): index[i] = map(int, index[i]) except ValueError: if parse_dates: - index[i] = _try_parse_dates(index[i], date_parser) + index[i] = lib.try_parse_dates(index[i], date_parser) return index def _convert_to_ndarrays(dct, na_values): result = {} for c, values in dct.iteritems(): - result[c] = _convert_ndarray(values, na_values) + result[c] = _convert_types(values, na_values) return result -def _convert_ndarray(tup, na_values): - values = _floatify(tup, na_values) - values = _maybe_convert_bool(values) - return values - -def _try_parse_dates(values, parser=None): - if parser is None: - try: - from dateutil import parser - parse_date = parser.parse - except ImportError: # pragma: no cover - def parse_date(s): - try: - return datetime.strptime(s, '%m/%d/%Y') - except Exception: - return s - else: - parse_date = parser - - # EAFP +def _convert_types(values, na_values): try: - return [parse_date(val) for val in values] + values = lib.maybe_convert_numeric(values, na_values) except Exception: - # failed - return values + lib.sanitize_objects(values) + + if values.dtype == np.object_: + return lib.maybe_convert_bool(values) + return values #------------------------------------------------------------------------------- # ExcelFile class diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index cd1a578473306..4a84d21294318 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas import DataFrame +from pandas import DataFrame, Index from pandas.io.parsers import read_csv, read_table, ExcelFile from pandas.util.testing import assert_almost_equal, assert_frame_equal @@ -76,7 +76,7 @@ def test_duplicate_columns(self): 6,7,8,9,10 11,12,13,14,15 """ - df = read_table(StringIO(data), sep=',', index_col=None) + df = read_table(StringIO(data), sep=',') self.assert_(np.array_equal(df.columns, ['A', 'A.1', 'B', 'B.1', 'B.2'])) @@ -86,7 +86,7 @@ def test_csv_mixed_type(self): b,3,4 c,4,5 """ - df = read_csv(StringIO(data), index_col=None) + df = read_csv(StringIO(data)) # TODO def test_csv_custom_parser(self): @@ -120,6 +120,7 @@ def test_read_csv_dataframe(self): df = read_csv(self.csv1, index_col=0, parse_dates=True) df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) + self.assert_(df.index.name == 'index') self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.values.dtype == np.float64) assert_frame_equal(df, df2) @@ -184,6 +185,15 @@ def 
test_int_conversion(self): self.assert_(data['A'].dtype == np.float_) self.assert_(data['B'].dtype == np.int_) + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = read_csv(StringIO(data)) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index e792342d1130f..8a67b90d43692 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -3,65 +3,31 @@ cimport cpython cdef extern from "math.h": double fabs(double) -def maybe_convert_float_list(tuple values): +def to_object_array(list rows): cdef: - Py_ssize_t i, n - ndarray[float64_t] result - object val + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + list row - n = len(values) - result = np.empty(n, dtype='f8') + n = len(rows) + k = 0 for i from 0 <= i < n: - val = values[i] - result[i] = val - - return val - -def maybe_convert_numeric(tuple values, set na_values): - cdef: - Py_ssize_t i, n - ndarray[float64_t] floats - ndarray[int64_t] ints - bint seen_float = 0 - object val - float64_t fval - - n = len(values) + tmp = len(rows[i]) + if tmp > k: + k = tmp - floats = np.empty(n, dtype='f8') - ints = np.empty(n, dtype='i8') + result = np.empty((n, k), dtype=object) for i from 0 <= i < n: - val = values[i] + row = rows[i] - if cpython.PyFloat_Check(val): - floats[i] = val - seen_float = 1 - elif val in na_values: - floats[i] = nan - seen_float = 1 - elif val is None: - floats[i] = nan - seen_float = 1 - elif len(val) == 0: - floats[i] = nan - seen_float = 1 - else: - fval = float(val) - floats[i] = fval - if not seen_float: - if '.' in val: - seen_float = 1 - else: - ints[i] = fval + for j from 0 <= j < len(row): + result[i, j] = row[j] - if seen_float: - return floats - else: - return ints + return result -def maybe_convert_numeric_list(list values, set na_values): +def maybe_convert_numeric(ndarray[object] values, set na_values): cdef: Py_ssize_t i, n ndarray[float64_t] floats @@ -104,47 +70,53 @@ def maybe_convert_numeric_list(list values, set na_values): else: return ints -def string_to_ndarray_tuple(tuple values): +def try_parse_dates(ndarray[object] values, parser=None): cdef: Py_ssize_t i, n ndarray[object] result - object val, onan - n = len(values) - result = np.empty(n, dtype=object) - onan = np.nan + from datetime import datetime - for i from 0 <= i < n: - val = values[i] + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + try: + from dateutil import parser + parse_date = parser.parse + except ImportError: # pragma: no cover + def parse_date(s): + try: + return datetime.strptime(s, '%m/%d/%Y') + except Exception: + return s + else: + parse_date = parser - if val == '': - result[i] = onan - else: - result[i] = val + # EAFP + try: + for i from 0 <= i < n: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values return result -def string_to_ndarray_list(list values): +def sanitize_objects(ndarray[object] values): cdef: Py_ssize_t i, n - ndarray[object] result object val, onan n = len(values) - result = np.empty(n, dtype=object) onan = np.nan for i from 0 <= i < n: val = values[i] - if val == '': - result[i] = onan - else: - result[i] = val - - return result + values[i] = onan -def maybe_convert_bool_object(ndarray[object] arr): +def maybe_convert_bool(ndarray[object] arr): cdef: Py_ssize_t i, n ndarray[uint8_t, cast=True] result @@ -164,48 +136,3 @@ def 
maybe_convert_bool_object(ndarray[object] arr):
             return arr

     return result
-
-cdef float64_t FP_ERR = 1e-10
-
-def maybe_convert_int(ndarray[float64_t] arr):
-    cdef:
-        Py_ssize_t i, n
-        ndarray[int64_t] result
-        float64_t val
-
-    n = len(arr)
-    result = np.empty(n, dtype='i8')
-    for i from 0 <= i < n:
-        val = arr[i]
-        result[i] = val
-
-        # NA
-        if val != val:
-            return arr
-
-        if fabs(result[i] - val) > FP_ERR:
-            return arr
-
-    return result
-
-def maybe_convert_int_object(ndarray[object] arr):
-    cdef:
-        Py_ssize_t i, n
-        ndarray[int64_t] result
-        object val
-
-    n = len(arr)
-    result = np.empty(n, dtype='i8')
-    for i from 0 <= i < n:
-        val = arr[i]
-        result[i] = val
-
-        # NA
-        if val != val:
-            return arr
-
-        if fabs(result[i] - val) > FP_ERR:
-            return arr
-
-    return result
-

From 988e2a5d6ec585501f3fddf805da673624861b52 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 16 Oct 2011 12:22:16 -0400
Subject: [PATCH 052/161] DOC: release note

---
 RELEASE.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/RELEASE.rst b/RELEASE.rst
index 7ea238baecc49..9638236e5ec97 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -96,6 +96,8 @@ feedback on the library.

 **Improvements to existing features**

+  - Major performance improvements in file parsing functions `read_csv` and
+    `read_table`
   - Added Cython function for converting tuples to ndarray very fast. Speeds up
     many MultiIndex-related operations
   - File parsing functions like `read_csv` and `read_table` will explicitly

From 3a7af7e55704b7dfaf40923a351404e9b59502f7 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 17 Oct 2011 18:11:46 -0400
Subject: [PATCH 053/161] ENH: DataReader improvements

---
 RELEASE.rst            |  8 +++++---
 pandas/io/data.py      | 27 +++++++++------------------
 pandas/src/parsing.pyx |  2 +-
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 9638236e5ec97..27c77177906c2 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -127,9 +127,11 @@ feedback on the library.
   - Index and Series names are now stored in HDFStore
   - Can create empty Panel (GH #239)
   - Panel.__repr__ raised exception on length-0 major/minor axes
-  - Be less aggressive about converting float->int in `read_csv` and
-    `read_table`
-  - `read_csv` will set Index name attribute
+  - `read_csv` / `read_table` fixes
+    - Be less aggressive about converting float->int in cases of floating point
+      representations of integers like 1.0, 2.0, etc.
+    - "True"/"False" will now get correctly converted to boolean
+    - Index name attribute will get set when specifying an index column

 pandas 0.4.3
 ============

diff --git a/pandas/io/data.py b/pandas/io/data.py
index 3ecd3caf3ef4f..fa7d8aa519f2d 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -11,7 +11,7 @@
 from zipfile import ZipFile
 from StringIO import StringIO

-from pandas import DataFrame, Index
+from pandas import DataFrame, read_csv

 class DataReader(list):

@@ -19,7 +19,7 @@ class DataReader(list):
     Imports data from a number of online sources.

     Currently supports Yahoo! finance, St. Louis FED (FRED), and Kenneth
     French's data library.
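Patch 053 here rewires get_data_yahoo to delegate CSV parsing to the new read_csv fast path instead of splitting lines by hand. Usage is unchanged; a sketch (ticker and date range are illustrative)::

    import datetime as dt
    from pandas.io.data import DataReader

    # daily prices as a DataFrame indexed by parsed dates
    goog = DataReader('GOOG', data_source='yahoo',
                      start=dt.datetime(2010, 1, 1),
                      end=dt.datetime(2011, 1, 1))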
- + Parameters ---------- name : str @@ -52,8 +52,8 @@ def __new__(cls, name, data_source=None, start=None, end=None, **kwds): start = dt.datetime(2010, 1, 1) if(end is None): end = dt.datetime.today() - - self = super(DataReader, cls).__new__(cls) + + self = super(DataReader, cls).__new__(cls) if(data_source == "yahoo"): return self.get_data_yahoo(name=name, start=start, end=end) @@ -86,20 +86,11 @@ def get_data_yahoo(self, name=None, start=None, end=None): '&g=d' + \ '&ignore=.csv' - days = urllib.urlopen(url).readlines() - - data = np.array([day[:-2].split(',') for day in days]) - header = [str.lower(name) for name in data[0]] - index = Index([dt.datetime.strptime(row[0], "%Y-%m-%d") for row in data[1:]]) - data = np.array([[row[1], row[2], row[3], row[4], int(row[5]), row[6]] for row in data[1:]], dtype=float) - - data = DataFrame(data, index, columns=header[1:]).sort() - - return data - - + lines = urllib.urlopen(url).read() + return read_csv(StringIO(lines), index_col=0, parse_dates=True) - def get_data_fred(self, name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): + def get_data_fred(self, name=None, start=dt.datetime(2010, 1, 1), + end=dt.datetime.today()): """ Get data for the given name from the St. Louis FED (FRED). Date format is datetime @@ -144,7 +135,7 @@ def get_data_famafrench(self, name): dataset = [d.split() for d in data[(file_edges[i] + 1):file_edges[i+1]]] if(len(dataset) > 10): ncol = np.median(np.array([len(d) for d in dataset])) - header_index = np.where(np.array([len(d) for d in dataset]) == (ncol-1))[0][-1] + header_index = np.where(np.array([len(d) for d in dataset]) == (ncol-1))[0][-1] header = dataset[header_index] # to ensure the header is unique header = [str(j + 1) + " " + header[j] for j in range(len(header))] diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 8a67b90d43692..7ab5e9474265b 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -57,7 +57,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = nan seen_float = 1 else: - fval = float(val) + fval = cpython.PyFloat_FromString(val) floats[i] = fval if not seen_float: if '.' in val: From e823ec7d2672e0ee8e7743a8fc663250c7351ced Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Oct 2011 12:31:29 -0400 Subject: [PATCH 054/161] BUG: cpython PyFloat_FromString problem --- pandas/src/parsing.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 7ab5e9474265b..8a67b90d43692 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -57,7 +57,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = nan seen_float = 1 else: - fval = cpython.PyFloat_FromString(val) + fval = float(val) floats[i] = fval if not seen_float: if '.' 
in val: From ca5a702a7bc224d76573b9856c3f168690735b6a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Oct 2011 14:31:40 -0400 Subject: [PATCH 055/161] BUG: single-key DataFrame.join with list failed, GH #246 --- pandas/core/frame.py | 11 +++++++---- pandas/tests/test_frame.py | 12 ++++++++++-- pandas/util/testing.py | 4 ++-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 212bd6331fb04..7b4ffcb5bc4ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2223,11 +2223,14 @@ def _join_on(self, other, on, lsuffix, rsuffix): return self if isinstance(on, (list, tuple)): - join_key = zip(*[self[k] for k in on]) - join_key = common._asarray_tuplesafe(join_key, - dtype=np.object_) + if len(on) == 1: + join_key = self[on[0]].values + else: + join_key = zip(*[self[k] for k in on]) + join_key = common._asarray_tuplesafe(join_key, + dtype=np.object_) else: - join_key = np.asarray(self[on]) + join_key = self[on].values new_data = self._data.join_on(other._data, join_key, axis=1, lsuffix=lsuffix, rsuffix=rsuffix) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 92ba3c343d279..d61b1f5713a7e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2554,8 +2554,6 @@ def test_join_on(self): # merge column not p resent self.assertRaises(Exception, target.join, source, on='E') - # corner cases - # nothing to merge merged = target.join(source.reindex([]), on='C') @@ -2568,6 +2566,16 @@ def test_join_on(self): self.assertRaises(Exception, target.join, source, on='C', how='left') + def test_join_on_singlekey_list(self): + df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + def test_join_on_multikey(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c398eebc8cb59..92982e100b3aa 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -70,7 +70,7 @@ def isiterable(obj): def assert_almost_equal(a, b): if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) - + if isinstance(a, basestring): assert a == b, (a, b) return True @@ -116,9 +116,9 @@ def assert_dict_equal(a, b, compare_keys=True): assert_almost_equal(a[k], b[k]) def assert_series_equal(left, right): + assert_almost_equal(left, right) assert(left.dtype == right.dtype) assert(left.index.equals(right.index)) - assert_almost_equal(left, right) def assert_frame_equal(left, right): assert(isinstance(left, DataFrame)) From c375fa3999f3ccd25152610834e0108030a23bef Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Oct 2011 15:47:43 -0400 Subject: [PATCH 056/161] ENH: implement inner join on in DataFrame.join, GH #248 --- pandas/core/frame.py | 22 +-- pandas/core/internals.py | 15 +- pandas/core/sparse.py | 2 +- pandas/tests/test_frame.py | 287 ++++++++++++++++++++----------------- 4 files changed, 178 insertions(+), 148 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7b4ffcb5bc4ea..476bcd7da8d91 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2179,14 +2179,15 @@ def _get_raw_column(self, col): def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): """ Join columns with other DataFrame either on index or on a key - column + column. 
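Patch 055 above makes a one-element list such as on=['key'] take the same scalar path as on='key' instead of building object-dtype single-element tuples (GH #246). The regression test boils down to::

    from pandas import DataFrame
    from pandas.util.testing import assert_frame_equal

    df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
    df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

    # both spellings should now produce the same joined frame
    assert_frame_equal(df.join(df2, on=['key']), df.join(df2, on='key'))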
Parameters ---------- other : DataFrame Index should be similar to one of the columns in this one on : string, default None - Column name to use, otherwise join on index + Column name to use, otherwise join on index. Just like an Excel + VLOOKUP operation how : {'left', 'right', 'outer', 'inner'} How to handle indexes of the two objects. Default: 'left' for joining on index, None otherwise @@ -2203,18 +2204,17 @@ def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): ------- joined : DataFrame """ + if how is None: + how = 'left' if on is not None: - if how is not None: - raise Exception('how parameter is not valid when ' - '*on* specified') - return self._join_on(other, on, lsuffix, rsuffix) + return self._join_on(other, on, how, lsuffix, rsuffix) else: - if how is None: - how = 'left' - return self._join_index(other, how, lsuffix, rsuffix) - def _join_on(self, other, on, lsuffix, rsuffix): + def _join_on(self, other, on, how, lsuffix, rsuffix): + if how not in ['left', 'inner']: + raise Exception('Only inner / left joins currently supported') + if isinstance(other, Series): assert(other.name is not None) other = DataFrame({other.name : other}) @@ -2232,7 +2232,7 @@ def _join_on(self, other, on, lsuffix, rsuffix): else: join_key = self[on].values - new_data = self._data.join_on(other._data, join_key, axis=1, + new_data = self._data.join_on(other._data, join_key, how=how, axis=1, lsuffix=lsuffix, rsuffix=rsuffix) return self._constructor(new_data) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5eb0212450716..4247b6f31c858 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -729,15 +729,24 @@ def _is_indexed_like(self, other): return False return True - def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None): + def join_on(self, other, on, how='left', axis=1, lsuffix=None, + rsuffix=None): this, other = self._maybe_rename_join(other, lsuffix, rsuffix) other_axis = other.axes[axis] indexer = other_axis.get_indexer(on) # TODO: deal with length-0 case? or does it fall out? - mask = indexer == -1 - needs_masking = len(on) > 0 and mask.any() + if how == 'left': + mask = indexer == -1 + needs_masking = len(on) > 0 and mask.any() + else: + mask = indexer != -1 + this = this.take(mask.nonzero()[0], axis=axis) + indexer = indexer[mask] + mask = None + needs_masking = False + other_blocks = [] for block in other.blocks: newb = block.reindex_axis(indexer, mask, needs_masking, axis=axis) diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py index c02b0cee22011..0df5880a6580f 100644 --- a/pandas/core/sparse.py +++ b/pandas/core/sparse.py @@ -1104,7 +1104,7 @@ def add_suffix(self, suffix): f = ('%s' + ('%s' % suffix)).__mod__ return self.rename(columns=f) - def _join_on(self, other, on): + def _join_on(self, other, on, how, lsuffix, rsuffix): # need to implement? 
raise NotImplementedError diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d61b1f5713a7e..91b28561bf43d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2520,139 +2520,6 @@ def test_combineMult(self): comb = self.empty.combineMult(self.frame) assert_frame_equal(comb, self.frame) - def test_join_on(self): - index, data = tm.getMixedTypeDict() - target = DataFrame(data, index=index) - - # Join on string value - source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']}, - index=data['C']) - merged = target.join(source, on='C') - self.assert_(np.array_equal(merged['MergedA'], target['A'])) - self.assert_(np.array_equal(merged['MergedD'], target['D'])) - - # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'], - 'value' : [0, 0, 1, 1, 2]}) - assert_frame_equal(joined, expected) - - # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - self.assert_(np.isnan(joined['two']['c'])) - self.assert_(np.isnan(joined['three']['c'])) - - # merge column not p resent - self.assertRaises(Exception, target.join, source, on='E') - - # nothing to merge - merged = target.join(source.reindex([]), on='C') - - # overlap - source_copy = source.copy() - source_copy['A'] = 0 - self.assertRaises(Exception, target.join, source_copy, on='A') - - # can't specify how - self.assertRaises(Exception, target.join, source, on='C', - how='left') - - def test_join_on_singlekey_list(self): - df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) - - # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') - - assert_frame_equal(joined, expected) - - def test_join_on_multikey(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - data = DataFrame({'key1' : key1, 'key2' : key2, - 'data' : data}) - - joined = data.join(to_join, on=['key1', 'key2']) - - join_key = Index(zip(key1, key2)) - indexer = to_join.index.get_indexer(join_key) - ex_values = to_join.values.take(indexer, axis=0) - ex_values[indexer == -1] = np.nan - expected = data.join(DataFrame(ex_values, columns=to_join.columns)) - - # TODO: columns aren't in the same order yet - assert_frame_equal(joined, expected.ix[:, joined.columns]) - - def test_join_index_mixed(self): - - df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - self.assert_(df1['B'].dtype == np.int_) - self.assert_(df1['D'].dtype == np.bool_) - - df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, - 
index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) - - # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] - df1.columns = expected_columns[:4] - df2.columns = expected_columns[4:] - expected = _join_by_hand(df1, df2) - assert_frame_equal(joined, expected) - - # no overlapping blocks - df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' - - df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. - - for kind in JOIN_TYPES: - joined = df1.join(df2, how=kind) - expected = _join_by_hand(df1, df2, how=kind) - assert_frame_equal(joined, expected) - - joined = df2.join(df1, how=kind) - expected = _join_by_hand(df2, df1, how=kind) - assert_frame_equal(joined, expected) - - def test_join_on_series(self): - pass - - def test_join_empty_bug(self): - # generated an exception in 0.4.3 - x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') - def test_clip(self): median = self.frame.median().median() @@ -3184,6 +3051,160 @@ def test_series_put_names(self): for k, v in series.iteritems(): self.assertEqual(v.name, k) + + +class TestDataFrameJoin(unittest.TestCase): + + def setUp(self): + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']}, + index=data['C']) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + self.assert_(np.array_equal(merged['MergedA'], target['A'])) + self.assert_(np.array_equal(merged['MergedD'], target['D'])) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'], + 'value' : [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assert_(np.isnan(joined['two']['c'])) + self.assert_(np.isnan(joined['three']['c'])) + + # merge column not present + self.assertRaises(Exception, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(Exception, target.join, source_copy, on='A') + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assert_(col in merged) + self.assert_(merged[col].isnull().all()) + + def test_join_on_inner(self): + df = DataFrame({'key' : ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_(np.array_equal(joined['key'], expected['key'])) + self.assert_(np.array_equal(joined['value'], expected['value'])) + self.assert_(joined.index.equals(expected.index)) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1, 2]},
index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_multikey(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + + joined = data.join(to_join, on=['key1', 'key2']) + + join_key = Index(zip(key1, key2)) + indexer = to_join.index.get_indexer(join_key) + ex_values = to_join.values.take(indexer, axis=0) + ex_values[indexer == -1] = np.nan + expected = data.join(DataFrame(ex_values, columns=to_join.columns)) + + # TODO: columns aren't in the same order yet + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_join_index_mixed(self): + + df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assert_(df1['B'].dtype == np.int_) + self.assert_(df1['D'].dtype == np.bool_) + + df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. 
+ + for kind in JOIN_TYPES: + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + pass + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + def _join_by_hand(a, b, how='left'): join_index = a.index.join(b.index, how=how) From a8b5728a53f46aa574388d63be981632c71faf40 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Oct 2011 16:27:44 -0400 Subject: [PATCH 057/161] BUG: join on length-0 frame produce correct columns --- pandas/core/frame.py | 3 --- pandas/core/internals.py | 1 - pandas/tests/test_frame.py | 5 +++++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 476bcd7da8d91..69e9d686134f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2219,9 +2219,6 @@ def _join_on(self, other, on, how, lsuffix, rsuffix): assert(other.name is not None) other = DataFrame({other.name : other}) - if len(other.index) == 0: - return self - if isinstance(on, (list, tuple)): if len(on) == 1: join_key = self[on[0]].values diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4247b6f31c858..e3dc1e296affd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -736,7 +736,6 @@ def join_on(self, other, on, how='left', axis=1, lsuffix=None, other_axis = other.axes[axis] indexer = other_axis.get_indexer(on) - # TODO: deal with length-0 case? or does it fall out? if how == 'left': mask = indexer == -1 needs_masking = len(on) > 0 and mask.any() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 91b28561bf43d..b7ac9a7d14842 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3106,6 +3106,11 @@ def test_join_with_len0(self): self.assert_(col in merged) self.assert_(merged[col].isnull().all()) + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_(merged2.columns.equals(merged.columns)) + self.assertEqual(len(merged2), 0) + def test_join_on_inner(self): df = DataFrame({'key' : ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value' : [0, 1]}, index=['a', 'b']) From 3f6b47edbadb3163d10134cbe0b635df5eefbda4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Oct 2011 18:06:03 -0400 Subject: [PATCH 058/161] ENH: select columns with DataFrame.__getitem__, GH #253 --- pandas/core/frame.py | 26 +++++++++++++++++++------- pandas/tests/test_frame.py | 13 +++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69e9d686134f7..850b927c25c92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -806,22 +806,34 @@ def __getitem__(self, key): new_data = self._data.get_slice(key, axis=1) return self._constructor(new_data) # either boolean or fancy integer index - elif isinstance(key, np.ndarray): - if len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d!' 
% - (len(key), len(self.index))) + elif isinstance(key, (np.ndarray, list)): + if isinstance(key, list): + key = np.array(key, dtype=object) # also raises Exception if object array with NA values if _is_bool_indexer(key): key = np.asarray(key, dtype=bool) - - new_index = self.index[key] - return self.reindex(new_index) + return self._getitem_array(key) elif isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) else: return self._getitem_single(key) + def _getitem_array(self, key): + if key.dtype == np.bool_: + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' % + (len(key), len(self.index))) + + new_index = self.index[key] + return self.reindex(new_index) + else: + indexer = self.columns.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise Exception("No column(s) named: %s" % str(key[mask])) + return self.reindex(columns=key) + def _slice(self, slobj, axis=0): if axis == 0: mgr_axis = 1 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b7ac9a7d14842..84576c173502f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -53,6 +53,19 @@ def test_getitem_iterator(self): expected = self.frame.ix[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) + def test_getitem_list(self): + result = self.frame[['B', 'A']] + result2 = self.frame[Index(['B', 'A'])] + + expected = self.frame.ix[:, ['B', 'A']] + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + self.assertRaises(Exception, self.frame.__getitem__, + ['B', 'A', 'foo']) + self.assertRaises(Exception, self.frame.__getitem__, + Index(['B', 'A', 'foo'])) + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] From cf457c326a9470c25f65fb317080584db44afb53 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 10:51:56 -0400 Subject: [PATCH 059/161] ENH: tuple version of to_object_array...should use fused type --- pandas/io/parsers.py | 13 ------------- pandas/src/parsing.pyx | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6d869a0f0eaa8..8ed64a0f69130 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -229,19 +229,6 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, return DataFrame(data=data, columns=columns, index=index) -def _maybe_convert_int(arr): - if len(arr) == 0: # pragma: no cover - return arr - - try: - if arr.dtype == np.object_: - return lib.maybe_convert_int_object(arr) - return lib.maybe_convert_int(arr) - except (TypeError, ValueError): - pass - - return arr - def _maybe_convert_int_mindex(index, parse_dates, date_parser): for i in range(len(index)): try: diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 8a67b90d43692..bf6f3be7fce01 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -27,6 +27,30 @@ def to_object_array(list rows): return result +def to_object_array_tuples(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i from 0 <= i < n: + row = rows[i] + + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + def maybe_convert_numeric(ndarray[object] values, set na_values): cdef: Py_ssize_t i, n From ee82f898b9b750af59a07f22f705d3c2b215fea1 Mon Sep 17 00:00:00 2001 From: Wes
McKinney Date: Wed, 19 Oct 2011 11:45:59 -0400 Subject: [PATCH 060/161] ENH: SQL-type converter type inference --- pandas/io/tests/test_parsers.py | 54 +++++++++++++++++++++++++++ pandas/src/parsing.pyx | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 4a84d21294318..c98289246649c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -11,6 +11,7 @@ from pandas import DataFrame, Index from pandas.io.parsers import read_csv, read_table, ExcelFile from pandas.util.testing import assert_almost_equal, assert_frame_equal +import pandas._tseries as lib class TestParsers(unittest.TestCase): @@ -194,6 +195,59 @@ def test_infer_index_col(self): data = read_csv(StringIO(data)) self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + +def test_convert_sql_column_floats(): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + +def test_convert_sql_column_strings(): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + +def test_convert_sql_column_unicode(): + arr = np.array([u'1.5', None, u'3', u'4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u'1.5', np.nan, u'3', u'4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + +def test_convert_sql_column_ints(): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + assert_same_values_and_dtype(result2, expected) + +def test_convert_sql_column_bools(): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + assert_same_values_and_dtype(result, expected) + +def test_convert_sql_column_decimals(): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + +def test_convert_sql_column_other(): + arr = np.array([1.5, None, 3, 4.2]) + +def assert_same_values_and_dtype(res, exp): + assert(res.dtype == exp.dtype) + assert_almost_equal(res, exp) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index bf6f3be7fce01..7a186b2edcdb6 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -94,6 +94,71 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): else: return ints +def convert_sql_column(ndarray[object] objects): + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[int64_t] ints + ndarray[uint8_t, cast=True] bools + bint seen_float = 0 + bint seen_int = 0 + bint seen_object = 0 + bint seen_bool = 0 + bint seen_null = 0 + object 
val, onan + float64_t fval, fnan + + n = len(objects) + + floats = np.empty(n, dtype='f8') + ints = np.empty(n, dtype='i8') + bools = np.empty(n, dtype=bool) + + onan = np.nan + fnan = np.nan + + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + objects[i] = onan + floats[i] = fnan + elif cpython.PyBool_Check(val): + seen_bool = 1 + bools[i] = val + elif cpython.PyInt_Check(val): + seen_int = 1 + floats[i] = val + if not seen_null: + ints[i] = val + elif cpython.PyFloat_Check(val): + floats[i] = val + seen_float = 1 + elif cpython.PyString_Check(val) or cpython.PyUnicode_Check(val): + seen_object = 1 + else: + try: + floats[i] = float(val) + seen_float = 1 + except Exception: + pass + + if seen_null: + if seen_float: + return floats + else: + return objects + else: + if seen_int: + return ints + elif seen_float: + return floats + elif seen_bool: + return bools + elif seen_object: + return objects + def try_parse_dates(ndarray[object] values, parser=None): cdef: Py_ssize_t i, n From c24418d5bf73ae3eff6b8ab0ab9a5c810f90eb9a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 11:50:16 -0400 Subject: [PATCH 061/161] BUG: corner case --- pandas/src/parsing.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 7a186b2edcdb6..cd8ee0b568b5b 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -102,7 +102,6 @@ def convert_sql_column(ndarray[object] objects): ndarray[uint8_t, cast=True] bools bint seen_float = 0 bint seen_int = 0 - bint seen_object = 0 bint seen_bool = 0 bint seen_null = 0 object val, onan @@ -135,9 +134,7 @@ def convert_sql_column(ndarray[object] objects): elif cpython.PyFloat_Check(val): floats[i] = val seen_float = 1 - elif cpython.PyString_Check(val) or cpython.PyUnicode_Check(val): - seen_object = 1 - else: + elif not (cpython.PyString_Check(val) or cpython.PyUnicode_Check(val)): try: floats[i] = float(val) seen_float = 1 @@ -156,7 +153,7 @@ def convert_sql_column(ndarray[object] objects): return floats elif seen_bool: return bools - elif seen_object: + else: return objects def try_parse_dates(ndarray[object] values, parser=None): From b67708104b7644123ea861d03ac72254083b7e9e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 13:26:20 -0400 Subject: [PATCH 062/161] BUG: parser refactoring, fix GH #257 and #258 --- pandas/io/parsers.py | 75 ++++++++++++---------- pandas/io/tests/test_parsers.py | 109 ++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 80 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8ed64a0f69130..61c2e0575d83f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,15 +2,13 @@ Module contains tools for processing files into DataFrames or other objects """ -from datetime import datetime -from itertools import izip import numpy as np from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame - import pandas._tseries as lib + def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, skiprows=None, na_values=None, parse_dates=False, date_parser=None): @@ -53,7 +51,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, return _simple_parser(lines, header=header, index_col=index_col, - colNames=names, + names=names, na_values=na_values, parse_dates=parse_dates, date_parser=date_parser) @@ -116,13 +114,18 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, 
""" % (_parser_params % _table_sep) -def _simple_parser(lines, colNames=None, header=0, index_col=0, +def _simple_parser(lines, names=None, header=0, index_col=0, na_values=None, date_parser=None, parse_dates=True): """ Workhorse function for processing nested list into DataFrame Should be replaced by np.genfromtxt eventually? """ + passed_names = names is not None + if passed_names: + names = list(names) + header = None + if header is not None: columns = [] for i, c in enumerate(lines[header]): @@ -141,32 +144,51 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, counts[col] = cur_count + 1 else: ncols = len(lines[0]) - if not colNames: + if not names: columns = ['X.%d' % (i + 1) for i in range(ncols)] else: - assert(len(colNames) == ncols) - columns = colNames + columns = names content = lines + # spaghetti + + # implicitly index_col=0 b/c 1 fewer column names + index_name = None + implicit_first_col = (len(content) > 0 and + len(content[0]) == len(columns) + 1) + + if implicit_first_col: + if index_col is None: + index_col = 0 + index_name = None + elif np.isscalar(index_col): + if passed_names: + index_name = None + else: + index_name = columns.pop(index_col) + elif index_col is not None: + if not passed_names: + cp_cols = list(columns) + index_name = [] + for i in index_col: + name = cp_cols[i] + columns.remove(name) + index_name.append(name) + else: + index_name=None + if len(content) == 0: # pragma: no cover if index_col is not None: if np.isscalar(index_col): - index = Index([], name=columns.pop(index_col)) + index = Index([], name=index_name) else: - cp_cols = list(columns) - names = [] - for i in index_col: - name = cp_cols[i] - columns.remove(name) - names.append(name) index = MultiIndex.fromarrays([[]] * len(index_col), - names=names) + names=index_name) else: index = Index([]) return DataFrame(index=index, columns=columns) - # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', @@ -178,42 +200,29 @@ def _simple_parser(lines, colNames=None, header=0, index_col=0, else: na_values = set(list(na_values)) | NA_VALUES - zipped_content = list(lib.to_object_array(content).T) - if index_col is None and len(content[0]) == len(columns) + 1: - index_col = 0 - # no index column specified, so infer that's what is wanted if index_col is not None: if np.isscalar(index_col): index = zipped_content.pop(index_col) - - if len(content[0]) == len(columns) + 1: - name = None - else: - name = columns.pop(index_col) - else: # given a list of index - idx_names = [] index = [] for idx in index_col: - idx_names.append(columns[idx]) index.append(zipped_content[idx]) #remove index items from content and columns, don't pop in loop for i in range(len(index_col)): - columns.remove(idx_names[i]) zipped_content.remove(index[i]) if np.isscalar(index_col): if parse_dates: index = lib.try_parse_dates(index, parser=date_parser) - index = Index(_convert_types(index, na_values), name=name) + index = Index(_convert_types(index, na_values), + name=index_name) else: arrays = _maybe_convert_int_mindex(index, parse_dates, date_parser) - index = MultiIndex.from_arrays(arrays, - names=idx_names) + index = MultiIndex.from_arrays(arrays, names=index_name) else: index = Index(np.arange(len(content))) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index c98289246649c..632d715ab140b 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -113,10 +113,26 @@ def test_no_header(self): [6,7,8,9,10], 
[11,12,13,14,15]] assert_almost_equal(df.values, expected) + assert_almost_equal(df.values, df2.values) self.assert_(np.array_equal(df.columns, ['X.1', 'X.2', 'X.3', 'X.4', 'X.5'])) self.assert_(np.array_equal(df2.columns, names)) + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = read_csv(StringIO(data), names=names) + + self.assertEqual(names, ['A', 'B', 'C']) + + data = [[1,2,3],[4,5,6],[7,8,9]] + expected = DataFrame(data, index=['foo','bar','baz'], + columns=['A','B','C']) + assert_frame_equal(df, expected) + def test_read_csv_dataframe(self): df = read_csv(self.csv1, index_col=0, parse_dates=True) df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) @@ -196,53 +212,52 @@ def test_infer_index_col(self): self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) -def test_convert_sql_column_floats(): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - assert_same_values_and_dtype(result, expected) - -def test_convert_sql_column_strings(): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - assert_same_values_and_dtype(result, expected) - -def test_convert_sql_column_unicode(): - arr = np.array([u'1.5', None, u'3', u'4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u'1.5', np.nan, u'3', u'4.2'], dtype=object) - assert_same_values_and_dtype(result, expected) - -def test_convert_sql_column_ints(): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - assert_same_values_and_dtype(result, expected) - assert_same_values_and_dtype(result2, expected) - -def test_convert_sql_column_bools(): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - assert_same_values_and_dtype(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - assert_same_values_and_dtype(result, expected) - -def test_convert_sql_column_decimals(): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - assert_same_values_and_dtype(result, expected) - -def test_convert_sql_column_other(): - arr = np.array([1.5, None, 3, 4.2]) +class TestParseSQL(unittest.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u'1.5', None, u'3', u'4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u'1.5', np.nan, u'3', u'4.2'], 
dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + assert_same_values_and_dtype(result2, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) def assert_same_values_and_dtype(res, exp): assert(res.dtype == exp.dtype) From 2fa995e71704ce1c42d25f24b425b5d3dab06670 Mon Sep 17 00:00:00 2001 From: Luca Beltrame Date: Mon, 17 Oct 2011 14:31:20 +0200 Subject: [PATCH 063/161] Support different delimiters for to_csv --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 850b927c25c92..dbe171134a8c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -469,7 +469,7 @@ def to_sparse(self, fill_value=None, kind='block'): default_fill_value=fill_value) def to_csv(self, path, nanRep='', cols=None, header=True, - index=True, index_label=None, mode='w'): + index=True, index_label=None, mode='w', delimiter=","): """ Write DataFrame to a comma-separated values (csv) file @@ -491,7 +491,7 @@ def to_csv(self, path, nanRep='', cols=None, header=True, mode : Python write mode, default 'w' """ f = open(path, mode) - csvout = csv.writer(f, lineterminator='\n') + csvout = csv.writer(f, lineterminator='\n', delimiter=delimiter) if cols is None: cols = self.columns From f818650fff6558290a07539030491eb142d946c0 Mon Sep 17 00:00:00 2001 From: Luca Beltrame Date: Mon, 17 Oct 2011 14:37:33 +0200 Subject: [PATCH 064/161] Add documentation for the parameter --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dbe171134a8c6..51caf08c91bea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -489,6 +489,10 @@ def to_csv(self, path, nanRep='', cols=None, header=True, `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. mode : Python write mode, default 'w' + delimiter : character, default "," + Field delimiter for the output file. 
+ + """ f = open(path, mode) csvout = csv.writer(f, lineterminator='\n', delimiter=delimiter) From 42efaa12fa091e44d0347e4dfa6096d608527a5f Mon Sep 17 00:00:00 2001 From: Luca Beltrame Date: Mon, 17 Oct 2011 14:38:10 +0200 Subject: [PATCH 065/161] Remove stray whitespace --- pandas/core/frame.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 51caf08c91bea..d1a9297f1f500 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -491,8 +491,6 @@ def to_csv(self, path, nanRep='', cols=None, header=True, mode : Python write mode, default 'w' delimiter : character, default "," Field delimiter for the output file. - - """ f = open(path, mode) csvout = csv.writer(f, lineterminator='\n', delimiter=delimiter) From 91632344094eeb6bcee19de11750e6a8a1fd1e3a Mon Sep 17 00:00:00 2001 From: Aman Thakral Date: Tue, 18 Oct 2011 16:41:02 -0400 Subject: [PATCH 066/161] Fixed issue with plotting a column with all null values. --- pandas/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d1a9297f1f500..889b714082e4a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2871,12 +2871,14 @@ def plot(self, subplots=False, sharex=True, sharey=False, use_index=True, x = range(len(self)) for i, col in enumerate(_try_sort(self.columns)): + empty = np.all(np.isnan(self[col].values)) + y = self[col].values if not empty else np.zeros(x.shape) if subplots: ax = axes[i] - ax.plot(x, self[col].values, 'k', label=col, **kwds) + ax.plot(x, y, 'k', label=col, **kwds) ax.legend(loc='best') else: - ax.plot(x, self[col].values, label=col, **kwds) + ax.plot(x, y, label=col, **kwds) ax.grid(grid) From 1f01e49736b2a499ac37e80e6abaab2763373254 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 18 Oct 2011 23:57:29 -0400 Subject: [PATCH 067/161] fixed #251 used empty = self[col].count() == 0 --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 889b714082e4a..b0566e5f25fea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2871,7 +2871,7 @@ def plot(self, subplots=False, sharex=True, sharey=False, use_index=True, x = range(len(self)) for i, col in enumerate(_try_sort(self.columns)): - empty = np.all(np.isnan(self[col].values)) + empty = self[col].count() == 0 y = self[col].values if not empty else np.zeros(x.shape) if subplots: ax = axes[i] From 84b7cfe23c070fb45911bf396d7c15802af9b599 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sun, 16 Oct 2011 18:46:16 +0100 Subject: [PATCH 068/161] Create simple summaries of Series with object data. 
Addresses gh-210 --- pandas/core/series.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index bc921dfd31141..3a808355ea746 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,6 +5,7 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 +import collections import csv import itertools import operator @@ -873,12 +874,20 @@ def describe(self): ------- desc : Series """ - names = ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max'] - - data = [self.count(), self.mean(), self.std(), self.min(), - self.quantile(.25), self.median(), self.quantile(.75), - self.max()] + if self.dtype == object: + names = ['count', 'unique', 'top', 'freq'] + + objcounts = collections.Counter(self) + top, freq = objcounts.most_common(1)[0] + data = [self.count(), len(objcounts), top, freq] + + else: + names = ['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max'] + + data = [self.count(), self.mean(), self.std(), self.min(), + self.quantile(.25), self.median(), self.quantile(.75), + self.max()] return Series(data, index=names) From f5fb61fee452eb5aabab362897fa14b9057556d9 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sun, 16 Oct 2011 18:51:28 +0100 Subject: [PATCH 069/161] Fix bugs with .max() and .min() for integer columns in DataFrame. --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b0566e5f25fea..6e48360a5eb4f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2511,7 +2511,7 @@ def min(self, axis=0, skipna=True): min : Series """ values = self.values.copy() - if skipna: + if skipna and not issubclass(values.dtype.type, np.int_): np.putmask(values, -np.isfinite(values), np.inf) return Series(values.min(axis), index=self._get_agg_axis(axis)) @@ -2532,7 +2532,7 @@ def max(self, axis=0, skipna=True): max : Series """ values = self.values.copy() - if skipna: + if skipna and not issubclass(values.dtype.type, np.int_): np.putmask(values, -np.isfinite(values), -np.inf) return Series(values.max(axis), index=self._get_agg_axis(axis)) From 0117349cad134368efb62389b8df5118d702a3e6 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sun, 16 Oct 2011 18:57:11 +0100 Subject: [PATCH 070/161] Add tests for .min and .max on dataframes with integer columns. --- pandas/tests/test_frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 84576c173502f..85d2aa4326bd0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2609,9 +2609,11 @@ def wrapper(x): def test_min(self): self._check_stat_op('min', np.min) + self._check_stat_op('min', np.min, frame=self.intframe) def test_max(self): self._check_stat_op('max', np.max) + self._check_stat_op('max', np.max, frame=self.intframe) def test_mad(self): f = lambda x: np.abs(x - x.mean()).mean() From dbb582e3f3d8c38edf15604b7c84345fb76eb97d Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sun, 16 Oct 2011 18:59:04 +0100 Subject: [PATCH 071/161] Test .describe() for object series. 
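
Regarding the min/max change a few patches above: a minimal sketch of the
guarded computation (the array contents are illustrative). Integer arrays
contain no NaNs and cannot hold the inf sentinels used for NA masking, so the
masking step is skipped for integer dtype:

    import numpy as np

    values = np.array([[3, -1], [2, 5]], dtype=np.int_)
    skipna = True
    if skipna and not issubclass(values.dtype.type, np.int_):
        # float path only: replace non-finite entries with +inf so they
        # cannot win the minimum
        np.putmask(values, ~np.isfinite(values), np.inf)
    print(values.min(axis=0))  # [ 2 -1]
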
--- pandas/tests/test_series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 817721dbcfeb8..9a4ce05843cda 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -550,6 +550,7 @@ def test_quantile(self): def test_describe(self): _ = self.series.describe() _ = self.ts.describe() + _ = self.objSeries.describe() def test_append(self): appendedSeries = self.series.append(self.ts) From 7f3e3b4f3a9173da3930972ccb75bdb3a7ff8911 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 16:53:09 -0400 Subject: [PATCH 072/161] DOC: release notes --- RELEASE.rst | 98 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 27c77177906c2..0090f76e5fbb8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -5,6 +5,23 @@ Release Notes This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/wesm/pandas +What is it +---------- + +**pandas** is a library of powerful labeled-axis data structures, statistical +tools, and general code for working with relational data sets, including time +series and cross-sectional data. It was designed with the practical needs of +statistical modeling and large, inhomogeneous data sets in mind. It is +particularly well suited for, among other things, financial data analysis +applications. + +Where to get it +--------------- + +Source code: http://github.com/wesm/pandas +Binary installers on PyPI: http://pypi.python.org/pypi/pandas +Documentation: http://pandas.sourceforge.net + pandas 0.5.0 ============ @@ -31,7 +48,7 @@ feedback on the library. - Parsing functions like `read_csv` no longer parse dates by default (GH #225) - Removed `weights` option in panel regression which was not doing anything - principled + principled (GH #155) - Changed `buffer` argument name in `Series.to_string` to `buf` - `Series.to_string` and `DataFrame.to_string` now return strings by default instead of printing to sys.stdout @@ -93,6 +110,10 @@ feedback on the library. - Added column attribute access to DataFrame, e.g. df.A equivalent to df['A'] if 'A' is a column in the DataFrame (PR #213) - Added IPython tab completion hook for DataFrame columns. (PR #233, GH #230) + - Implement `Series.describe` for Series containing objects (PR #241) + - Add inner join option to `DataFrame.join` when joining on key(s) (GH #248) + - Can select set of DataFrame columns by passing a list to `__getitem__` (GH + #253) **Improvements to existing features** @@ -105,37 +126,58 @@ feedback on the library. rather than deferring the check until later - Refactored merging / joining code into a tidy class and disabled unnecessary computations in the float/object case, thus getting about 10% better - performance + performance (GH #211) - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about - 5x, regression from 0.3.0 + 5x, regression from 0.3.0 (GH #215) - With new `DataFrame.align` method, speeding up binary operations between differently-indexed DataFrame objects by 10-25%. 
- - Significantly sped up conversion of nested dict into DataFrame (GH #212) - Can pass hierarchical index level name to `groupby` instead of the level number if desired (GH #223) + - Add support for different delimiters in `DataFrame.to_csv` (PR #244) **Bug fixes** - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should be reported upstream to matplotlib (GH #224) - - Fixed problem in which data would get upcasted to object dtype in - GroupBy.apply operations (GH #237) - - Fixed outer join bug with empty DataFrame (GH #238) - DataFrame.iteritems was not returning Series with the name attribute set. Also neither was DataFrame._series - Can store datetime.date objects in HDFStore (GH #231) - Index and Series names are now stored in HDFStore + - Fixed problem in which data would get upcasted to object dtype in + GroupBy.apply operations (GH #237) + - Fixed outer join bug with empty DataFrame (GH #238) - Can create empty Panel (GH #239) + - Fix join on single key when passing list with 1 entry (GH #246) + - Don't raise Exception on plotting DataFrame with an all-NA column (GH #251, + PR #254) + - Fixed min/max errors when called on integer DataFrames (PR #241) + - `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute - Panel.__repr__ raised exception on length-0 major/minor axes + - `DataFrame.join` on key with empty DataFrame produced incorrect columns - `read_csv` / `read_table` fixes - Be less aggressive about converting float->int in cases of floating point representations of integers like 1.0, 2.0, etc. - "True"/"False" will not get correctly converted to boolean - Index name attribute will get set when specifying an index column + - Passing column names should force `header=None` (GH #257) + - Don't modify passed column names when `index_col` is not + None (GH #258) + +Thanks +------ + +- Thomas Kluyver +- Daniel Fortunov +- Aman Thakral +- Luca Beltrame pandas 0.4.3 ============ +Release notes +------------- + **Release date:** 10/9/2011 This is largely a bugfix release from 0.4.2 but also includes a handful of new @@ -197,6 +239,9 @@ Thanks pandas 0.4.2 ============ +Release notes +------------- + **Release date:** 10/3/2011 This is a performance optimization release with several bug fixes. The new @@ -272,6 +317,9 @@ Thanks pandas 0.4.1 ============ +Release notes +------------- + **Release date:** 9/25/2011 This is primarily a bug fix release but includes some new features and @@ -342,23 +390,6 @@ Thanks pandas 0.4 ========== -What is it ---------- - -**pandas** is a library of powerful labeled-axis data structures, statistical -tools, and general code for working with relational data sets, including time -series and cross-sectional data. It was designed with the practical needs of -statistical modeling and large, inhomogeneous data sets in mind. It is -particularly well suited for, among other things, financial data analysis
- -Where to get it ---------------- - -Source code: http://github.com/wesm/pandas -Binary installers on PyPI: http://pypi.python.org/pypi/pandas -Documentation: http://pandas.sourceforge.net - Release notes ------------- @@ -619,8 +650,8 @@ Thanks - Skipper Seabold - Chris Jordan-Squire -pandas 0.3 -========== +pandas 0.3.0 +============ This major release of pandas represents approximately 1 year of continuous development work and brings with it many new features, bug fixes, speed @@ -628,21 +659,6 @@ enhancements, and general quality-of-life improvements. The most significant change from the 0.2 release has been the completion of a rigorous unit test suite covering all of the core functionality. -What is it ----------- - -**pandas** is a library of labeled data structures, statistical models, and -general code for working with time series and cross-sectional data. It was -designed with the practical needs of statistical modeling and large, -inhomogeneous data sets in mind. - -Where to get it ---------------- - -Source code: http://github.com/wesm/pandas -Binary installers on PyPI: http://pypi.python.org/pypi/pandas -Documentation: http://pandas.sourceforge.net - Release notes ------------- From d9ea01f8d4371a2f009d540136b7ee709fc3c24a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 17:05:10 -0400 Subject: [PATCH 073/161] BUG: allow casting DataFrame float column to int, GH #252 --- pandas/core/internals.py | 16 +++++++++++++++- pandas/tests/test_frame.py | 10 +++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e3dc1e296affd..05a5526bbbb2b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -171,21 +171,35 @@ def fillna(self, value): class FloatBlock(Block): + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return issubclass(value.dtype.type, np.floating) + def can_store(self, value): return issubclass(value.dtype.type, (np.integer, np.floating)) class IntBlock(Block): + def should_store(self, value): + return self.can_store(value) + def can_store(self, value): return issubclass(value.dtype.type, np.integer) class BoolBlock(Block): + def should_store(self, value): + return self.can_store(value) + def can_store(self, value): return issubclass(value.dtype.type, np.bool_) class ObjectBlock(Block): + def should_store(self, value): + return self.can_store(value) + def can_store(self, value): return not issubclass(value.dtype.type, (np.integer, np.floating, np.bool_)) @@ -534,7 +548,7 @@ def set(self, item, value): assert(value.shape[1:] == self.shape[1:]) if item in self.items: i, block = self._find_block(item) - if not block.can_store(value): + if not block.should_store(value): # delete from block, create and append new block self._delete_from_block(i, item) self._add_new_block(item, value) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 85d2aa4326bd0..5aa040d603978 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -160,6 +160,10 @@ def test_setitem_boolean(self): np.putmask(expected.values, mask.values, df.values * 2) assert_frame_equal(df, expected) + def test_setitem_cast(self): + self.frame['D'] = self.frame['D'].astype('i8') + self.assert_(self.frame['D'].dtype == np.int64) + def test_setitem_boolean_column(self): expected = self.frame.copy() mask = self.frame['A'] > 0 @@ -284,8 +288,8 @@ def test_setitem_fancy_2d(self): frame = self.frame.copy() 
expected = frame.copy() frame.ix[:, ['B', 'A']] = 1 - expected['B'] = 1 - expected['A'] = 1 + expected['B'] = 1. + expected['A'] = 1. assert_frame_equal(frame, expected) # case 2 @@ -370,7 +374,7 @@ def test_fancy_getitem_slice_mixed(self): # get view with single block sliced = self.frame.ix[:, -3:] - sliced['C'] = 4 + sliced['C'] = 4. self.assert_((self.frame['C'] == 4).all()) def test_fancy_setitem_int_labels(self): From 19037a671aa14c7846ab05c5d77e866601dc2de3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 17:16:46 -0400 Subject: [PATCH 074/161] DOC: add more helpful error message when accidentally importing pandas from source directory, address GH #250 --- pandas/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/__init__.py b/pandas/__init__.py index be0388908b3cb..48a46ed98305b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -6,6 +6,16 @@ import numpy as np +try: + import pandas._tseries as lib +except Exception, e: + if 'No module named' in e.message: + raise ImportError('C extensions not built: if you installed already ' + 'verify that you are not importing from the source ' + 'directory') + else: + raise + from pandas.version import version as __version__ from pandas.info import __doc__ From f785ccbc8400319a456266f72c389ad5951c1337 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 17:22:18 -0400 Subject: [PATCH 075/161] DOC: release notes --- RELEASE.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 0090f76e5fbb8..dc627ee24b927 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -28,16 +28,16 @@ pandas 0.5.0 **Release date:** not yet released This release of pandas includes a number of API changes (see below) and cleanup -of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, some new -features, performance enhancements, and includes a new IPython completer hook to -enable tab completion of DataFrame columns accesses as attributes (a new -feature). +of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, new +features, numerous significant performance enhancements, and includes a new +IPython completer hook to enable tab completion of DataFrame columns accesses +as attributes (a new feature). In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases 0.4.1, 0.4.2, and 0.4.3 brought some significant new functionality and performance improvements that are worth taking a look at. -Thanks to Thomas Kluyver and others for contributing patches and providing +Thanks to all for bug reports, contributed patches and generally providing feedback on the library. **API Changes** @@ -135,6 +135,8 @@ feedback on the library. 
- Can pass hierarchical index level name to `groupby` instead of the level number if desired (GH #223) - Add support for different delimiters in `DataFrame.to_csv` (PR #244) + - Add more helpful error message when importing pandas post-installation from + the source directory (GH #250) **Bug fixes** From 5c3214e066a5a64cfb93ab71b5ae5b4ff610414d Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Wed, 19 Oct 2011 22:00:50 +0100 Subject: [PATCH 076/161] Include local copy of Counter for Python < 2.7 --- pandas/core/series.py | 9 +- pandas/util/counter.py | 280 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 pandas/util/counter.py diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a808355ea746..494d188e634e0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,11 +5,16 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -import collections import csv import itertools import operator +try: + from collections import Counter +except ImportError: + # For Python < 2.7, we include a local copy of this: + from pandas.util.counter import Counter + from numpy import nan, ndarray import numpy as np @@ -877,7 +882,7 @@ def describe(self): if self.dtype == object: names = ['count', 'unique', 'top', 'freq'] - objcounts = collections.Counter(self) + objcounts = Counter(self) top, freq = objcounts.most_common(1)[0] data = [self.count(), len(objcounts), top, freq] diff --git a/pandas/util/counter.py b/pandas/util/counter.py new file mode 100644 index 0000000000000..b7b27d8c98066 --- /dev/null +++ b/pandas/util/counter.py @@ -0,0 +1,280 @@ +# This is copied from collections in Python 2.7, for compatibility with older +# versions of Python. It can be dropped when we depend on Python 2.7/3.1 + +class Counter(dict): + '''Dict subclass for counting hashable items. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> c = Counter('abcdeabcdabcaba') # count elements from a string + + >>> c.most_common(3) # three most common elements + [('a', 5), ('b', 4), ('c', 3)] + >>> sorted(c) # list all unique elements + ['a', 'b', 'c', 'd', 'e'] + >>> ''.join(sorted(c.elements())) # list elements with repetitions + 'aaaaabbbbcccdde' + >>> sum(c.values()) # total of all counts + 15 + + >>> c['a'] # count of letter 'a' + 5 + >>> for elem in 'shazam': # update counts from an iterable + ... c[elem] += 1 # by adding 1 to each element's count + >>> c['a'] # now there are seven 'a' + 7 + >>> del c['b'] # remove all 'b' + >>> c['b'] # now there are zero 'b' + 0 + + >>> d = Counter('simsalabim') # make another counter + >>> c.update(d) # add in the second counter + >>> c['a'] # now there are nine 'a' + 9 + + >>> c.clear() # empty the counter + >>> c + Counter() + + Note: If a count is set to zero or reduced to zero, it will remain + in the counter until the entry is deleted or the counter is cleared: + + >>> c = Counter('aaabbc') + >>> c['b'] -= 2 # reduce the count of 'b' by two + >>> c.most_common() # 'b' is still in, but its count is zero + [('a', 3), ('c', 1), ('b', 0)] + + ''' + # References: + # http://en.wikipedia.org/wiki/Multiset + # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html + # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm + # http://code.activestate.com/recipes/259174/ + # Knuth, TAOCP Vol. 
II section 4.6.3 + + def __init__(self, iterable=None, **kwds): + '''Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + >>> c = Counter(a=4, b=2) # a new counter from keyword args + + ''' + super(Counter, self).__init__() + self.update(iterable, **kwds) + + def __missing__(self, key): + 'The count of elements not in the Counter is zero.' + # Needed so that self[missing_item] does not raise KeyError + return 0 + + def most_common(self, n=None): + '''List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abcdeabcdabcaba').most_common(3) + [('a', 5), ('b', 4), ('c', 3)] + + ''' + # Emulate Bag.sortedByCount from Smalltalk + if n is None: + return sorted(self.iteritems(), key=_itemgetter(1), reverse=True) + return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1)) + + def elements(self): + '''Iterator over elements repeating each as many times as its count. + + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1 + >>> prime_factors = Counter({2: 2, 3: 3, 17: 1}) + >>> product = 1 + >>> for factor in prime_factors.elements(): # loop over factors + ... product *= factor # and multiply them + >>> product + 1836 + + Note, if an element's count has been set to zero or is a negative + number, elements() will ignore it. + + ''' + # Emulate Bag.do from Smalltalk and Multiset.begin from C++. + return _chain.from_iterable(_starmap(_repeat, self.iteritems())) + + # Override dict methods where necessary + + @classmethod + def fromkeys(cls, iterable, v=None): + # There is no equivalent method for counters because setting v=1 + # means that no element can have a count greater than one. + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None, **kwds): + '''Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + ''' + # The regular dict.update() operation makes no sense here because the + # replace behavior results in the some of original untouched counts + # being mixed-in with all of the other counts for a mismash that + # doesn't have a straight-forward interpretation in most counting + # contexts. Instead, we implement straight-addition. Both the inputs + # and outputs are allowed to contain zero and negative counts. + + if iterable is not None: + if isinstance(iterable, Mapping): + if self: + self_get = self.get + for elem, count in iterable.iteritems(): + self[elem] = self_get(elem, 0) + count + else: + super(Counter, self).update(iterable) # fast path when counter is empty + else: + self_get = self.get + for elem in iterable: + self[elem] = self_get(elem, 0) + 1 + if kwds: + self.update(kwds) + + def subtract(self, iterable=None, **kwds): + '''Like dict.update() but subtracts counts instead of replacing them. 
+ Counts can be reduced below zero. Both the inputs and outputs are + allowed to contain zero and negative counts. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.subtract('witch') # subtract elements from another iterable + >>> c.subtract(Counter('watch')) # subtract elements from another counter + >>> c['h'] # 2 in which, minus 1 in witch, minus 1 in watch + 0 + >>> c['w'] # 1 in which, minus 1 in witch, minus 1 in watch + -1 + + ''' + if iterable is not None: + self_get = self.get + if isinstance(iterable, Mapping): + for elem, count in iterable.items(): + self[elem] = self_get(elem, 0) - count + else: + for elem in iterable: + self[elem] = self_get(elem, 0) - 1 + if kwds: + self.subtract(kwds) + + def copy(self): + 'Return a shallow copy.' + return self.__class__(self) + + def __reduce__(self): + return self.__class__, (dict(self),) + + def __delitem__(self, elem): + 'Like dict.__delitem__() but does not raise KeyError for missing values.' + if elem in self: + super(Counter, self).__delitem__(elem) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + + # Multiset-style mathematical operations discussed in: + # Knuth TAOCP Volume II section 4.6.3 exercise 19 + # and at http://en.wikipedia.org/wiki/Multiset + # + # Outputs guaranteed to only include positive counts. + # + # To strip negative and zero counts, add-in an empty counter: + # c += Counter() + + def __add__(self, other): + '''Add counts from two counters. + + >>> Counter('abbb') + Counter('bcc') + Counter({'b': 4, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count + other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __sub__(self, other): + ''' Subtract count, but keep only results with positive counts. + + >>> Counter('abbbc') - Counter('bccd') + Counter({'b': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count - other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count < 0: + result[elem] = 0 - count + return result + + def __or__(self, other): + '''Union is the maximum of value in either of the input counters. + + >>> Counter('abbb') | Counter('bcc') + Counter({'b': 3, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = other_count if count < other_count else count + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __and__(self, other): + ''' Intersection is the minimum of corresponding counts. 
+ + >>> Counter('abbb') & Counter('bcc') + Counter({'b': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = count if count < other_count else other_count + if newcount > 0: + result[elem] = newcount + return result From b6ddee7dd295d4c632fbd08f9c4816d365c9bc8f Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Wed, 19 Oct 2011 22:22:44 +0100 Subject: [PATCH 077/161] Add necessary imports for Counter class. --- pandas/util/counter.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/util/counter.py b/pandas/util/counter.py index b7b27d8c98066..f23f6e6fbbad1 100644 --- a/pandas/util/counter.py +++ b/pandas/util/counter.py @@ -1,6 +1,16 @@ # This is copied from collections in Python 2.7, for compatibility with older # versions of Python. It can be dropped when we depend on Python 2.7/3.1 +import heapq as _heapq +from itertools import repeat as _repeat, chain as _chain, starmap as _starmap +from operator import itemgetter as _itemgetter + +try: + from collections import Mapping +except: + # ABCs were only introduced in Python 2.6, so this is a hack for Python 2.5: + Mapping = dict + class Counter(dict): '''Dict subclass for counting hashable items. Sometimes called a bag or multiset. Elements are stored as dictionary keys and their counts From 2e79dc3c552f2da61514f94349e4b819e20e5929 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 16:18:47 -0400 Subject: [PATCH 078/161] ENH: speed up isnull, some bugs to be fixed still --- pandas/src/tseries.pyx | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index 765c5c7eb2dcb..d384fe95169f5 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -223,38 +223,23 @@ def array_to_datetime(ndarray[int64_t, ndim=1] arr): cdef double INF = np.inf cdef double NEGINF = -INF -cdef inline _isnan(object o): - return o != o - cdef inline _checknull(object val): - if isinstance(val, (float, np.floating)): - return val != val or val == INF or val == NEGINF - else: - return val is None + return val is None or val != val cpdef checknull(object val): return _checknull(val) -def isnullobj(ndarray input): - cdef int i, length +def isnullobj(ndarray[object] arr): + cdef Py_ssize_t i, n cdef object val - cdef ndarray[npy_int8, ndim=1] result - cdef flatiter iter - - length = PyArray_SIZE(input) - - result = np.zeros(length, dtype=np.int8) - - iter= PyArray_IterNew(input) - - for i from 0 <= i < length: - val = PyArray_GETITEM(input, PyArray_ITER_DATA(iter)) + cdef ndarray[uint8_t, cast=True] result + n = len(arr) + result = np.zeros(n, dtype=bool) + for i from 0 <= i < n: + val = arr[i] if _checknull(val): result[i] = 1 - - PyArray_ITER_NEXT(iter) - return result def list_to_object_array(list obj): From a46574a970f4b7a1968313f1d1f55e50f71c0fcb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 17:35:42 -0400 Subject: [PATCH 079/161] BUG: check for inf and neginf too --- pandas/src/generate_code.py | 2 +- pandas/src/generated.pyx | 10 +++++----- pandas/src/tseries.pyx | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 48d17b75bcd2a..28bc8f2caa985 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -312,7 +312,7 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): 
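The null-checking semantics that patch 078 above and patch 079 below converge on can be sketched in pure Python. This illustrates the logic only, not the compiled Cython implementation; the inf cases are the ones patch 079 restores to checknull:

    import numpy as np

    def checknull_sketch(val):
        # floats count as null when they are NaN or +/-inf; anything else is
        # null when it is None or unequal to itself (the NaN property)
        if isinstance(val, (float, np.floating)):
            return val != val or val == np.inf or val == -np.inf
        return val is None or val != val

    def isnullobj_sketch(arr):
        # boolean mask over a 1-d object array, mirroring the new typed loop
        return np.fromiter((v is None or v != v for v in arr),
                           dtype=bool, count=len(arr))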
@cython.boundscheck(False) def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 2bdf65f0952b4..9d191e0a563b3 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -938,7 +938,7 @@ def is_monotonic_bool(ndarray[uint8_t] arr): @cython.boundscheck(False) def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key @@ -964,7 +964,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_object(ndarray[object] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key @@ -990,7 +990,7 @@ def groupby_object(ndarray[object] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_int32(ndarray[int32_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key @@ -1016,7 +1016,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key @@ -1042,7 +1042,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_bool(ndarray[uint8_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t, cast=True] mask cdef int i, length cdef list members cdef object idx, key diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index d384fe95169f5..23a9278e6c265 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -49,9 +49,6 @@ cdef double_t *get_double_ptr(ndarray arr): cdef extern from "math.h": double sqrt(double x) -#cdef extern from "cobject.h": -# pass # for datetime API - cdef extern from "datetime.h": ctypedef class datetime.datetime [object PyDateTime_DateTime]: @@ -227,7 +224,10 @@ cdef inline _checknull(object val): return val is None or val != val cpdef checknull(object val): - return _checknull(val) + if isinstance(val, (float, np.floating)): + return val != val or val == INF or val == NEGINF + else: + return _checknull(val) def isnullobj(ndarray[object] arr): cdef Py_ssize_t i, n From 4662c14aa5342af67c5801b0277c38b751149a85 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Oct 2011 18:19:18 -0400 Subject: [PATCH 080/161] ENH: add Series.histogram, address GH #240 --- pandas/core/series.py | 21 ++++++++++++++++++--- pandas/tests/test_series.py | 11 +++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 494d188e634e0..4e04fca30de9a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -592,6 +592,21 @@ def _count_level(self, level): return Series(result, index=level_index) + def histogram(self): + """ + Returns Series containing counts of unique values. 
The result Series's + index will be the sorted unique values + + Returns + ------- + histogram : Series + """ + from collections import defaultdict + counter = defaultdict(lambda: 0) + for value in self.values: + counter[value] += 1 + return Series(counter) + def sum(self, axis=0, dtype=None, out=None, skipna=True): """ Sum of values @@ -881,12 +896,12 @@ def describe(self): """ if self.dtype == object: names = ['count', 'unique', 'top', 'freq'] - + objcounts = Counter(self) top, freq = objcounts.most_common(1)[0] data = [self.count(), len(objcounts), top, freq] - - else: + + else: names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 9a4ce05843cda..b6214fc9461d4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -759,6 +759,17 @@ def test_count(self): self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) + def test_histogram(self): + s = Series(['a', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a']) + hist = s.histogram() + expected = Series([3, 3, 1, 2], index=['a', 'b', 'c', 'd']) + assert_series_equal(hist, expected) + + s = Series({}) + hist = s.histogram() + expected = Series([]) + assert_series_equal(hist, expected) + def test_sort(self): ts = self.ts.copy() ts.sort() From 153b16ccc86f39cc0e1fa97fc1318e51d7c0abe3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 01:23:51 -0400 Subject: [PATCH 081/161] ENH: refactored simple_parser into class, working on iterator. almost there --- pandas/io/parsers.py | 367 +++++++++++++++++++++++++++++-------------- 1 file changed, 246 insertions(+), 121 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 61c2e0575d83f..3ba8eea47bfe4 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -11,7 +11,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, skiprows=None, na_values=None, parse_dates=False, - date_parser=None): + date_parser=None, nrows=None, iterator=False, chunksize=None): import csv if hasattr(filepath_or_buffer, 'read'): @@ -38,23 +38,22 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, reader = csv.reader(f, dialect=dia) - if skiprows is not None: - skiprows = set(skiprows) - lines = [l for i, l in enumerate(reader) if i not in skiprows] - else: - lines = [l for l in reader] - f.close() - if date_parser is not None: parse_dates = True - return _simple_parser(lines, - header=header, - index_col=index_col, - names=names, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser) + parser = TextParser(reader, header=header, index_col=index_col, + names=names, na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize) + + if nrows is not None: + return parser.get_chunk(nrows) + elif chunksize or iterator: + return parser + + return parser.get_chunk() def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, @@ -114,129 +113,248 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, """ % (_parser_params % _table_sep) -def _simple_parser(lines, names=None, header=0, index_col=0, - na_values=None, date_parser=None, parse_dates=True): +class TextParser(object): """ - Workhorse function for processing nested list into DataFrame + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. 
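A usage sketch for the Series.histogram method added in patch 080, with values taken from its test (the method is renamed to value_counts by patch 088 later in this series):

    from pandas import Series

    s = Series(['a', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
    s.histogram()   # counts per unique value: a -> 3, b -> 3, c -> 1, d -> 2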
Also enables iterating + lazily over chunks of large files - Should be replaced by np.genfromtxt eventually? + Parameters + ---------- + data : list or csv reader-like object + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, default None + Column or columns to use as the (possibly hierarchical) index + na_values : iterable, defualt None + Custom NA values + parse_dates : boolean, default False + date_parser : function, default None + skiprows """ - passed_names = names is not None - if passed_names: - names = list(names) - header = None - - if header is not None: - columns = [] - for i, c in enumerate(lines[header]): - if c == '': - columns.append('Unnamed: %d' % i) + + # common NA values + # no longer excluding inf representations + # '1.#INF','-1.#INF', '1.#INF000000', + NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '']) + + def __init__(self, data, names=None, header=0, index_col=None, + na_values=None, parse_dates=False, date_parser=None, + chunksize=None, skiprows=None): + """ + Workhorse function for processing nested list into DataFrame + + Should be replaced by np.genfromtxt eventually? + """ + self.data = data + + self.buf = [] + + self.pos = 0 + self.names = list(names) if names is not None else names + self.header = header + self.index_col = index_col + self.parse_dates = parse_dates + self.date_parser = date_parser + self.chunksize = chunksize + self.passed_names = names is not None + self.skiprows = set() if skiprows is None else set(skiprows) + + if na_values is None: + self.na_values = self.NA_VALUES + else: + self.na_values = set(list(na_values)) | self.NA_VALUES + + self.columns = self._infer_columns() + self.index_name = self._get_index_name() + + def _infer_columns(self): + names = self.names + passed_names = self.names is not None + if passed_names: + self.header = None + + if self.header is not None: + line = self._next_line() + while self.header > self.pos: + line = self._next_line() + + columns = [] + for i, c in enumerate(line): + if c == '': + columns.append('Unnamed: %d' % i) + else: + columns.append(c) + + counts = {} + for i, col in enumerate(columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + else: + line = self._next_line() + self.buf.append(line) + + ncols = len(line) + if not names: + columns = ['X.%d' % (i + 1) for i in range(ncols)] else: - columns.append(c) + columns = names - content = lines[header+1:] + self._clear_buffer() - counts = {} - for i, col in enumerate(columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - columns[i] = '%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 - else: - ncols = len(lines[0]) - if not names: - columns = ['X.%d' % (i + 1) for i in range(ncols)] + return columns + + def _next_line(self): + if isinstance(self.data, list): + if self.pos in self.skiprows: + self.pos += 1 + + line = self.data[self.pos] else: - columns = names - content = lines + if self.pos in self.skiprows: + self.data.next() + self.pos += 1 + line = self.data.next() + self.pos += 1 + self.buf.append(line) - # spaghetti + return line + + def _clear_buffer(self): + self.buf = [] + + def __iter__(self): + try: + yield self.get_chunk(self.chunksize) + except StopIteration: + pass - # implicitly index_col=0 b/c 1 fewer column names - index_name = None - implicit_first_col 
= (len(content) > 0 and - len(content[0]) == len(columns) + 1) + def _get_index_name(self): + columns = self.columns - if implicit_first_col: - if index_col is None: - index_col = 0 + try: + line = self._next_line() + except StopIteration: + line = None + + # implicitly index_col=0 b/c 1 fewer column names index_name = None - elif np.isscalar(index_col): - if passed_names: + implicit_first_col = (line is not None and + len(line) == len(columns) + 1) + + passed_names = self.names is not None + + if implicit_first_col: + if self.index_col is None: + self.index_col = 0 index_name = None - else: - index_name = columns.pop(index_col) - elif index_col is not None: - if not passed_names: - cp_cols = list(columns) - index_name = [] - for i in index_col: - name = cp_cols[i] - columns.remove(name) - index_name.append(name) - else: - index_name=None + elif np.isscalar(self.index_col): + if passed_names: + index_name = None + else: + index_name = columns.pop(self.index_col) + elif self.index_col is not None: + if not passed_names: + cp_cols = list(columns) + index_name = [] + for i in self.index_col: + name = cp_cols[i] + columns.remove(name) + index_name.append(name) + else: + index_name=None + + return index_name + + def get_chunk(self, rows=None): + content = self._get_lines(rows) - if len(content) == 0: # pragma: no cover - if index_col is not None: - if np.isscalar(index_col): - index = Index([], name=index_name) + if len(content) == 0: # pragma: no cover + if self.index_col is not None: + if np.isscalar(self.index_col): + index = Index([], name=self.index_name) + else: + index = MultiIndex.from_arrays([[]] * len(self.index_col), + names=self.index_name) else: - index = MultiIndex.fromarrays([[]] * len(index_col), - names=index_name) + index = Index([]) + + return DataFrame(index=index, columns=self.columns) + + zipped_content = list(lib.to_object_array(content).T) + + # no index column specified, so infer that's what is wanted + if self.index_col is not None: + if np.isscalar(self.index_col): + index = zipped_content.pop(self.index_col) + else: # given a list of index + index = [] + for idx in self.index_col: + index.append(zipped_content[idx]) + #remove index items from content and columns, don't pop in loop + for i in range(len(self.index_col)): + zipped_content.remove(index[i]) + + if np.isscalar(self.index_col): + if self.parse_dates: + index = lib.try_parse_dates(index, parser=self.date_parser) + index = Index(_convert_types(index, self.na_values), + name=self.index_name) + else: + arrays = _maybe_convert_int_mindex(index, self.parse_dates, + self.date_parser) + index = MultiIndex.from_arrays(arrays, names=self.index_name) else: - index = Index([]) + index = Index(np.arange(len(content))) - return DataFrame(index=index, columns=columns) + if not index._verify_integrity(): + dups = index._get_duplicates() + raise Exception('Index has duplicates: %s' % str(dups)) - # common NA values - # no longer excluding inf representations - # '1.#INF','-1.#INF', '1.#INF000000', - NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', - 'nan', '']) - if na_values is None: - na_values = NA_VALUES - else: - na_values = set(list(na_values)) | NA_VALUES - - zipped_content = list(lib.to_object_array(content).T) - - # no index column specified, so infer that's what is wanted - if index_col is not None: - if np.isscalar(index_col): - index = zipped_content.pop(index_col) - else: # given a list of index - index = [] - for idx in index_col: - 
index.append(zipped_content[idx]) - #remove index items from content and columns, don't pop in loop - for i in range(len(index_col)): - zipped_content.remove(index[i]) - - if np.isscalar(index_col): - if parse_dates: - index = lib.try_parse_dates(index, parser=date_parser) - index = Index(_convert_types(index, na_values), - name=index_name) - else: - arrays = _maybe_convert_int_mindex(index, parse_dates, - date_parser) - index = MultiIndex.from_arrays(arrays, names=index_name) - else: - index = Index(np.arange(len(content))) + if len(self.columns) != len(zipped_content): + raise Exception('wrong number of columns') + + data = dict((k, v) for k, v in zip(self.columns, zipped_content)) + data = _convert_to_ndarrays(data, self.na_values) + return DataFrame(data=data, columns=self.columns, index=index) - if not index._verify_integrity(): - dups = index._get_duplicates() - raise Exception('Index has duplicates: %s' % str(dups)) + def _get_lines(self, rows=None): + source = self.data + lines = self.buf - if len(columns) != len(zipped_content): - raise Exception('wrong number of columns') + # already fetched some number + if rows is not None: + rows -= len(self.buf) - data = dict((k, v) for k, v in zip(columns, zipped_content)) - data = _convert_to_ndarrays(data, na_values) - return DataFrame(data=data, columns=columns, index=index) + if isinstance(source, list): + if self.pos >= len(source): + raise StopIteration + if rows is None: + lines.extend(source[self.pos:]) + self.pos = len(source) + else: + lines.extend(source[self.pos:self.pos+rows]) + self.pos += rows + else: + try: + if rows is not None: + for _ in xrange(rows): + lines.append(source.next()) + else: + while True: + lines.append(source.next()) + except StopIteration: + pass + self.buf = [] + + return lines def _maybe_convert_int_mindex(index, parse_dates, date_parser): for i in range(len(index)): @@ -288,7 +406,8 @@ def __repr__(self): return object.__repr__(self) def parse(self, sheetname, header=0, skiprows=None, index_col=None, - parse_dates=False, date_parser=None, na_values=None): + parse_dates=False, date_parser=None, na_values=None, + chunksize=None): """ Read Excel table into DataFrame @@ -337,6 +456,12 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None, value = datetime(*dt) row.append(value) data.append(row) - return _simple_parser(data, header=header, index_col=index_col, - parse_dates=parse_dates, date_parser=date_parser, - na_values=na_values) + + parser = TextParser(data, header=header, index_col=index_col, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize) + + return parser.get_chunk() From 63834368769d8590a753c2ea4f9d50afdd80185d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 10:34:31 -0400 Subject: [PATCH 082/161] ENH: perf enh, only cast to float if numeric already --- pandas/core/groupby.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index eb32db4c37701..f662d8ea01c31 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -333,9 +333,10 @@ def _cython_agg_general(self, how): output = {} cannot_agg = [] for name, obj in self._iterate_slices(): - try: - obj = np.asarray(obj, dtype=float) - except ValueError: + if issubclass(obj.dtype.type, np.number): + if obj.dtype != np.float64: + obj = obj.astype('f8') + else: cannot_agg.append(name) continue From 4a218da5735f0784554eb1b80c6d61a2e0b8cadb Mon Sep 17 00:00:00 2001 From: 
Wes McKinney Date: Thu, 20 Oct 2011 10:43:22 -0400 Subject: [PATCH 083/161] ENH: speed boost in count by using Series.count --- pandas/core/frame.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6e48360a5eb4f..eca9ee6ea355b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2387,10 +2387,18 @@ def count(self, axis=0, level=None, numeric_only=False): return self._count_level(level, axis=axis, numeric_only=numeric_only) - y, axis_labels = self._get_agg_data(axis, numeric_only=numeric_only, - copy=False) - mask = notnull(y) - return Series(mask.sum(axis), index=axis_labels) + if numeric_only: + frame = self.ix[:, self._get_numeric_columns()] + else: + frame = self + + result = frame.apply(Series.count, axis=axis) + + # what happens with empty DataFrame + if isinstance(result, DataFrame): + result = Series({}) + + return result def _count_level(self, level, axis=0, numeric_only=False): # TODO: deal with sortedness?? From 94724284a940aa84aaa41150efacb61beba41013 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 10:45:02 -0400 Subject: [PATCH 084/161] ENH: handle zip file. pass test suite --- .gitignore | 1 + RELEASE.rst | 5 +++++ pandas/io/parsers.py | 35 ++++++++++++++++----------------- pandas/io/tests/test_parsers.py | 4 ++-- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 3782509c5c048..564f27c5cd9c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ pandas/src/tseries.c pandas/src/sparse.c pandas/version.py doc/source/generated +doc/source/_static *flymake* scikits .coverage \ No newline at end of file diff --git a/RELEASE.rst b/RELEASE.rst index dc627ee24b927..690d9fef2e184 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -138,6 +138,7 @@ feedback on the library. - Add more helpful error message when importing pandas post-installation from the source directory (GH #250) + **Bug fixes** - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should @@ -165,6 +166,8 @@ feedback on the library. 
- Passing column names should force `header=None` (GH #257) - Don't modify passed column names when `index_col` is not None (GH #258) + - Can sniff CSV separator in zip file (since seek is not supported, was + failing before) Thanks ------ @@ -291,6 +294,8 @@ infrastructure are the main new additions retrieve groups - Added informative Exception when passing dict to DataFrame groupby aggregation with axis != 0 + - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type + DataFrame objects **API Changes** diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ba8eea47bfe4..6bd33da1b8c60 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,6 +2,8 @@ Module contains tools for processing files into DataFrames or other objects """ +from StringIO import StringIO + import numpy as np from pandas.core.index import Index, MultiIndex @@ -31,10 +33,12 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, dia.delimiter = sep # attempt to sniff the delimiter if sniff_sep: - sample = f.readline() - sniffed = csv.Sniffer().sniff(sample) + line = f.readline() + sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter - f.seek(0) + buf = list(csv.reader(StringIO(line), dialect=dia)) + else: + buf = [] reader = csv.reader(f, dialect=dia) @@ -46,7 +50,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, - chunksize=chunksize) + chunksize=chunksize, buf=buf) if nrows is not None: return parser.get_chunk(nrows) @@ -144,7 +148,7 @@ class TextParser(object): def __init__(self, data, names=None, header=0, index_col=None, na_values=None, parse_dates=False, date_parser=None, - chunksize=None, skiprows=None): + chunksize=None, skiprows=None, buf=None): """ Workhorse function for processing nested list into DataFrame @@ -152,9 +156,10 @@ def __init__(self, data, names=None, header=0, index_col=None, """ self.data = data - self.buf = [] + # can pass rows read so far + self.buf = [] if buf is None else buf + self.pos = len(self.buf) - self.pos = 0 self.names = list(names) if names is not None else names self.header = header self.index_col = index_col @@ -179,7 +184,10 @@ def _infer_columns(self): self.header = None if self.header is not None: - line = self._next_line() + if len(self.buf) > 0: + line = self.buf[0] + else: + line = self._next_line() while self.header > self.pos: line = self._next_line() @@ -196,9 +204,9 @@ def _infer_columns(self): if cur_count > 0: columns[i] = '%s.%d' % (col, cur_count) counts[col] = cur_count + 1 + self._clear_buffer() else: line = self._next_line() - self.buf.append(line) ncols = len(line) if not names: @@ -206,7 +214,6 @@ def _infer_columns(self): else: columns = names - self._clear_buffer() return columns @@ -435,16 +442,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None, datemode = self.book.datemode sheet = self.book.sheet_by_name(sheetname) - if skiprows is None: - skiprows = set() - else: - skiprows = set(skiprows) - data = [] for i in range(sheet.nrows): - if i in skiprows: - continue - row = [] for value, typ in zip(sheet.row_values(i), sheet.row_types(i)): if typ == XL_CELL_DATE: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 632d715ab140b..fb0cf3530bcc6 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -128,8 +128,8 @@ def test_header_with_index_col(self): self.assertEqual(names, ['A', 'B', 'C']) 
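The nrows, chunksize, and iterator options introduced in patch 081, and exercised by tests later in this series, work roughly as follows (usage sketch, Python 2 idioms as in the rest of the tree):

    from cStringIO import StringIO
    from pandas.io.parsers import read_csv

    data = 'index,A,B\nfoo,1,2\nbar,3,4\nbaz,5,6\n'

    head = read_csv(StringIO(data), nrows=2)    # parse only the first 2 rows

    reader = read_csv(StringIO(data), index_col=0, chunksize=2)
    chunks = list(reader)                       # DataFrames of up to 2 rows each

    it = read_csv(StringIO(data), index_col=0, iterator=True)
    piece = it.get_chunk(2)                     # pull the next 2 rows on demand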
- data = [[1,2,3],[4,5,6],[7,8,9]] - expected = DataFrame(data, index=['foo','bar','baz'], + values = [[1,2,3],[4,5,6],[7,8,9]] + expected = DataFrame(values, index=['foo','bar','baz'], columns=['A','B','C']) assert_frame_equal(df, expected) From 36462470ba72853b161155a65d6f85a9bc80e8d0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 11:21:21 -0400 Subject: [PATCH 085/161] ENH: tweaks. tests needed --- pandas/io/parsers.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6bd33da1b8c60..357ed0e23a879 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3,6 +3,7 @@ """ from StringIO import StringIO +import zipfile import numpy as np @@ -10,7 +11,6 @@ from pandas.core.frame import DataFrame import pandas._tseries as lib - def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, skiprows=None, na_values=None, parse_dates=False, date_parser=None, nrows=None, iterator=False, chunksize=None): @@ -117,6 +117,18 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, """ % (_parser_params % _table_sep) +class BufferedReader(object): + """ + For handling different kinds of files, e.g. zip files where reading out a + chunk of lines is faster than reading out one line at a time. + """ + + def __init__(self, fh, delimiter=','): + pass + +class BufferedCSVReader(BufferedReader): + pass + class TextParser(object): """ Converts lists of lists/tuples into DataFrames with proper type inference @@ -176,6 +188,7 @@ def __init__(self, data, names=None, header=0, index_col=None, self.columns = self._infer_columns() self.index_name = self._get_index_name() + self._first_chunk = True def _infer_columns(self): names = self.names @@ -238,7 +251,8 @@ def _clear_buffer(self): def __iter__(self): try: - yield self.get_chunk(self.chunksize) + while True: + yield self.get_chunk(self.chunksize) except StopIteration: pass @@ -280,7 +294,16 @@ def _get_index_name(self): return index_name def get_chunk(self, rows=None): - content = self._get_lines(rows) + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False if len(content) == 0: # pragma: no cover if self.index_col is not None: @@ -357,7 +380,9 @@ def _get_lines(self, rows=None): while True: lines.append(source.next()) except StopIteration: - pass + if len(lines) == 0: + raise + self.pos += len(lines) self.buf = [] From 829e9e532baaa92bdd7ab7167a56b9094734d0ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 14:40:42 -0400 Subject: [PATCH 086/161] BUG: sql int column with null should result in f8 dtype --- pandas/io/tests/test_parsers.py | 5 +++++ pandas/src/parsing.pyx | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index fb0cf3530bcc6..2da9ebaee90d3 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -241,6 +241,11 @@ def test_convert_sql_column_ints(self): assert_same_values_and_dtype(result, expected) assert_same_values_and_dtype(result2, expected) + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + def test_convert_sql_column_bools(self): arr = np.array([True, False, True, 
False], dtype='O') result = lib.convert_sql_column(arr) diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index cd8ee0b568b5b..044c0ac058aca 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -135,6 +135,7 @@ def convert_sql_column(ndarray[object] objects): floats[i] = val seen_float = 1 elif not (cpython.PyString_Check(val) or cpython.PyUnicode_Check(val)): + # this will convert Decimal objects try: floats[i] = float(val) seen_float = 1 @@ -142,7 +143,7 @@ def convert_sql_column(ndarray[object] objects): pass if seen_null: - if seen_float: + if seen_float or seen_int: return floats else: return objects From e7b36bca8b32f48f03fa6d49dee69708459e2193 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 18:25:42 -0400 Subject: [PATCH 087/161] BUG: don't convert Python long to float --- pandas/io/tests/test_parsers.py | 11 +++++++++++ pandas/src/parsing.pyx | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 2da9ebaee90d3..0301942335edd 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -246,6 +246,17 @@ def test_convert_sql_column_ints(self): expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') assert_same_values_and_dtype(result, expected) + def test_convert_sql_column_longs(self): + arr = np.array([1L, 2L, 3L, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + + arr = np.array([1L, 2L, 3L, None, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + def test_convert_sql_column_bools(self): arr = np.array([True, False, True, False], dtype='O') result = lib.convert_sql_column(arr) diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 044c0ac058aca..486c2550c40c3 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -126,7 +126,7 @@ def convert_sql_column(ndarray[object] objects): elif cpython.PyBool_Check(val): seen_bool = 1 bools[i] = val - elif cpython.PyInt_Check(val): + elif cpython.PyInt_Check(val) or cpython.PyLong_Check(val): seen_int = 1 floats[i] = val if not seen_null: From 06130c6dd9d6f91fb4c7d1b33b6fcccb39f1f5e5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 18:43:59 -0400 Subject: [PATCH 088/161] ENH: rename histogram->value_counts and sort descending, GH #265 --- pandas/core/series.py | 11 ++++++----- pandas/tests/test_series.py | 10 +++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4e04fca30de9a..0701512c5ae0f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -592,20 +592,21 @@ def _count_level(self, level): return Series(result, index=level_index) - def histogram(self): + def value_counts(self): """ - Returns Series containing counts of unique values. The result Series's - index will be the sorted unique values + Returns Series containing counts of unique values. 
The resulting Series + will be in descending order so that the first element is the most + frequently-occurring element Returns ------- - histogram : Series + counts : Series """ from collections import defaultdict counter = defaultdict(lambda: 0) for value in self.values: counter[value] += 1 - return Series(counter) + return Series(counter).order(ascending=False) def sum(self, axis=0, dtype=None, out=None, skipna=True): """ diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index b6214fc9461d4..08d32b54c3038 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -759,14 +759,14 @@ def test_count(self): self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) - def test_histogram(self): - s = Series(['a', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a']) - hist = s.histogram() - expected = Series([3, 3, 1, 2], index=['a', 'b', 'c', 'd']) + def test_value_counts(self): + s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a']) + hist = s.value_counts() + expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) assert_series_equal(hist, expected) s = Series({}) - hist = s.histogram() + hist = s.value_counts() expected = Series([]) assert_series_equal(hist, expected) From 9ddfc57c383ae4ce36b81fa14a227a1e0037c552 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 18:54:03 -0400 Subject: [PATCH 089/161] BUG: Int64Index.take and MultiIndex.take don't lost names, GH #262 --- RELEASE.rst | 2 ++ pandas/core/index.py | 5 +++-- pandas/tests/test_index.py | 9 +++++++++ pandas/tests/test_multilevel.py | 4 ++++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 690d9fef2e184..840f7cd61e051 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -158,6 +158,8 @@ feedback on the library. - `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute - Panel.__repr__ raised exception on length-0 major/minor axes - `DataFrame.join` on key with empty DataFrame produced incorrect columns + - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream + issue GH #262 - `read_csv` / `read_table` fixes - Be less aggressive about converting float->int in cases of floating point representations of integers like 1.0, 2.0, etc. 
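The guarantee pinned down by the new tests, and delivered by the index.py change that follows (sketch):

    from pandas.core.index import Int64Index

    idx = Int64Index([1, 2, 3, 4], name='foo')
    taken = idx.take([3, 0, 1])
    assert taken.name == 'foo'   # the name now survives take()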
diff --git a/pandas/core/index.py b/pandas/core/index.py index d54e97fdd45dc..4a59722a1337f 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -759,7 +759,7 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ taken = self.values.take(*args, **kwargs) - return Int64Index(taken) + return Int64Index(taken, name=self.name) class DateIndex(Index): pass @@ -1064,7 +1064,8 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ new_labels = [lab.take(*args, **kwargs) for lab in self.labels] - return MultiIndex(levels=self.levels, labels=new_labels) + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names) def append(self, other): """ diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 13b08a25aff79..43c1b44c1ae5c 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -556,6 +556,11 @@ def test_prevent_casting(self): result = self.index.astype('O') self.assert_(result.dtype == np.object_) + def test_take_preserve_name(self): + index = Int64Index([1,2,3,4], name='foo') + taken = index.take([3,0,1]) + self.assertEqual(index.name, taken.name) + class TestMultiIndex(unittest.TestCase): def setUp(self): @@ -967,6 +972,10 @@ def test_insert(self): # key wrong length self.assertRaises(Exception, self.index.insert, 0, ('foo2',)) + def test_take_preserve_name(self): + taken = self.index.take([3,0,1]) + self.assertEqual(taken.names, self.index.names) + class TestFactor(unittest.TestCase): def setUp(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index f4a93a5c0bb6f..ed15d49e0c2d5 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -75,6 +75,10 @@ def test_reindex_preserve_levels(self): chunk = ymdT.ix[:, new_index] self.assert_(chunk.columns is new_index) + def test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEquals(result.index.names, self.frame.index.names) + def test_repr_to_string(self): repr(self.frame) repr(self.ymd) From 92dea5e6c3f3b4e1d635cc13072d9d0019bcbadb Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 20 Oct 2011 17:50:51 +0100 Subject: [PATCH 090/161] Fix for IPython completion with non-string column names. --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eca9ee6ea355b..b8f54e867ff3e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3216,7 +3216,7 @@ def install_ipython_completers(): @complete_object.when_type(DataFrame) def complete_dataframe(obj, prev_completions): return prev_completions + [c for c in obj.columns \ - if py3compat.isidentifier(c)] + if isinstance(c, basestring) and py3compat.isidentifier(c)] # Importing IPython brings in about 200 modules, so we want to avoid it unless # we're in IPython (when those modules are loaded anyway). 
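The isinstance guard in patch 090 above matters because the completer feeds every column name through py3compat.isidentifier, which assumes a string, so an integer column label would raise. A sketch of the filtering it now performs, assuming the pandas.util.py3compat module referenced in the diff:

    from pandas import DataFrame
    from pandas.util import py3compat

    def completable_columns(df):
        # only string columns that are valid identifiers can be
        # tab-completed as attributes, e.g. df.foo
        return [c for c in df.columns
                if isinstance(c, basestring) and py3compat.isidentifier(c)]

    df = DataFrame({'foo': [1], 'bar baz': [2], 123: [3]})
    completable_columns(df)   # -> ['foo']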
From d076438105a0eb88124d656f6b2c14231a79a21c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 20 Oct 2011 23:50:51 -0400 Subject: [PATCH 091/161] BUG: csv reader bugfixes and test coverage --- pandas/io/parsers.py | 29 +++++------- pandas/io/tests/test_parsers.py | 82 ++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 357ed0e23a879..4ca12c3debd6b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -201,7 +201,8 @@ def _infer_columns(self): line = self.buf[0] else: line = self._next_line() - while self.header > self.pos: + + while self.pos <= self.header: line = self._next_line() columns = [] @@ -227,7 +228,6 @@ def _infer_columns(self): else: columns = names - return columns def _next_line(self): @@ -258,6 +258,7 @@ def __iter__(self): def _get_index_name(self): columns = self.columns + passed_names = self.names is not None try: line = self._next_line() @@ -265,31 +266,23 @@ def _get_index_name(self): line = None # implicitly index_col=0 b/c 1 fewer column names - index_name = None implicit_first_col = (line is not None and len(line) == len(columns) + 1) - passed_names = self.names is not None - + index_name = None if implicit_first_col: if self.index_col is None: self.index_col = 0 index_name = None elif np.isscalar(self.index_col): - if passed_names: - index_name = None - else: - index_name = columns.pop(self.index_col) + index_name = columns.pop(self.index_col) elif self.index_col is not None: - if not passed_names: - cp_cols = list(columns) - index_name = [] - for i in self.index_col: - name = cp_cols[i] - columns.remove(name) - index_name.append(name) - else: - index_name=None + cp_cols = list(columns) + index_name = [] + for i in self.index_col: + name = cp_cols[i] + columns.remove(name) + index_name.append(name) return index_name diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0301942335edd..cc34af97c8008 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -14,6 +14,14 @@ import pandas._tseries as lib class TestParsers(unittest.TestCase): + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" def setUp(self): self.dirpath = curpath() @@ -181,7 +189,9 @@ def test_read_table_duplicate_index(self): foo,12,13,14,15 bar,12,13,14,15 """ - self.assertRaises(Exception, read_csv, StringIO(data), index_col=0) + + self.assertRaises(Exception, read_csv, StringIO(data), + index_col=0) def test_parse_bools(self): data = """A,B @@ -211,6 +221,76 @@ def test_infer_index_col(self): data = read_csv(StringIO(data)) self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + def test_read_nrows(self): + df = read_csv(StringIO(self.data1), nrows=3) + expected = read_csv(StringIO(self.data1))[:3] + assert_frame_equal(df, expected) + + def test_read_chunksize(self): + reader = read_csv(StringIO(self.data1), index_col=0, chunksize=2) + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(reader) + + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + def test_iterator(self): + reader = read_csv(StringIO(self.data1), index_col=0, iterator=True) + df = read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.get_chunk(3) + assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.get_chunk(5) + assert_frame_equal(last_chunk, df[3:]) + + def 
test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = read_csv(StringIO(data), header=2, index_col=0) + expected = read_csv(StringIO(data2), header=0, index_col=0) + assert_frame_equal(df, expected) + + def test_pass_names_with_index(self): + lines = self.data1.split('\n') + no_header = '\n'.join(lines[1:]) + + # regular index + names = ['index', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=0, names=names) + expected = read_csv(StringIO(self.data1), index_col=0) + assert_frame_equal(df, expected) + + # multi index + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['index1', 'index2', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) + + def test_multi_index_no_level_names(self): + pass class TestParseSQL(unittest.TestCase): From 89fecc6af099175c78613c583132f2e042dfbbfe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 00:06:46 -0400 Subject: [PATCH 092/161] TST: parser test coverage --- pandas/io/parsers.py | 11 ++++++----- pandas/io/tests/test_parsers.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4ca12c3debd6b..f792e175061da 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -258,7 +258,6 @@ def __iter__(self): def _get_index_name(self): columns = self.columns - passed_names = self.names is not None try: line = self._next_line() @@ -266,13 +265,15 @@ def _get_index_name(self): line = None # implicitly index_col=0 b/c 1 fewer column names - implicit_first_col = (line is not None and - len(line) == len(columns) + 1) + if line is not None: + implicit_first_cols = len(line) - len(columns) + else: + implicit_first_cols = 0 index_name = None - if implicit_first_col: + if implicit_first_cols > 0: if self.index_col is None: - self.index_col = 0 + self.index_col = range(implicit_first_cols) index_name = None elif np.isscalar(self.index_col): index_name = columns.pop(self.index_col) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index cc34af97c8008..dc253dffdf11a 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1,5 +1,6 @@ from cStringIO import StringIO from datetime import datetime +import csv import os import unittest @@ -9,7 +10,7 @@ import numpy as np from pandas import DataFrame, Index -from pandas.io.parsers import read_csv, read_table, ExcelFile +from pandas.io.parsers import read_csv, read_table, ExcelFile, TextParser from pandas.util.testing import assert_almost_equal, assert_frame_equal import pandas._tseries as lib @@ -246,6 +247,17 @@ def test_iterator(self): last_chunk = reader.get_chunk(5) assert_frame_equal(last_chunk, df[3:]) + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + def test_header_not_first_line(self): data = 
"""got,to,ignore,this,line got,to,ignore,this,line @@ -290,7 +302,19 @@ def test_pass_names_with_index(self): assert_frame_equal(df, expected) def test_multi_index_no_level_names(self): - pass + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) class TestParseSQL(unittest.TestCase): From 24ed9ed8c9502ebae552d691d815e1c58a2c0fd8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 00:29:42 -0400 Subject: [PATCH 093/161] BUG: implement MultiIndex.diff, add & and | for intersection/union, GH #260 --- RELEASE.rst | 2 ++ pandas/core/index.py | 28 ++++++++++++++++++++++++++-- pandas/tests/test_index.py | 19 +++++++++++++++++-- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 840f7cd61e051..b32616965744a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -114,6 +114,7 @@ feedback on the library. - Add inner join option to `DataFrame.join` when joining on key(s) (GH #248) - Can select set of DataFrame columns by passing a list to `__getitem__` (GH #253) + - Can use & and | to intersection / union Index objects, respectively **Improvements to existing features** @@ -158,6 +159,7 @@ feedback on the library. - `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute - Panel.__repr__ raised exception on length-0 major/minor axes - `DataFrame.join` on key with empty DataFrame produced incorrect columns + - Implemented `MultiIndex.diff` (GH #260) - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream issue GH #262 - `read_csv` / `read_table` fixes diff --git a/pandas/core/index.py b/pandas/core/index.py index 4a59722a1337f..a692da4218e6b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -259,6 +259,15 @@ def __add__(self, other): __le__ = _indexOp('__le__') __ge__ = _indexOp('__ge__') + def __sub__(self, other): + return self.diff(other) + + def __and__(self, other): + return self.intersection(other) + + def __or__(self, other): + return self.union(other) + def union(self, other): """ Form the union of two Index objects and sorts if possible @@ -363,8 +372,6 @@ def diff(self, other): theDiff = sorted(set(self) - set(otherArr)) return Index(theDiff) - __sub__ = diff - def get_loc(self, key): """ Get integer location for requested label @@ -1514,6 +1521,23 @@ def intersection(self, other): uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0) + def diff(self, other): + """ + Compute sorted set difference of two MultiIndex objects + + Returns + ------- + diff : MultiIndex + """ + self._assert_can_do_setop(other) + + if self.equals(other): + return self[:0] + + difference = sorted(set(self.values) - set(other.values)) + return MultiIndex.from_tuples(difference, sortorder=0, + names=self.names) + def _assert_can_do_setop(self, other): if not isinstance(other, MultiIndex): raise TypeError('can only call with other hierarchical ' diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 43c1b44c1ae5c..9bb2a048fd93a 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -863,7 +863,7 @@ def test_union(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] - 
the_union = piece1.union(piece2) + the_union = piece1 | piece2 tups = sorted(self.index.get_tuple_index()) expected = MultiIndex.from_tuples(tups) @@ -884,7 +884,7 @@ def test_intersection(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] - the_int = piece1.intersection(piece2) + the_int = piece1 & piece2 tups = sorted(self.index[3:5].get_tuple_index()) expected = MultiIndex.from_tuples(tups) self.assert_(the_int.equals(expected)) @@ -896,6 +896,21 @@ def test_intersection(self): self.assertRaises(TypeError, self.index.intersection, self.index.get_tuple_index()) + def test_diff(self): + first = self.index + result = first - self.index[-3:] + expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), + sortorder=0, + names=self.index.names) + + self.assert_(isinstance(result, MultiIndex)) + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + result = first - first + expected = first[:0] + self.assert_(result.equals(expected)) + def test_argsort(self): result = self.index.argsort() expected = self.index.get_tuple_index().argsort() From c93ded77c823092bb4bd94c492b7c340bc8b5f3c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 00:48:19 -0400 Subject: [PATCH 094/161] DOC: release notes --- RELEASE.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index b32616965744a..9ddc1bedad715 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -104,6 +104,9 @@ feedback on the library. - Added `DataFrame.align` method with standard join options - Added `parse_dates` option to `read_csv` and `read_table` methods to optionally try to parse dates in the index columns + - Add `nrows`, `chunksize`, and `iterator` arguments to `read_csv` and + `read_table`. The last two return a new `TextParser` class capable of + lazily iterating through chunks of a flat file (GH #242) - Added ability to join on multiple columns in `DataFrame.join` (GH #214) - Added private `_get_duplicates` function to `Index` for identifying duplicate values more easily @@ -114,7 +117,8 @@ feedback on the library. 
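The operator support recorded in these notes (patch 093 above) treats & / | / - as set operations on Index objects; a usage sketch:

    from pandas import Index

    a = Index(['a', 'b', 'c'])
    b = Index(['b', 'c', 'd'])

    a & b   # intersection -> Index(['b', 'c'])
    a | b   # union        -> Index(['a', 'b', 'c', 'd'])
    a - b   # difference   -> Index(['a'])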
- Add inner join option to `DataFrame.join` when joining on key(s) (GH #248) - Can select set of DataFrame columns by passing a list to `__getitem__` (GH #253) - - Can use & and | to intersection / union Index objects, respectively + - Can use & and | to intersection / union Index objects, respectively (GH + #261) **Improvements to existing features** From 1b30cfc4244206e411de2e5c3734c6d6f6164bbe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 11:42:29 -0400 Subject: [PATCH 095/161] TST: be less lazy about empty MultiIndex set difference, check names --- pandas/core/index.py | 8 ++++++-- pandas/tests/test_index.py | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index a692da4218e6b..b7e4830c26e3e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1531,12 +1531,16 @@ def diff(self, other): """ self._assert_can_do_setop(other) + result_names = self.names if self.names == other.names else None + if self.equals(other): - return self[:0] + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) difference = sorted(set(self.values) - set(other.values)) return MultiIndex.from_tuples(difference, sortorder=0, - names=self.names) + names=result_names) def _assert_can_do_setop(self, other): if not isinstance(other, MultiIndex): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9bb2a048fd93a..0b6ff7abbfd73 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -907,10 +907,17 @@ def test_diff(self): self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) + # empty difference result = first - first expected = first[:0] self.assert_(result.equals(expected)) + # names not the same + chunklet = self.index[-3:] + chunklet.names = ['foo', 'baz'] + result = first - chunklet + self.assertEqual(result.names, [None, None]) + def test_argsort(self): result = self.index.argsort() expected = self.index.get_tuple_index().argsort() From 20ae0eda38971e92f9f90d19eb8e41e8c7e7ba9d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 12:07:06 -0400 Subject: [PATCH 096/161] BUG: should not pass empty list to from_tuples in diff --- pandas/core/index.py | 10 ++++++++-- pandas/tests/test_index.py | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index b7e4830c26e3e..962a68b0e61cf 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1539,8 +1539,14 @@ def diff(self, other): names=result_names) difference = sorted(set(self.values) - set(other.values)) - return MultiIndex.from_tuples(difference, sortorder=0, - names=result_names) + + if not difference: + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) + else: + return MultiIndex.from_tuples(difference, sortorder=0, + names=result_names) def _assert_can_do_setop(self, other): if not isinstance(other, MultiIndex): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 0b6ff7abbfd73..376a9150a01a9 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -918,6 +918,10 @@ def test_diff(self): result = first - chunklet self.assertEqual(result.names, [None, None]) + # empty, but non-equal + result = self.index - self.index.sortlevel(1)[0] + self.assert_(len(result) == 0) + def test_argsort(self): result = self.index.argsort() expected = self.index.get_tuple_index().argsort() From 
b82a93fa5b8245e460e80338f287398c8e90d6af Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 12:08:29 -0400 Subject: [PATCH 097/161] BUG: raise more helpful exception when passing empty list to MultiIndex.from_tuples --- pandas/core/index.py | 2 ++ pandas/tests/test_index.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pandas/core/index.py b/pandas/core/index.py index 962a68b0e61cf..eb170c8a9d11c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -999,6 +999,8 @@ def from_tuples(cls, tuples, sortorder=None, names=None): ------- index : MultiIndex """ + if len(tuples) == 0: + raise Exception('Cannot infer number of levels from empty list') arrays = zip(*tuples) return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 376a9150a01a9..d2308f4e5e126 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -922,6 +922,9 @@ def test_diff(self): result = self.index - self.index.sortlevel(1)[0] self.assert_(len(result) == 0) + def test_from_tuples(self): + self.assertRaises(Exception, MultiIndex.from_tuples, []) + def test_argsort(self): result = self.index.argsort() expected = self.index.get_tuple_index().argsort() From f789bb7c8f7825ca287e0927fa11c6b1513fff3e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 13:12:01 -0400 Subject: [PATCH 098/161] BUG: passing list of tuples to Series constructor failed, GH #270 --- RELEASE.rst | 21 +++++++++++---------- pandas/core/groupby.py | 7 ++++++- pandas/core/series.py | 10 ++++++---- pandas/tests/test_groupby.py | 2 +- pandas/tests/test_series.py | 5 +++++ 5 files changed, 29 insertions(+), 16 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 9ddc1bedad715..4accf2ade357c 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -146,6 +146,16 @@ feedback on the library. **Bug fixes** + - `read_csv` / `read_table` fixes + - Be less aggressive about converting float->int in cases of floating point + representations of integers like 1.0, 2.0, etc. + - "True"/"False" will not get correctly converted to boolean + - Index name attribute will get set when specifying an index column + - Passing column names should force `header=None` (GH #257) + - Don't modify passed column names when `index_col` is not + None (GH #258) + - Can sniff CSV separator in zip file (since seek is not supported, was + failing before) - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should be reported upstream to matplotlib (GH #224) - DataFrame.iteritems was not returning Series with the name attribute @@ -166,16 +176,7 @@ feedback on the library. - Implemented `MultiIndex.diff` (GH #260) - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream issue GH #262 - - `read_csv` / `read_table` fixes - - Be less aggressive about converting float->int in cases of floating point - representations of integers like 1.0, 2.0, etc. 
- - "True"/"False" will not get correctly converted to boolean - - Index name attribute will get set when specifying an index column - - Passing column names should force `header=None` (GH #257) - - Don't modify passed column names when `index_col` is not - None (GH #258) - - Can sniff CSV separator in zip file (since seek is not supported, was - failing before) + - Can pass list of tuples to `Series` (GH #270) Thanks ------ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f662d8ea01c31..216669c1dbf29 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -760,7 +760,10 @@ def _aggregate_simple(self, func, *args, **kwargs): values = self.obj.values result = {} for k, v in self.primary.indices.iteritems(): - result[k] = func(values.take(v), *args, **kwargs) + agged = func(values.take(v), *args, **kwargs) + if isinstance(output, np.ndarray): + raise Exception('Must produce aggregated value') + result[k] = agged return result @@ -771,6 +774,8 @@ def _aggregate_named(self, func, *args, **kwargs): grp = self.get_group(name) grp.name = name output = func(grp, *args, **kwargs) + if isinstance(output, np.ndarray): + raise Exception('Must produce aggregated value') result[name] = output return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 0701512c5ae0f..f3c32ece030b8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,8 @@ import numpy as np from pandas.core.common import (isnull, notnull, _is_bool_indexer, - _default_index, _maybe_upcast) + _default_index, _maybe_upcast, + _asarray_tuplesafe) from pandas.core.daterange import DateRange from pandas.core.generic import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index @@ -109,7 +110,6 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False): index = Index(sorted(data.keys())) data = [data.get(idx, np.nan) for idx in index] - # Create array, do *not* copy data by default, infer type try: subarr = np.array(data, dtype=dtype, copy=copy) except ValueError: @@ -139,9 +139,11 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False): subarr.fill(value) else: return subarr.item() - elif subarr.ndim > 1: - raise Exception('Data must be 1-dimensional') + if isinstance(data, np.ndarray): + raise Exception('Data must be 1-dimensional') + else: + subarr = _asarray_tuplesafe(data, dtype=dtype) if index is None: index = _default_index(len(subarr)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d7c8b679a2105..13204044df9f1 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -298,7 +298,7 @@ def test_series_describe_multikey(self): def test_series_describe_single(self): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) - result = grouped.agg(lambda x: x.describe()) + result = grouped.apply(lambda x: x.describe()) expected = grouped.describe() assert_frame_equal(result, expected) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 08d32b54c3038..08c6826f8b872 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -157,6 +157,11 @@ def test_constructor_dict(self): expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) assert_series_equal(result, expected) + def test_constructor_tuples(self): + data = [(1, 1), (2, 2), (2, 3)] + s = Series(data) + self.assertEqual(list(s), data) + def test_fromDict(self): data = {'a' : 0, 'b' : 1, 'c' : 2, 'd' : 3} From a6634122b4ec2f249714cb20629edb17ac37a312 Mon Sep 17 
00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 13:40:16 -0400 Subject: [PATCH 099/161] ENH: pivot table docs and basic tests, GH #234 --- pandas/__init__.py | 1 + pandas/tools/pivot.py | 44 ++++++++++++++++++++++++-- pandas/tools/tests/test_pivot.py | 54 ++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 pandas/tools/tests/test_pivot.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 48a46ed98305b..ae69b6f7a907f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -26,3 +26,4 @@ from pandas.stats.api import * from pandas.util.testing import debug +from pandas.tools.pivot import pivot_table diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index cb235f5b07341..132639d8672a4 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -4,9 +4,49 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean, fill_value=None): """ - + Create a spreadsheet-style pivot table as a DataFrame. The levels in the + pivot table will be stored in MultiIndex objects (hierarchical indexes) on + the index and columns of the result DataFrame + + Parameters + ---------- + data : DataFrame + values : column to aggregate, optional + xby : list + Columns to group on the x-axis of the pivot table + yby : list + Columns to group on the x-axis of the pivot table + aggfunc : function, default numpy.mean + fill_value : scalar, default None + Value to replace missing values with + + Examples + -------- + >>> df + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 + + >>> table = pivot_table(df, values='D', xby=['A, 'B'], + yby=['C'], aggfunc=np.sum) + >>> table + small large + foo one 1 4 + two 6 NaN + bar one 5 4 + two 6 7 + + Returns + ------- + table : DataFrame """ - xby = [] if xby is None else list(xby) yby = [] if yby is None else list(yby) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py new file mode 100644 index 0000000000000..2b8b473dc40f6 --- /dev/null +++ b/pandas/tools/tests/test_pivot.py @@ -0,0 +1,54 @@ +import unittest + +import numpy as np + +from pandas import DataFrame +from pandas.tools.pivot import pivot_table +from pandas.util.testing import assert_frame_equal + +class TestPivotTable(unittest.TestCase): + + def setUp(self): + self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11)}) + + def test_pivot_table(self): + xby = ['A', 'B'] + yby= ['C'] + table = pivot_table(self.data, values='D', xby=xby, yby=yby) + + if len(xby) > 1: + self.assertEqual(table.index.names, xby) + else: + self.assertEqual(table.index.name, xby[0]) + + if len(yby) > 1: + self.assertEqual(table.columns.names, yby) + else: + self.assertEqual(table.columns.name, yby[0]) + + expected = self.data.groupby(xby + yby)['D'].agg(np.mean).unstack() + assert_frame_equal(table, expected) + + def test_pivot_table_multiple(self): + xby = ['A', 'B'] + yby= ['C'] + table = pivot_table(self.data, xby=xby, yby=yby) + expected = self.data.groupby(xby + yby).agg(np.mean).unstack() + assert_frame_equal(table, expected) + +if __name__ == '__main__': + import nose + 
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + From cb819780b598cbfe885a8040a2db32fa103bdcfb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 14:08:59 -0400 Subject: [PATCH 100/161] ENH: implement Panel.rename_axis, GH #243 --- RELEASE.rst | 2 ++ pandas/core/common.py | 1 + pandas/core/frame.py | 21 ++++----------------- pandas/core/generic.py | 37 +++++++++++++++++++++++++++++++++++++ pandas/core/series.py | 23 ++++++++++++++--------- pandas/tests/test_panel.py | 21 ++++++++++++++++++++- 6 files changed, 78 insertions(+), 27 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 4accf2ade357c..df1b5cf8d5443 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -119,6 +119,8 @@ feedback on the library. #253) - Can use & and | to intersection / union Index objects, respectively (GH #261) + - Added `pivot_table` convenience function to pandas namespace (GH #234) + - Implemented `Panel.rename_axis` function (GH #243) **Improvements to existing features** diff --git a/pandas/core/common.py b/pandas/core/common.py index 715c91a990ab7..2d6ec55c6b103 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -531,3 +531,4 @@ def _asarray_tuplesafe(values, dtype=None): result[:] = values return result + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8f54e867ff3e..7386a987c87ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1474,27 +1474,14 @@ def rename(self, index=None, columns=None, copy=True): ------- renamed : DataFrame (new object) """ - if isinstance(index, (dict, Series)): - def index_f(x): - if x in index: - return index[x] - else: - return x - else: - index_f = index - - if isinstance(columns, (dict, Series)): - def columns_f(x): - if x in columns: - return columns[x] - else: - return x - else: - columns_f = columns + from pandas.core.series import _get_rename_function if index is None and columns is None: raise Exception('must pass either index or columns') + index_f = _get_rename_function(index) + columns_f = _get_rename_function(columns) + self._consolidate_inplace() result = self.copy(deep=copy) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be61619b74424..c108201f54905 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -458,3 +458,40 @@ def add_suffix(self, suffix): """ new_data = self._data.add_suffix(suffix) return self._constructor(new_data) + + def rename_axis(self, mapper, axis=0, copy=True): + """ + Alter index and / or columns using input function or + functions. Function / dict values must be unique (1-to-1). Labels not + contained in a dict / Series will be left as-is. 
+ + Parameters + ---------- + index : dict-like or function, optional + Transformation to apply to index values + columns : dict-like or function, optional + Transformation to apply to column values + copy : boolean, default True + Also copy underlying data + + See also + -------- + Series.rename + + Returns + ------- + renamed : DataFrame (new object) + """ + # should move this at some point + from pandas.core.series import _get_rename_function + + mapper_f = _get_rename_function(mapper) + + if axis == 0: + new_data = self._data.rename_items(mapper_f, copydata=copy) + else: + new_data = self._data.rename_axis(mapper_f, axis=axis) + if copy: + new_data = new_data.copy() + + return self._constructor(new_data) diff --git a/pandas/core/series.py b/pandas/core/series.py index f3c32ece030b8..c79cddcc3e53f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1858,15 +1858,7 @@ def rename(self, mapper): ------- renamed : Series (new object) """ - if isinstance(mapper, (dict, Series)): - def mapper_f(x): - if x in mapper: - return mapper[x] - else: - return x - else: - mapper_f = mapper - + mapper_f = _get_rename_function(mapper) result = self.copy() result.index = [mapper_f(x) for x in self.index] @@ -1888,3 +1880,16 @@ def remove_na(arr): Return array containing only true/non-NaN values, possibly empty. """ return arr[notnull(arr)] + + +def _get_rename_function(mapper): + if isinstance(mapper, (dict, Series)): + def f(x): + if x in mapper: + return mapper[x] + else: + return x + else: + f = mapper + + return f diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index cb3e7b4a993b3..387839c09506a 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -830,6 +830,26 @@ def test_repr_empty(self): empty = Panel() repr(empty) + def test_rename(self): + mapper = { + 'ItemA' : 'foo', + 'ItemB' : 'bar', + 'ItemC' : 'baz' + } + + renamed = self.panel.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_(renamed.items.equals(exp)) + + renamed = self.panel.rename_axis(str.lower, axis=2) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_(renamed.minor_axis.equals(exp)) + + # don't copy + renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. 
+ self.assert_((self.panel['ItemA'].values == 3).all()) + class TestLongPanel(unittest.TestCase): def setUp(self): @@ -1149,7 +1169,6 @@ def test_pivot(self): df = pivot(np.array([]), np.array([]), np.array([])) - def test_group_agg(): values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) bounds = np.arange(5) * 2 From 62243c1a692f6e884de0c596c6551724a34f04fe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 14:17:49 -0400 Subject: [PATCH 101/161] ENH: add general save/load functions to pandas namespace --- pandas/core/api.py | 3 ++- pandas/core/common.py | 38 ++++++++++++++++++++++++++++++++++++++ pandas/core/generic.py | 33 ++++++++++++--------------------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index c6552e2df3caf..3b4e2c4e50cce 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,7 +5,7 @@ from pandas.core.datetools import DateOffset import pandas.core.datetools as datetools -from pandas.core.common import isnull, notnull, set_printoptions +from pandas.core.common import isnull, notnull, set_printoptions, save, load from pandas.core.index import Index, Int64Index, Factor, MultiIndex from pandas.core.daterange import DateRange from pandas.core.series import Series, TimeSeries @@ -17,3 +17,4 @@ DataMatrix = DataFrame WidePanel = Panel + diff --git a/pandas/core/common.py b/pandas/core/common.py index 2d6ec55c6b103..259c428c1e140 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1,6 +1,7 @@ """ Misc tools for implementing data structures """ +import cPickle try: from io import BytesIO except ImportError: # Python < 2.6 @@ -532,3 +533,40 @@ def _asarray_tuplesafe(values, dtype=None): return result + +def save(obj, path): + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + obj : any object + path : string + File path + """ + f = open(path, 'wb') + try: + cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) + finally: + f.close() + + +def load(path): + """ + Load pickled pandas object (or any other pickled object) from the specified + file path + + Parameters + ---------- + path : string + File path + + Returns + ------- + unpickled : type of object stored in file + """ + f = open(path, 'rb') + try: + return cPickle.load(f) + finally: + f.close() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c108201f54905..8689542035107 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,6 +1,7 @@ import numpy as np import cPickle +from pandas.core.common import save, load from pandas.core.index import Index, MultiIndex, _ensure_index import pandas.core.datetools as datetools @@ -9,20 +10,12 @@ class Picklable(object): - def save(self, fileName): - f = open(fileName, 'wb') - try: - cPickle.dump(self, f, protocol=cPickle.HIGHEST_PROTOCOL) - finally: - f.close() + def save(self, path): + save(self, path) @classmethod - def load(cls, fileName): - f = open(fileName, 'rb') - try: - return cPickle.load(f) - finally: - f.close() + def load(cls, path): + return load(path) class PandasError(Exception): pass @@ -461,26 +454,24 @@ def add_suffix(self, suffix): def rename_axis(self, mapper, axis=0, copy=True): """ - Alter index and / or columns using input function or - functions. Function / dict values must be unique (1-to-1). Labels not - contained in a dict / Series will be left as-is. + Alter index and / or columns using input function or functions. + Function / dict values must be unique (1-to-1). 
Labels not contained in + a dict / Series will be left as-is. Parameters ---------- - index : dict-like or function, optional - Transformation to apply to index values - columns : dict-like or function, optional - Transformation to apply to column values + mapper : dict-like or function, optional + axis : int, default 0 copy : boolean, default True Also copy underlying data See also -------- - Series.rename + DataFrame.rename Returns ------- - renamed : DataFrame (new object) + renamed : type of caller """ # should move this at some point from pandas.core.series import _get_rename_function From 17f5ef9a4030e02cb5b28a8350ae12199baccdb8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 14:43:24 -0400 Subject: [PATCH 102/161] DOC: release notes --- RELEASE.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index df1b5cf8d5443..fe7e77b6c713e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -8,12 +8,12 @@ see the commit logs at http://github.com/wesm/pandas What is it ---------- -**pandas** is a library of powerful labeled-axis data structures, statistical -tools, and general code for working with relational data sets, including time -series and cross-sectional data. It was designed with the practical needs of -statistical modeling and large, inhomogeneous data sets in mind. It is -particularly well suited for, among other things, financial data analysis -applications. +pandas is a Python package providing fast, flexible, and expressive data +structures designed to make working with “relational” or “labeled” data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, real world data analysis in Python. Additionally, it has the +broader goal of becoming the most powerful and flexible open source data +analysis / manipulation tool available in any language. Where to get it --------------- From fd4c4c91e99ac680c8b20726045212917a0c42a5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 15:12:43 -0400 Subject: [PATCH 103/161] DOC: update read_csv docs --- RELEASE.rst | 6 +++--- pandas/io/parsers.py | 25 ++++++++++++++++++++----- pandas/io/tests/test_parsers.py | 5 +++++ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index fe7e77b6c713e..8ecca84ed865d 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -18,9 +18,9 @@ analysis / manipulation tool available in any language. 
Where to get it --------------- -Source code: http://github.com/wesm/pandas -Binary installers on PyPI: http://pypi.python.org/pypi/pandas -Documentation: http://pandas.sourceforge.net +* Source code: http://github.com/wesm/pandas +* Binary installers on PyPI: http://pypi.python.org/pypi/pandas +* Documentation: http://pandas.sourceforge.net pandas 0.5.0 ============ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f792e175061da..36323414ae72d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -62,13 +62,17 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, names=None, skiprows=None, na_values=None, parse_dates=False, - date_parser=None): + date_parser=None, nrows=None, iterator=False, chunksize=None): return read_csv(filepath_or_buffer, sep=sep, header=header, skiprows=skiprows, index_col=index_col, na_values=na_values, date_parser=date_parser, - names=names, parse_dates=parse_dates) + names=names, parse_dates=parse_dates, + nrows=nrows, iterator=iterator, chunksize=chunksize) -_parser_params = """Parameters +_parser_params = """Also supports optionally iterating or breaking of the file +into chunks. + +Parameters ---------- filepath_or_buffer : string or file handle / StringIO %s @@ -79,6 +83,8 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, index_col : int or sequence, default None Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. +names : array-like + List of column names na_values : list-like, default None List of additional strings to recognize as NA/NaN parse_dates : boolean, default False @@ -86,8 +92,17 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, date_parser : function Function to use for converting dates to strings. Defaults to dateutil.parser -names : array-like - List of column names""" +nrows : int, default None + Number of rows of file to read. Useful for reading pieces of large files +iterator : boolean, default False + Return TextParser object +chunksize : int, default None + Return TextParser object for iteration + +Returns +------- +result : DataFrame or TextParser +""" _csv_sep = """sep : string, default None Delimiter to use. 
By default will try to automatically determine diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index dc253dffdf11a..79fc39f776b2d 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -239,6 +239,7 @@ def test_read_chunksize(self): def test_iterator(self): reader = read_csv(StringIO(self.data1), index_col=0, iterator=True) + df = read_csv(StringIO(self.data1), index_col=0) chunk = reader.get_chunk(3) @@ -258,6 +259,10 @@ def test_iterator(self): assert_frame_equal(chunks[1], df[2:4]) assert_frame_equal(chunks[2], df[4:]) + treader = read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + self.assert_(isinstance(treader, TextParser)) + def test_header_not_first_line(self): data = """got,to,ignore,this,line got,to,ignore,this,line From b1330135008d7c19c407d9a297ee806b6f51cca0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 17:54:09 -0400 Subject: [PATCH 104/161] BUG: could not parse dates with implicit first column --- pandas/io/parsers.py | 5 ++++- pandas/io/tests/test_parsers.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 36323414ae72d..8ac9ed5697e16 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -288,7 +288,10 @@ def _get_index_name(self): index_name = None if implicit_first_cols > 0: if self.index_col is None: - self.index_col = range(implicit_first_cols) + if implicit_first_cols == 1: + self.index_col = 0 + else: + self.index_col = range(implicit_first_cols) index_name = None elif np.isscalar(self.index_col): index_name = columns.pop(self.index_col) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 79fc39f776b2d..11b47dfb45991 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -110,6 +110,17 @@ def test_csv_custom_parser(self): expected = read_csv(StringIO(data), parse_dates=True) assert_frame_equal(df, expected) + def test_parse_dates_implicit_first_col(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = read_csv(StringIO(data), parse_dates=True) + expected = read_csv(StringIO(data), index_col=0, parse_dates=True) + self.assert_(isinstance(df.index[0], datetime)) + assert_frame_equal(df, expected) + def test_no_header(self): data = """1,2,3,4,5 6,7,8,9,10 @@ -222,6 +233,15 @@ def test_infer_index_col(self): data = read_csv(StringIO(data)) self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + def test_sniff_delimiter(self): + data = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data = read_csv(StringIO(data), index_col=0) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + def test_read_nrows(self): df = read_csv(StringIO(self.data1), nrows=3) expected = read_csv(StringIO(self.data1))[:3] From 196acb8e63af57eeecf6144eaab5a47882793367 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 18:18:23 -0400 Subject: [PATCH 105/161] DOC: update docs about file parsing functions --- RELEASE.rst | 5 +- TODO.rst | 50 +++++++++++-- doc/source/io.rst | 174 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 185 insertions(+), 44 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8ecca84ed865d..8eab725b43a1f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -144,7 +144,8 @@ feedback on the library. 
- Add support for different delimiters in `DataFrame.to_csv` (PR #244) - Add more helpful error message when importing pandas post-installation from the source directory (GH #250) - + - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type + DataFrame objects **Bug fixes** @@ -305,8 +306,6 @@ infrastructure are the main new additions retrieve groups - Added informative Exception when passing dict to DataFrame groupby aggregation with axis != 0 - - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type - DataFrame objects **API Changes** diff --git a/TODO.rst b/TODO.rst index c879c1fdff57d..31cf38c754a22 100644 --- a/TODO.rst +++ b/TODO.rst @@ -5,10 +5,50 @@ DONE TODO ---- -- .name pickling / unpicking / HDFStore handling -- Is there a way to write hierarchical columns to csv? -- Possible to blow away existing name when creating MultiIndex? -- prettytable output with index names -- Add load/save functions to top level pandas namespace - _consolidate, does it always copy? - Series.align with fill method. Will have to generate more Cython code + +TODO docs +--------- + +- read_csv / read_table + - auto-sniff delimiter + - MultiIndex + - generally more documentation + +- pivot_table + +- Set mixed-type values with .ix +- get_dtype_counts / dtypes +- save / load functions +- combine_first +- describe for Series +- DataFrame.to_string +- Index / MultiIndex names +- Unstack / stack by level name +- ignore_index in DataFrame.append +- Inner join on key +- Multi-key joining +- as_index=False in groupby +- is_monotonic +- isnull/notnull as instance methods +- name attribute on Series +- DataFrame.to_csv: different delimiters? +- groupby with level name +- MultiIndex + - get_level_values + +- Update to reflect Python 3 support in intro +- align functions +- df[col_list] +- Panel.rename_axis +- & and | for intersection / union +- IPython tab complete hook + +Performance blog +---------------- +- Series / Time series data alignment +- DataFrame alignment +- Groupby +- joining +- Take diff --git a/doc/source/io.rst b/doc/source/io.rst index f0b58af32e026..5771dade8f7f4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -6,6 +6,7 @@ :suppress: import numpy as np + import os np.random.seed(123456) from pandas import * from StringIO import StringIO @@ -29,9 +30,8 @@ data into a DataFrame object. They can take a number of arguments: - ``path_or_buffer``: Either a string path to a file, or any object with a ``read`` method (such as an open file or ``StringIO``). - - ``delimiter``: For ``read_table`` only, a regular expression to split - fields on. ``read_csv`` uses the ``csv`` module to do this and hence only - supports comma-separated values. + - ``sep``: A delimiter / separator to split fields on. `read_csv` is capable + of inferring automatically "sniffing" the delimiter in some cases - ``header``: row number to use as the column names, and the start of the data. Defaults to 0 (first row); specify None if there is no header row. - ``names``: List of column names to use if header is None. @@ -47,45 +47,89 @@ data into a DataFrame object. They can take a number of arguments: ``dateutil.parser``. Specifying this implicitly sets ``parse_dates`` as True. - ``na_values``: optional list of strings to recognize as NaN (missing values), in addition to a default set. - - -.. 
code-block:: ipython - - In [1]: print open('foo.csv').read() - date,A,B,C - 20090101,a,1,2 - 20090102,b,3,4 - 20090103,c,4,5 - - # A basic index is created by default: - In [3]: read_csv('foo.csv') - Out[3]: - date A B C - 0 20090101 a 1 2 - 1 20090102 b 3 4 - 2 20090103 c 4 5 - - # Use a column as an index, and parse it as dates. - In [3]: df = read_csv('foo.csv', index_col=0, parse_dates=True) - - In [4]: df - Out[4]: - A B C - 2009-01-01 a 1 2 - 2009-01-02 b 3 4 - 2009-01-03 c 4 5 - - # These are python datetime objects - In [16]: df.index - Out[16]: Index([2009-01-01 00:00:00, 2009-01-02 00:00:00, - 2009-01-03 00:00:00], dtype=object) + - ``nrows``: Number of rows to read out of the file. Useful to only read a + small portion of a large file + - ``chunksize``: An number of rows to be used to "chunk" a file into + pieces. Will cause an ``TextParser`` object to be returned. More on this + below in the section on :ref:`iterating and chunking ` + - ``iterator``: If True, return a ``TextParser`` to enable reading a file + into memory piece by piece + +.. ipython:: python + :suppress: + + f = open('foo.csv', 'w') + f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a typical CSV file containing, in this case, some time series data: + +.. ipython:: python + + print open('foo.csv').read() +The default for `read_csv` is to create a DataFrame with simple numbered rows: + +.. ipython:: python + + read_csv('foo.csv') + +In the case of indexed data, you can pass the column number (or a list of +column numbers, for a hierarchical index) you wish to use as the index. If the +index values are dates and you want them to be converted to ``datetime`` +objects, pass ``parse_dates=True``: + +.. ipython:: python + + # Use a column as an index, and parse it as dates. + df = read_csv('foo.csv', index_col=0, parse_dates=True) + df + # These are python datetime objects + df.index + +.. ipython:: python + :suppress: + + os.remove('foo.csv') The parsers make every attempt to "do the right thing" and not be very fragile. Type inference is a pretty big deal. So if a column can be coerced to integer dtype without altering the contents, it will do so. Any non-numeric columns will come through as object dtype as with the rest of pandas objects. +Files with an "implicit" index column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + f = open('foo.csv', 'w') + f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a file with one less entry in the header than the number of data +column: + +.. ipython:: python + + print open('foo.csv').read() + +In this special case, ``read_csv`` assumes that the first column is to be used +as the index of the DataFrame: + +.. ipython:: python + + read_csv('foo.csv') + +Note that the dates weren't automatically parsed. In that case you would need +to do as before: + +.. ipython:: python + + df = read_csv('foo.csv', parse_dates=True) + df.index + + Reading DataFrame objects with ``MultiIndex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -104,6 +148,65 @@ column numbers to turn multiple columns into a ``MultiIndex``: df df.ix[1978] +.. .. _io.sniff: + +.. Automatically "sniffing" the delimiter +.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ``read_csv`` is capable of inferring delimited, but not necessarily +.. comma-separated, files in some cases: + +.. .. ipython:: python + +.. print open('tmp.csv').read() +.. read_csv('tmp.csv') + + + +.. 
_io.chunking: + +Iterating through files chunk by chunk +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose you wish to iterate through a (potentially very large) file lazily +rather than reading the entire file into memory, such as the following: + +.. ipython:: python + :suppress: + + df[:7].to_csv('tmp.sv', delimiter='|') + +.. ipython:: python + + print open('tmp.sv').read() + table = read_table('tmp.sv', sep='|') + table + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +By specifiying a ``chunksize`` to ``read_csv`` or ``read_table``, the return +value will be an iterable object of type ``TextParser``: + +.. ipython:: + + In [1]: reader = read_table('tmp.sv', sep='|', chunksize=4) + + In [1]: reader + + In [2]: for chunk in reader: + ...: print chunk + ...: + +Specifying ``iterator=True`` will also return the ``TextParser`` object: + +.. ipython:: python + + reader = read_table('tmp.sv', sep='|', iterator=True) + reader.get_chunk(5) + Excel 2003 files ---------------- @@ -132,7 +235,6 @@ performance HDF5 format using the excellent `PyTables .. ipython:: python :suppress: - import os os.remove('store.h5') .. ipython:: python From 7754985ecfce2dc75f5719fc255f0898660cb490 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 19:29:10 -0400 Subject: [PATCH 106/161] REF: deprecate nanRep in favor of na_rep, GH #275 --- pandas/core/common.py | 12 ++++++------ pandas/core/frame.py | 27 ++++++++++++++++++++------- pandas/core/series.py | 14 ++++++++++---- pandas/tests/test_frame.py | 2 +- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 259c428c1e140..fcbb2abf27838 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -356,11 +356,11 @@ def set_printoptions(precision=None, column_space=None): _float_format = lambda x: '%.4g' % x _column_space = 12 -def _pfixed(s, space, nanRep=None, float_format=None): +def _pfixed(s, space, na_rep=None, float_format=None): if isinstance(s, float): - if nanRep is not None and isnull(s): + if na_rep is not None and isnull(s): if np.isnan(s): - s = nanRep + s = na_rep return (' %s' % s).ljust(space) if float_format: @@ -386,11 +386,11 @@ def _stringify(col): else: return '%s' % col -def _format(s, nanRep=None, float_format=None): +def _format(s, na_rep=None, float_format=None): if isinstance(s, float): - if nanRep is not None and isnull(s): + if na_rep is not None and isnull(s): if np.isnan(s): - s = nanRep + s = na_rep return ' %s' % s if float_format: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7386a987c87ce..f63540e7cca23 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -468,8 +468,9 @@ def to_sparse(self, fill_value=None, kind='block'): default_kind=kind, default_fill_value=fill_value) - def to_csv(self, path, nanRep='', cols=None, header=True, - index=True, index_label=None, mode='w', delimiter=","): + def to_csv(self, path, na_rep='', cols=None, header=True, + index=True, index_label=None, mode='w', delimiter=",", + nanRep=None): """ Write DataFrame to a comma-separated values (csv) file @@ -495,6 +496,12 @@ def to_csv(self, path, nanRep='', cols=None, header=True, f = open(path, mode) csvout = csv.writer(f, lineterminator='\n', delimiter=delimiter) + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + if cols is None: cols = self.columns @@ -532,7 +539,7 @@ def to_csv(self, path, nanRep='', cols=None, header=True, for i, 
col in enumerate(cols): val = series[col].get(idx) if isnull(val): - val = nanRep + val = na_rep row_fields.append(val) @@ -541,10 +548,16 @@ def to_csv(self, path, nanRep='', cols=None, header=True, f.close() def to_string(self, buf=None, columns=None, colSpace=None, - nanRep='NaN', formatters=None, float_format=None, - sparsify=True): + na_rep='NaN', formatters=None, float_format=None, + sparsify=True, nanRep=None): from pandas.core.common import _format, adjoin + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + return_ = False if buf is None: # pragma: no cover buf = StringIO() @@ -552,11 +565,11 @@ def to_string(self, buf=None, columns=None, colSpace=None, if colSpace is None: def _myformat(v): - return _format(v, nanRep=nanRep, + return _format(v, na_rep=na_rep, float_format=float_format) else: def _myformat(v): - return _pfixed(v, colSpace, nanRep=nanRep, + return _pfixed(v, colSpace, na_rep=na_rep, float_format=float_format) if formatters is None: diff --git a/pandas/core/series.py b/pandas/core/series.py index c79cddcc3e53f..840e03e32a449 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -383,14 +383,20 @@ def _tidy_repr(self, max_vals=20): result = '%s\nName: %s, Length: %d' % (result, self.name, len(self)) return result - def to_string(self, buf=None, nanRep='NaN'): - the_repr = self._get_repr(nanRep=nanRep) + def to_string(self, buf=None, na_rep='NaN', nanRep=None): + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + + the_repr = self._get_repr(na_rep=na_rep) if buf is None: return the_repr else: print >> buf, the_repr - def _get_repr(self, name=False, nanRep='NaN'): + def _get_repr(self, name=False, na_rep='NaN'): vals = self.values index = self.index @@ -400,7 +406,7 @@ def _get_repr(self, name=False, nanRep='NaN'): def _format_float(k, v): if np.isnan(v): - v = nanRep + v = na_rep else: v = str(v) return '%s %s' % (str(k).ljust(padSpace), v) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5aa040d603978..294de46984fcf 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1626,7 +1626,7 @@ def test_to_csv_float32_nanrep(self): df[1] = np.nan pth = '__tmp__.csv' - df.to_csv(pth, nanRep=999) + df.to_csv(pth, na_rep=999) lines = open(pth).readlines() self.assert_(lines[1].split(',')[2] == '999') From 990026e99bc406e3a16e83603a08c2c7cc1a3115 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 19:33:40 -0400 Subject: [PATCH 107/161] DOC: pivot_table docs --- RELEASE.rst | 2 ++ TODO.rst | 4 +-- doc/source/io.rst | 3 +- doc/source/reshaping.rst | 61 ++++++++++++++++++++++++++++++++++------ 4 files changed, 58 insertions(+), 12 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8eab725b43a1f..a2c3334a1e8b1 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -52,6 +52,8 @@ feedback on the library. - Changed `buffer` argument name in `Series.to_string` to `buf` - `Series.to_string` and `DataFrame.to_string` now return strings by default instead of printing to sys.stdout + - Deprecated `nanRep` argument in various `to_string` and `to_csv` functions + in favor of `na_rep`. 
Will be removed in 0.6 (GH #275) - Series functions renamed (and thus deprecated) in 0.4 series have been removed: diff --git a/TODO.rst b/TODO.rst index 31cf38c754a22..0646b05388064 100644 --- a/TODO.rst +++ b/TODO.rst @@ -11,14 +11,14 @@ TODO TODO docs --------- -- read_csv / read_table +- DONE read_csv / read_table - auto-sniff delimiter - MultiIndex - generally more documentation - pivot_table -- Set mixed-type values with .ix +- DONE Set mixed-type values with .ix - get_dtype_counts / dtypes - save / load functions - combine_first diff --git a/doc/source/io.rst b/doc/source/io.rst index 5771dade8f7f4..1c169589e763a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -34,7 +34,8 @@ data into a DataFrame object. They can take a number of arguments: of inferring automatically "sniffing" the delimiter in some cases - ``header``: row number to use as the column names, and the start of the data. Defaults to 0 (first row); specify None if there is no header row. - - ``names``: List of column names to use if header is None. + - ``names``: List of column names to use. If passed, header will be + implicitly set to None. - ``skiprows``: A collection of numbers for rows in the file to skip. - ``index_col``: column number, or list of column numbers, to use as the ``index`` (row labels) of the resulting DataFrame. By default, it will number diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index c1388edc789e2..99eba88c6cc44 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -11,15 +11,9 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) -*************************** -Pivoting and reshaping data -*************************** - -.. note:: - - Since some of the functionality documented in this section is very new, the - user should keep an eye on any changes to the API or behavior which may - occur by the next release. +********************** +Reshaping fundamentals +********************** Reshaping by pivoting DataFrame objects --------------------------------------- @@ -195,3 +189,52 @@ some very expressive and fast data manipulations. df.stack().groupby(level=1).mean() df.mean().unstack(0) + + +********************************** +Pivot tables and cross-tabulations +********************************** + +The function `pandas.pivot_table` can be used to create spreadsheet-style pivot +tables. It takes a number of arguments + +- ``data``: A DataFrame object +- ``values``: column to aggregate +- ``xby``: list of columns to group by on the `x`-axis +- ``yby``: list of columns to group by on the `y`-axis +- ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` + +Consider a data set like this: + +.. ipython:: python + + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, + 'B' : ['A', 'B', 'C'] * 4, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D' : np.random.randn(12), + 'E' : np.random.randn(12)}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pivot_table(df, values='D', xby=['A', 'B'], yby=['C']) + pivot_table(df, values='D', xby=['B'], yby=['A', 'C'], aggfunc=np.sum) + +The result object is a DataFrame having potentially hierarchical indexes on the +rows and columns. If the ``values`` column name is not given, the pivot table +will include all of the data that can be aggregated in an additional level of +hierarchy in the columns: + +.. 
ipython:: python + + pivot_table(df, xby=['A', 'B'], yby=['C']) + +You can render a nice output of the table omitting the missing values by +calling ``to_string`` if you wish: + +.. ipython:: python + + table = pivot_table(df, xby=['A', 'B'], yby=['C']) + print table.to_string(na_rep='') From 6e418a97858b771fc58255f4307271abc969487b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 21:47:12 -0400 Subject: [PATCH 108/161] REF: rename delimiter to sep in DataFrame.from_csv and to_csv --- RELEASE.rst | 1 + pandas/core/frame.py | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index a2c3334a1e8b1..44e02e338377d 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -54,6 +54,7 @@ feedback on the library. instead of printing to sys.stdout - Deprecated `nanRep` argument in various `to_string` and `to_csv` functions in favor of `na_rep`. Will be removed in 0.6 (GH #275) + - Renamed `delimiter` to `sep` in `DataFrame.from_csv` for consistency - Series functions renamed (and thus deprecated) in 0.4 series have been removed: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f63540e7cca23..574d73476577b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -422,7 +422,7 @@ def to_records(self, index=True): return np.rec.fromarrays(arrays, names=names) @classmethod - def from_csv(cls, path, header=0, delimiter=',', index_col=0, + def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True): """ Read delimited file into DataFrame @@ -432,7 +432,8 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0, path : string header : int, default 0 Row to use at header (skip prior rows) - delimiter : string, default ',' + sep : string, default ',' + Field delimiter index_col : int or sequence, default 0 Column to use for index. If a sequence is given, a MultiIndex is used. @@ -447,7 +448,7 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0, y : DataFrame or DataFrame """ from pandas.io.parsers import read_table - return read_table(path, header=header, sep=delimiter, + return read_table(path, header=header, sep=sep, parse_dates=parse_dates, index_col=index_col) def to_sparse(self, fill_value=None, kind='block'): @@ -469,7 +470,7 @@ def to_sparse(self, fill_value=None, kind='block'): default_fill_value=fill_value) def to_csv(self, path, na_rep='', cols=None, header=True, - index=True, index_label=None, mode='w', delimiter=",", + index=True, index_label=None, mode='w', sep=",", nanRep=None): """ Write DataFrame to a comma-separated values (csv) file @@ -490,11 +491,11 @@ def to_csv(self, path, na_rep='', cols=None, header=True, `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. mode : Python write mode, default 'w' - delimiter : character, default "," + sep : character, default "," Field delimiter for the output file. 
""" f = open(path, mode) - csvout = csv.writer(f, lineterminator='\n', delimiter=delimiter) + csvout = csv.writer(f, lineterminator='\n', delimiter=sep) if nanRep is not None: # pragma: no cover import warnings From 32d65ddb78cf4bccfc3fff9471f058fe845db453 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 21:57:38 -0400 Subject: [PATCH 109/161] DOC: more docs loose ends --- TODO.rst | 31 ++++++++++++++------------- doc/source/basics.rst | 37 ++++++++++++++++++++++++++------- doc/source/dsintro.rst | 45 +++++++++++++++++++++++++++++++++++++--- doc/source/groupby.rst | 2 ++ doc/source/io.rst | 2 +- doc/source/merging.rst | 25 ++++++++++++++++++++-- doc/source/reshaping.rst | 2 +- 7 files changed, 114 insertions(+), 30 deletions(-) diff --git a/TODO.rst b/TODO.rst index 0646b05388064..e81f58197ccbe 100644 --- a/TODO.rst +++ b/TODO.rst @@ -15,25 +15,25 @@ TODO docs - auto-sniff delimiter - MultiIndex - generally more documentation - -- pivot_table - +- DONE pivot_table - DONE Set mixed-type values with .ix -- get_dtype_counts / dtypes -- save / load functions -- combine_first -- describe for Series -- DataFrame.to_string -- Index / MultiIndex names -- Unstack / stack by level name -- ignore_index in DataFrame.append +- DONE get_dtype_counts / dtypes +- DONE save / load functions +- DONE isnull/notnull as instance methods +- DONE DataFrame.to_string +- DONE IPython tab complete hook +- DONE ignore_index in DataFrame.append +- DONE describe for Series with dtype=object +- DONE as_index=False in groupby +- DONOTWANT is_monotonic +- DONE DataFrame.to_csv: different delimiters - Inner join on key - Multi-key joining -- as_index=False in groupby -- is_monotonic -- isnull/notnull as instance methods +- Index / MultiIndex names + +- combine_first +- Unstack / stack by level name - name attribute on Series -- DataFrame.to_csv: different delimiters? - groupby with level name - MultiIndex - get_level_values @@ -43,7 +43,6 @@ TODO docs - df[col_list] - Panel.rename_axis - & and | for intersection / union -- IPython tab complete hook Performance blog ---------------- diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 0b652e8eacf48..11c17b9d1878f 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -242,9 +242,9 @@ will exclude NAs on Series input by default: Summarizing data: describe ~~~~~~~~~~~~~~~~~~~~~~~~~~ -For floating point data, there is a convenient ``describe`` function which -computes a variety of summary statistics about a Series or the columns of a -DataFrame (excluding NAs of course): +There is a convenient ``describe`` function which computes a variety of summary +statistics about a Series or the columns of a DataFrame (excluding NAs of +course): .. ipython:: python @@ -255,6 +255,16 @@ DataFrame (excluding NAs of course): frame.ix[::2] = np.nan frame.describe() +For a non-numerical Series object, `describe` will give a simple summary of the +number of unique values and most frequently occurring values: + + +.. ipython:: python + + s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s.describe() + + Correlations between objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -657,15 +667,28 @@ alternately passing the ``dtype`` keyword argument to the object constructor. Pickling and serialization -------------------------- -All pandas objects are equipped with ``save`` and ``load`` methods which use -Python's ``cPickle`` module to save and load data structures to disk using the -pickle format. 
+All pandas objects are equipped with ``save`` methods which use Python's +``cPickle`` module to save data structures to disk using the pickle format. .. ipython:: python df df.save('foo.pickle') - DataFrame.load('foo.pickle') + +The ``load`` function in the ``pandas`` namespace can be used to load any +pickled pandas object (or any other pickled object) from file: + + +.. ipython:: python + + load('foo.pickle') + +There is also a ``save`` function which takes any object as its first argument: + +.. ipython:: python + + save(df, 'foo.pickle') + load('foo.pickle') .. ipython:: python :suppress: diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index eca3e2ccde4c6..15192f2f5eac1 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -439,12 +439,51 @@ R package): baseball = read_csv('data/baseball.csv') baseball -However, using ``to_string`` will display any DataFrame in tabular form, though -it won't always fit the console width: +However, using ``to_string`` will return a string representation of the +DataFrame in tabular form, though it won't always fit the console width: .. ipython:: python - baseball.ix[-20:, :12].to_string() + print baseball.ix[-20:, :12].to_string() + +DataFrame column types +~~~~~~~~~~~~~~~~~~~~~~ + +The four main types stored in pandas objects are float, int, boolean, and +object. A convenient ``dtypes`` attribute return a Series with the data type of +each column: + +.. ipython:: python + + baseball.dtypes + +The related method ``get_dtype_counts`` will return the number of columns of +each type: + +.. ipython:: python + + baseball.get_dtype_counts() + +DataFrame column attribute access and IPython completion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a DataFrame column label is a valid Python variable name, the column can be +accessed like attributes: + +.. ipython:: python + + df = DataFrame({'foo1' : np.random.randn(5), + 'foo2' : np.random.randn(5)}) + df + df.foo1 + +The columns are also connected to the `IPython `__ +completion mechanism so they can be tab-completed: + +.. code-block:: ipython + + In [5]: df.fo + df.foo1 df.foo2 .. _basics.panel: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index bc2e2c10e9419..b520a33572e2b 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -250,6 +250,8 @@ changed by using the ``as_index`` option: grouped = df.groupby(['A', 'B'], as_index=False) grouped.aggregate(np.sum) + df.groupby('A', as_index=False).sum() + Note that you could use the ``delevel`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: diff --git a/doc/source/io.rst b/doc/source/io.rst index 1c169589e763a..ff5896bdcff7a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -175,7 +175,7 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python :suppress: - df[:7].to_csv('tmp.sv', delimiter='|') + df[:7].to_csv('tmp.sv', sep='|') .. 
ipython:: python diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c404e53554d38..21ff4dfd3ca45 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -14,8 +14,8 @@ Merging / Joining data sets *************************** -Appending disjoint objects --------------------------- +Appending DataFrame objects +--------------------------- Series and DataFrame have an ``append`` method which will glue together objects each of whose ``index`` (Series labels or DataFrame rows) is mutually @@ -40,6 +40,27 @@ In the case of DataFrame, the indexes must be disjoint but the columns do not ne df2 df1.append(df2) +Appending record-array like DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For DataFrames which don't have a meaningful index, you may wish to append them +and ignore the fact that they may have overlapping indexes: + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(3, 4), columns=['A', 'B', 'C', 'D']) + + df1 + df2 + +To do this, use the ``ignore_index`` argument: + +.. ipython:: python + + df1.append(df2, ignore_index=True) + + Joining / merging DataFrames ---------------------------- diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 99eba88c6cc44..332424c4e4b05 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -195,7 +195,7 @@ some very expressive and fast data manipulations. Pivot tables and cross-tabulations ********************************** -The function `pandas.pivot_table` can be used to create spreadsheet-style pivot +The function ``pandas.pivot_table`` can be used to create spreadsheet-style pivot tables. It takes a number of arguments - ``data``: A DataFrame object From de98e242caa7c11e59618a4dbdafed03e8d23cf2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 22:12:19 -0400 Subject: [PATCH 110/161] ENH: specify dependency list for tseries Cython module, GH #271 --- Makefile | 1 - RELEASE.rst | 1 + setup.py | 18 +++++++++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index a4a700a81959e..a4861c1477d8e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,6 @@ clean: -rm -rf build dist tseries: pandas/src/tseries.pyx - touch pandas/src/tseries.pyx python setup.py build_ext --inplace sparse: pandas/src/sparse.pyx diff --git a/RELEASE.rst b/RELEASE.rst index 44e02e338377d..24d4768fbceb8 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -149,6 +149,7 @@ feedback on the library. 
the source directory (GH #250) - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type DataFrame objects + - Better handling of pyx file dependencies in Cython module build (GH #271) **Bug fixes** diff --git a/setup.py b/setup.py index 633be5ebc96fa..4a3059954a795 100755 --- a/setup.py +++ b/setup.py @@ -274,22 +274,30 @@ def run(self): cmdclass['build_ext'] = build_ext cmdclass['sdist'] = CheckSDist -tseries_depends = ['reindex', 'io', 'common', 'groupby' - 'skiplist', 'isnull', 'moments', 'operators'] - +tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments', + 'generated', 'parsing'] def srcpath(name=None, suffix='.pyx', subdir='src'): return pjoin('pandas', subdir, name+suffix) +if suffix == '.pyx': + tseries_depends = [srcpath(f, suffix='.pyx') + for f in tseries_depends] +else: + tseries_depends = None + +print tseries_depends + tseries_ext = Extension('pandas._tseries', + depends=tseries_depends, sources=[srcpath('tseries', suffix=suffix)], - # depends=[srcpath(f, suffix='.pyx') - # for f in tseries_depends], include_dirs=[np.get_include()]) + sparse_ext = Extension('pandas._sparse', sources=[srcpath('sparse', suffix=suffix)], include_dirs=[np.get_include()]) extensions = [tseries_ext, sparse_ext] + # if _have_setuptools: # setuptools_args["test_suite"] = "nose.collector" From 566304fa23f171813c2b9df7ee90e6adb3b917e2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 22:17:06 -0400 Subject: [PATCH 111/161] TST: DataFrame.append test, address concern in GH #276 --- pandas/tests/test_frame.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 294de46984fcf..bb99e55a7cb6d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1712,6 +1712,19 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) + def test_append_different_columns(self): + df = DataFrame({'bools' : np.random.randn(10) > 0, + 'ints' : np.random.randint(0, 10, 10), + 'floats' : np.random.randn(10), + 'strings' : ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assert_(isnull(appended['strings'][:5]).all()) + self.assert_(isnull(appended['bools'][5:]).all()) + def test_asfreq(self): offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) rule_monthly = self.tsframe.asfreq('EOM') From 1bcf68e58e27d76974a054fbe1f050853fb3f621 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 22:21:30 -0400 Subject: [PATCH 112/161] BUG: small API change in Series.clip arg order, enable np.clip to be used also, GH #272 --- RELEASE.rst | 3 +++ pandas/core/series.py | 5 ++++- pandas/tests/test_series.py | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 24d4768fbceb8..8ffdaa4478003 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -55,6 +55,9 @@ feedback on the library. - Deprecated `nanRep` argument in various `to_string` and `to_csv` functions in favor of `na_rep`. 
Will be removed in 0.6 (GH #275) - Renamed `delimiter` to `sep` in `DataFrame.from_csv` for consistency + - Changed order of `Series.clip` arguments to match those of `numpy.clip` and + added (unimplemented) `out` argument so `numpy.clip` can be called on a + Series (GH #272) - Series functions renamed (and thus deprecated) in 0.4 series have been removed: diff --git a/pandas/core/series.py b/pandas/core/series.py index 840e03e32a449..511fd79254662 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -967,7 +967,7 @@ def autocorr(self): """ return self.corr(self.shift(1)) - def clip(self, upper=None, lower=None): + def clip(self, lower=None, upper=None, out=None): """ Trim values at input threshold(s) @@ -980,6 +980,9 @@ def clip(self, upper=None, lower=None): ------- clipped : Series """ + if out is not None: # pragma: no cover + raise Exception('out argument is not supported yet') + result = self if lower is not None: result = result.clip_lower(lower) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 08c6826f8b872..c14d04d066b9a 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -843,6 +843,11 @@ def test_clip(self): self.assertEqual(self.ts.clip(lower=val).min(), val) self.assertEqual(self.ts.clip(upper=val).max(), val) + result = self.ts.clip(-0.5, 0.5) + expected = np.clip(self.ts, -0.5, 0.5) + assert_series_equal(result, expected) + self.assert_(isinstance(expected, Series)) + def test_valid(self): ts = self.ts.copy() ts[::2] = np.NaN From dd3cd7131d985c5c94eb18291592482656276b6e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 22:35:34 -0400 Subject: [PATCH 113/161] BUG: handle NAs in Series.value_counts and describe with dtype=object, GH #277 --- pandas/core/series.py | 6 +++--- pandas/tests/test_series.py | 14 +++++++++++++- pandas/util/testing.py | 4 +++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 511fd79254662..c804e4dfa34ba 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -604,7 +604,7 @@ def value_counts(self): """ Returns Series containing counts of unique values. The resulting Series will be in descending order so that the first element is the most - frequently-occurring element + frequently-occurring element. 
Excludes NA values Returns ------- @@ -612,7 +612,7 @@ def value_counts(self): """ from collections import defaultdict counter = defaultdict(lambda: 0) - for value in self.values: + for value in self.dropna().values: counter[value] += 1 return Series(counter).order(ascending=False) @@ -906,7 +906,7 @@ def describe(self): if self.dtype == object: names = ['count', 'unique', 'top', 'freq'] - objcounts = Counter(self) + objcounts = Counter(self.dropna().values) top, freq = objcounts.most_common(1)[0] data = [self.count(), len(objcounts), top, freq] diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index c14d04d066b9a..07ada8ae1fb37 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -555,7 +555,13 @@ def test_quantile(self): def test_describe(self): _ = self.series.describe() _ = self.ts.describe() - _ = self.objSeries.describe() + + def test_describe_objects(self): + s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) + result = s.describe() + expected = Series({'count' : 7, 'unique' : 4, + 'top' : 'a', 'freq' : 3}, index=result.index) + assert_series_equal(result, expected) def test_append(self): appendedSeries = self.series.append(self.ts) @@ -770,6 +776,12 @@ def test_value_counts(self): expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) assert_series_equal(hist, expected) + # handle NA's properly + s[5:7] = np.nan + hist = s.value_counts() + expected = s.dropna().value_counts() + assert_series_equal(hist, expected) + s = Series({}) hist = s.value_counts() expected = Series([]) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 92982e100b3aa..a4b914e437dd2 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,3 +1,5 @@ +from __future__ import division + # pylint: disable-msg=W0402 from datetime import datetime @@ -116,7 +118,7 @@ def assert_dict_equal(a, b, compare_keys=True): assert_almost_equal(a[k], b[k]) def assert_series_equal(left, right): - assert_almost_equal(left, right) + assert_almost_equal(left.values, right.values) assert(left.dtype == right.dtype) assert(left.index.equals(right.index)) From e9dec0d968a688d9d9d65f2a55659ee70f3fb33b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 22:41:04 -0400 Subject: [PATCH 114/161] DOC: fix up DataFrame.from_csv docs. make up mind about default arguments, GH #274 --- pandas/core/frame.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 574d73476577b..ba12e91b610fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -436,16 +436,19 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, Field delimiter index_col : int or sequence, default 0 Column to use for index. If a sequence is given, a MultiIndex - is used. + is used. Different default from read_table + parse_dates : boolean, default True + Parse dates. Different default from read_table Notes ----- - Will attempt to convert index to datetimes for time series - data. 
Use read_table for more options + Preferable to use read_table for most general purposes but from_csv + makes for an easy roundtrip to and from file, especially with a + DataFrame of time series data Returns ------- - y : DataFrame or DataFrame + y : DataFrame """ from pandas.io.parsers import read_table return read_table(path, header=header, sep=sep, @@ -469,9 +472,8 @@ def to_sparse(self, fill_value=None, kind='block'): default_kind=kind, default_fill_value=fill_value) - def to_csv(self, path, na_rep='', cols=None, header=True, - index=True, index_label=None, mode='w', sep=",", - nanRep=None): + def to_csv(self, path, sep=",", na_rep='', cols=None, header=True, + index=True, index_label=None, mode='w', nanRep=None): """ Write DataFrame to a comma-separated values (csv) file @@ -482,6 +484,7 @@ def to_csv(self, path, na_rep='', cols=None, header=True, nanRep : string, default '' Missing data rep'n cols : sequence, optional + Columns to write header : boolean, default True Write out column names index : boolean, default True From 62eb71c2a2ceddefd66eca865f760f0ff56ba136 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Oct 2011 23:16:14 -0400 Subject: [PATCH 115/161] DOC: document combine first, GH #161 --- doc/source/basics.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 11c17b9d1878f..ebfeabb2021fe 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -157,6 +157,29 @@ replace NaN with some other value using ``fillna`` if you wish). df + df2 df.add(df2, fill_value=0) +Combining overlapping data sets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A problem occasionally arising is the combination of two similar data sets +where values in one are preferred over the other. An example would be two data +series representing a particular economic indicator where one is considered to +be of "higher quality". However, the lower quality series might extend further +back in history or have more complete data coverage. As such, we would like to +combine two DataFrame objects where missing values in one DataFrame are +conditionally filled with like-labeled values from the other DataFrame. The +function implementing this operation is ``combine_first``, which we illustrate: + +.. ipython:: python + + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 + df2 + df1.combine_first(df2) + + .. 
_basics.stats: Descriptive statistics From c328c997a14254f7abdeb520aea19a51c541114a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 14:00:09 -0400 Subject: [PATCH 116/161] BLD: add tools submodule --- pandas/tools/tests/__init__.py | 1 + setup.py | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 pandas/tools/tests/__init__.py diff --git a/pandas/tools/tests/__init__.py b/pandas/tools/tests/__init__.py new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/pandas/tools/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/setup.py b/setup.py index 4a3059954a795..76a629025f7dd 100755 --- a/setup.py +++ b/setup.py @@ -312,6 +312,8 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): 'pandas.stats', 'pandas.util', 'pandas.tests', + 'pandas.tools', + 'pandas.tools.tests', 'pandas.io.tests', 'pandas.stats.tests', ], From 6ad6298bb5ef1b0b6846b5720b32a4fea9459bb5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 14:09:51 -0400 Subject: [PATCH 117/161] BLD: add __init__.py --- pandas/tools/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tools/__init__.py diff --git a/pandas/tools/__init__.py b/pandas/tools/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From d61d0149ca6e0d6d0d980972a6eb116154f00d9d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 13:59:31 -0400 Subject: [PATCH 118/161] REF: to_string refactor --- pandas/core/frame.py | 171 ++++++++++++++++++------------- pandas/tools/pivot.py | 13 ++- pandas/tools/tests/test_pivot.py | 8 +- 3 files changed, 115 insertions(+), 77 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba12e91b610fa..ea98ba72ea38b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -553,8 +553,8 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True, def to_string(self, buf=None, columns=None, colSpace=None, na_rep='NaN', formatters=None, float_format=None, - sparsify=True, nanRep=None): - from pandas.core.common import _format, adjoin + sparsify=True, nanRep=None, index_names=True): + if nanRep is not None: # pragma: no cover import warnings @@ -562,77 +562,16 @@ def to_string(self, buf=None, columns=None, colSpace=None, FutureWarning) na_rep = nanRep - return_ = False - if buf is None: # pragma: no cover - buf = StringIO() - return_ = True - - if colSpace is None: - def _myformat(v): - return _format(v, na_rep=na_rep, - float_format=float_format) - else: - def _myformat(v): - return _pfixed(v, colSpace, na_rep=na_rep, - float_format=float_format) - - if formatters is None: - formatters = {} - - def _format_col(col): - formatter = formatters.get(col, _myformat) - return [formatter(x) for x in self[col]] - - if columns is None: - columns = self.columns - else: - columns = [c for c in columns if c in self] - - to_write = [] - - if len(columns) == 0 or len(self.index) == 0: - to_write.append('Empty %s' % type(self).__name__) - to_write.append(repr(self.index)) - else: - (str_index, - str_columns) = self._get_formatted_labels(sparsify=sparsify) - stringified = [str_columns[i] + _format_col(c) - for i, c in enumerate(columns)] - to_write.append(adjoin(1, str_index, *stringified)) - - for s in to_write: - if isinstance(s, unicode): - to_write = [unicode(s) for s in to_write] - break - - for s in to_write: - print >> buf, s - - if return_: - return buf.getvalue() - - def _get_formatted_labels(self, sparsify=True): - from pandas.core.index import _sparsify - - if isinstance(self.index, 
MultiIndex): - fmt_index = self.index.format(sparsify=sparsify) - else: - fmt_index = self.index.format() - - if isinstance(self.columns, MultiIndex): - fmt_columns = self.columns.format(sparsify=False, adjoin=False) - str_columns = zip(*[[' %s' % y for y in x] - for x in zip(*fmt_columns)]) - if sparsify: - str_columns = _sparsify(str_columns) - str_columns = [list(x) for x in zip(*str_columns)] - str_index = [''] * self.columns.nlevels + fmt_index - else: - str_columns = [[' %s' % x] for x in self.columns.format()] - str_index = [''] + fmt_index + formatter = _DataFrameFormatter(buf=buf, columns=columns, + col_space=colSpace, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names) - return str_index, str_columns + if buf is None: + return formatter.get_result() def info(self, verbose=True, buf=None): """ @@ -2962,6 +2901,96 @@ def combineMult(self, other): """ return self.mul(other, fill_value=1.) + +class _DataFrameFormatter(object): + + def __init__(self, frame, buf=None, columns=None, col_space=None, + na_rep='NaN', formatters=None, float_format=None, + sparsify=True, index_names=True): + + self.frame = frame + self.buf = buf if buf is None else StringIO() + self.index_names = index_names + + if columns is None: + self.columns = frame.columns + else: + self.columns = [c for c in columns if c in frame] + + def get_result(self): + pass + + def _write_to_buffer(self): + from pandas.core.common import adjoin + + to_write = [] + + if len(columns) == 0 or len(self.index) == 0: + to_write.append('Empty %s' % type(self).__name__) + to_write.append(repr(self.index)) + else: + (str_index, + str_columns) = self._get_formatted_labels(sparsify=sparsify) + stringified = [str_columns[i] + _format_col(c) + for i, c in enumerate(columns)] + to_write.append(adjoin(1, str_index, *stringified)) + + for s in to_write: + if isinstance(s, unicode): + to_write = [unicode(s) for s in to_write] + break + + for s in to_write: + print >> buf, s + + def _get_column_formatter(self): + from pandas.core.common import _format + + na_rep = self.na_rep + float_format = self.float_format + col_space = self.col_space + + if col_space is None: + def _myformat(v): + return _format(v, na_rep=na_rep, + float_format=float_format) + else: + def _myformat(v): + return _pfixed(v, col_space, na_rep=na_rep, + float_format=float_format) + + formatters = {} if self.formatters is None else self.formatters + + def _format_col(col): + formatter = formatters.get(col, _myformat) + return [formatter(x) for x in col] + + return _format_col + + def _get_formatted_labels(self, sparsify=True): + from pandas.core.index import _sparsify + + if isinstance(self.index, MultiIndex): + fmt_index = self.index.format(sparsify=sparsify) + else: + fmt_index = self.index.format() + + if isinstance(self.columns, MultiIndex): + fmt_columns = self.columns.format(sparsify=False, adjoin=False) + str_columns = zip(*[[' %s' % y for y in x] + for x in zip(*fmt_columns)]) + if sparsify: + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + str_index = [''] * self.columns.nlevels + fmt_index + else: + str_columns = [[' %s' % x] for x in self.columns.format()] + str_index = [''] + fmt_index + + return str_index, str_columns + + def group_agg(values, bounds, f): """ R-style aggregator diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 132639d8672a4..858d02d5a5f03 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -47,8 +47,8 @@ def 
pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean, ------- table : DataFrame """ - xby = [] if xby is None else list(xby) - yby = [] if yby is None else list(yby) + xby = _convert_by(xby) + yby = _convert_by(yby) keys = xby + yby grouped = data.groupby(keys) @@ -67,6 +67,15 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean, return table +def _convert_by(by): + if by is None: + by = [] + elif np.isscalar(by): + by = [by] + else: + by = list(by) + return by + def pprint_table(table): pass diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 2b8b473dc40f6..ad34b3f697303 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -23,7 +23,7 @@ def setUp(self): def test_pivot_table(self): xby = ['A', 'B'] - yby= ['C'] + yby= 'C' table = pivot_table(self.data, values='D', xby=xby, yby=yby) if len(xby) > 1: @@ -36,14 +36,14 @@ def test_pivot_table(self): else: self.assertEqual(table.columns.name, yby[0]) - expected = self.data.groupby(xby + yby)['D'].agg(np.mean).unstack() + expected = self.data.groupby(xby + [yby])['D'].agg(np.mean).unstack() assert_frame_equal(table, expected) def test_pivot_table_multiple(self): xby = ['A', 'B'] - yby= ['C'] + yby= 'C' table = pivot_table(self.data, xby=xby, yby=yby) - expected = self.data.groupby(xby + yby).agg(np.mean).unstack() + expected = self.data.groupby(xby + [yby]).agg(np.mean).unstack() assert_frame_equal(table, expected) if __name__ == '__main__': From 0102dbedd925ccdc20fa40228ea3340cedadacb5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 15:32:46 -0400 Subject: [PATCH 119/161] ENH: DataFrame.to_string will print index names --- pandas/core/frame.py | 132 +++++++++++++++++++++++++++++-------------- pandas/core/index.py | 33 ++++++++--- 2 files changed, 115 insertions(+), 50 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea98ba72ea38b..ecbbcbbad39b7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -20,7 +20,7 @@ from numpy import nan import numpy as np -from pandas.core.common import (isnull, notnull, PandasError, +from pandas.core.common import (isnull, notnull, PandasError, adjoin, _try_sort, _pfixed, _default_index, _infer_dtype, _stringify, _maybe_upcast) from pandas.core.daterange import DateRange @@ -563,7 +563,7 @@ def to_string(self, buf=None, columns=None, colSpace=None, na_rep = nanRep - formatter = _DataFrameFormatter(buf=buf, columns=columns, + formatter = _DataFrameFormatter(self, buf=buf, columns=columns, col_space=colSpace, na_rep=na_rep, formatters=formatters, float_format=float_format, @@ -571,7 +571,7 @@ def to_string(self, buf=None, columns=None, colSpace=None, index_names=index_names) if buf is None: - return formatter.get_result() + return formatter.buf.getvalue() def info(self, verbose=True, buf=None): """ @@ -2903,36 +2903,43 @@ def combineMult(self, other): class _DataFrameFormatter(object): - + """ + Render a console-friendly tabular output of a DataFrame + """ def __init__(self, frame, buf=None, columns=None, col_space=None, na_rep='NaN', formatters=None, float_format=None, sparsify=True, index_names=True): self.frame = frame - self.buf = buf if buf is None else StringIO() - self.index_names = index_names + self.buf = buf if buf is not None else StringIO() + self.show_index_names = index_names + self.sparsify = sparsify + self.float_format = float_format + self.formatters = formatters + self.na_rep = na_rep + self.col_space = col_space + self.column_filter = 
frame.columns if columns is None else set(columns) - if columns is None: - self.columns = frame.columns - else: - self.columns = [c for c in columns if c in frame] - - def get_result(self): - pass + self._write_to_buffer() def _write_to_buffer(self): - from pandas.core.common import adjoin + frame = self.frame + format_col = self._get_column_formatter() to_write = [] - if len(columns) == 0 or len(self.index) == 0: + if len(frame.columns) == 0 or len(frame.index) == 0: to_write.append('Empty %s' % type(self).__name__) - to_write.append(repr(self.index)) + to_write.append(repr(frame.index)) else: - (str_index, - str_columns) = self._get_formatted_labels(sparsify=sparsify) - stringified = [str_columns[i] + _format_col(c) - for i, c in enumerate(columns)] + # may include levels names also + str_index = self._get_formatted_index() + str_columns = self._get_formatted_column_labels() + + stringified = [str_columns[i] + format_col(c) + for i, c in enumerate(frame.columns) + if c in self.column_filter] + to_write.append(adjoin(1, str_index, *stringified)) for s in to_write: @@ -2940,56 +2947,99 @@ def _write_to_buffer(self): to_write = [unicode(s) for s in to_write] break - for s in to_write: - print >> buf, s + self.buf.writelines(to_write) def _get_column_formatter(self): from pandas.core.common import _format - na_rep = self.na_rep - float_format = self.float_format col_space = self.col_space if col_space is None: def _myformat(v): - return _format(v, na_rep=na_rep, - float_format=float_format) + return _format(v, na_rep=self.na_rep, + float_format=self.float_format) else: def _myformat(v): - return _pfixed(v, col_space, na_rep=na_rep, - float_format=float_format) + return _pfixed(v, col_space, na_rep=self.na_rep, + float_format=self.float_format) formatters = {} if self.formatters is None else self.formatters def _format_col(col): formatter = formatters.get(col, _myformat) - return [formatter(x) for x in col] + return [formatter(x) for x in self.frame[col]] return _format_col - def _get_formatted_labels(self, sparsify=True): + def _get_formatted_column_labels(self): from pandas.core.index import _sparsify - if isinstance(self.index, MultiIndex): - fmt_index = self.index.format(sparsify=sparsify) - else: - fmt_index = self.index.format() + columns = self.frame.columns - if isinstance(self.columns, MultiIndex): - fmt_columns = self.columns.format(sparsify=False, adjoin=False) + if isinstance(columns, MultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) str_columns = zip(*[[' %s' % y for y in x] for x in zip(*fmt_columns)]) - if sparsify: + if self.sparsify: str_columns = _sparsify(str_columns) str_columns = [list(x) for x in zip(*str_columns)] - str_index = [''] * self.columns.nlevels + fmt_index else: - str_columns = [[' %s' % x] for x in self.columns.format()] - str_index = [''] + fmt_index + str_columns = [[' %s' % x] for x in columns.format()] + + if self.show_index_names and self.has_index_names: + for x in str_columns: + x.append('') + + return str_columns + + @property + def has_index_names(self): + return _has_names(self.frame.index) + + @property + def has_column_names(self): + return _has_names(self.frame.columns) + + def _get_formatted_index(self): + index = self.frame.index + columns = self.frame.columns + + show_index_names = self.show_index_names and self.has_index_names + show_col_names = self.show_index_names and self.has_column_names + + if isinstance(index, MultiIndex): + fmt_index = index.format(sparsify=self.sparsify, adjoin=False, + 
names=show_index_names) + else: + fmt_index = [index.format(name=show_index_names)] + + # empty space for columns + padding = [''] * columns.nlevels + fmt_index = [padding + list(rows) for rows in fmt_index] + + if show_col_names: + namecol = self._get_column_name_list() + namecol = namecol + [''] * (len(fmt_index[0]) - len(namecol)) + fmt_index.append(namecol) - return str_index, str_columns + return adjoin(1, *fmt_index).split('\n') + def _get_column_name_list(self): + names = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend('' if name is None else name + for name in columns.names) + else: + names.append('' if columns.name is None else columns.name) + return names + +def _has_names(index): + if isinstance(index, MultiIndex): + return any([x is not None for x in index.names]) + else: + return index.name is not None def group_agg(values, bounds, f): """ diff --git a/pandas/core/index.py b/pandas/core/index.py index eb170c8a9d11c..a4b28ecf2d33e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -63,6 +63,10 @@ def __array_finalize__(self, obj): def dtype(self): return self.values.dtype + @property + def nlevels(self): + return 1 + def summary(self): if len(self) > 0: index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) @@ -177,20 +181,26 @@ def take(self, *args, **kwargs): taken = self.view(np.ndarray).take(*args, **kwargs) return Index(taken, name=self.name) - def format(self, vertical=False): + def format(self, name=False): """ Render a string representation of the Index """ + result = [] + + if name: + result.append(self.name if self.name is not None else '') + if self.is_all_dates(): - to_join = [] zero_time = time(0, 0) for dt in self: if dt.time() != zero_time or dt.tzinfo is not None: return ['%s' % x for x in self] - to_join.append(dt.strftime("%Y-%m-%d")) - return to_join + result.append(dt.strftime("%Y-%m-%d")) + return result - return [_stringify(x) for x in self] + result.extend(_stringify(x) for x in self) + + return result def equals(self, other): """ @@ -917,16 +927,21 @@ def __contains__(self, key): except Exception: return False - def format(self, space=2, sparsify=True, vertical=False, adjoin=True): + def format(self, space=2, sparsify=True, adjoin=True, names=False): if len(self) == 0: return [] stringified_levels = [lev.format() for lev in self.levels] result_levels = [] - for lab, lev in zip(self.labels, stringified_levels): - taken = np.array(lev, dtype=object).take(lab) - result_levels.append(taken) + for lab, lev, name in zip(self.labels, stringified_levels, self.names): + level = [] + + if names: + level.append(name if name is not None else '') + + level.extend(np.array(lev, dtype=object).take(lab)) + result_levels.append(level) if sparsify: result_levels = _sparsify(result_levels) From 242f9195c1f9eb9903790691584492f96a8504fe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 15:58:11 -0400 Subject: [PATCH 120/161] ENH: change display to be tighter, less wasted space --- pandas/core/frame.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ecbbcbbad39b7..183ba6d89a5b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3014,16 +3014,15 @@ def _get_formatted_index(self): else: fmt_index = [index.format(name=show_index_names)] - # empty space for columns - padding = [''] * columns.nlevels - fmt_index = [padding + list(rows) for rows in fmt_index] + adjoined = adjoin(1, *fmt_index).split('\n') + # 
empty space for columns if show_col_names: - namecol = self._get_column_name_list() - namecol = namecol + [''] * (len(fmt_index[0]) - len(namecol)) - fmt_index.append(namecol) + col_header = [' %s' % x for x in self._get_column_name_list()] + else: + col_header = [''] * columns.nlevels - return adjoin(1, *fmt_index).split('\n') + return col_header + adjoined def _get_column_name_list(self): names = [] From f85a9273f3bb3e7396008c74b949a74383d77629 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 22 Oct 2011 22:09:26 -0400 Subject: [PATCH 121/161] DOC: some more docs, getting closer to complete for 0.5.0 --- RELEASE.rst | 1 + TODO.rst | 14 +++++++------- doc/source/groupby.rst | 9 ++++++++- doc/source/indexing.rst | 34 ++++++++++++++++++++++++++-------- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8ffdaa4478003..e3db0d17189b0 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -127,6 +127,7 @@ feedback on the library. #261) - Added `pivot_table` convenience function to pandas namespace (GH #234) - Implemented `Panel.rename_axis` function (GH #243) + - DataFrame will show index level names in console output **Improvements to existing features** diff --git a/TODO.rst b/TODO.rst index e81f58197ccbe..90a59e3749e17 100644 --- a/TODO.rst +++ b/TODO.rst @@ -27,16 +27,16 @@ TODO docs - DONE as_index=False in groupby - DONOTWANT is_monotonic - DONE DataFrame.to_csv: different delimiters -- Inner join on key -- Multi-key joining -- Index / MultiIndex names +- DONE combine_first +- DONE groupby with level name +- DONE MultiIndex get_level_values -- combine_first +- Index / MultiIndex names - Unstack / stack by level name - name attribute on Series -- groupby with level name -- MultiIndex - - get_level_values + +- Inner join on key +- Multi-key joining - Update to reflect Python 3 support in intro - align functions diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index b520a33572e2b..57aafef866bed 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -159,7 +159,7 @@ natural to group by one of the levels of the hierarchy. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) tuples - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) s = Series(randn(8), index=index) .. ipython:: python @@ -168,6 +168,13 @@ natural to group by one of the levels of the hierarchy. grouped = s.groupby(level=0) grouped.sum() +If the MultiIndex has names specified, these can be passed instead of the level +number: + +.. ipython:: python + + s.groupby(level='second').sum() + More on the ``sum`` function and aggregation later. Grouping with multiple levels (as opposed to a single level) is not yet supported, though implementing it is not difficult. diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 905f004e955f1..e51dae00bdc37 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -346,10 +346,18 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. A ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) tuples - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) s = Series(randn(8), index=index) s +All of the ``MultiIndex`` constructors accept a ``names`` argument which stores +string names for the levels themselves. If no names are provided, some +arbitrary ones will be assigned: + +.. 
ipython:: python + + index.names + This index can back any axis of a pandas object, and the number of **levels** of the index is up to you: @@ -376,17 +384,17 @@ can find yourself working with hierarchically-indexed data without creating a ``MultiIndex`` explicitly yourself. However, when loading data from a file, you may wish to generate your own ``MultiIndex`` when preparing the data set. -Level names -~~~~~~~~~~~ +Reconstructing the level labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -All of the ``MultiIndex`` constructors accept a ``names`` argument which stores -string names for the levels themselves. This will get increasingly integrated -in to groupby and reshaping routines. If no names are provided, some arbitrary -ones will be assigned: +The method ``get_level_values`` will return a vector of the labels for each +location at a particular level: .. ipython:: python - index.names + index.get_level_values(0) + index.get_level_values(1) + Basic indexing on axis with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -576,6 +584,16 @@ To do this, use the ``swaplevels`` function: df df.swaplevels(0, 1) +Index methods +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Indexing internal details ------------------------- From 0fe0425b21777ed6849eaf6c58b40cdbcadcc0dc Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Sat, 22 Oct 2011 23:49:11 +0100 Subject: [PATCH 122/161] Update install docs now that pandas works on Python 3. --- doc/source/install.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index cd1c814c4d8c2..444d8d5c3cfd2 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -16,9 +16,9 @@ compiler (MinGW or Visual Studio) installed. `How-to install MinGW on Windows Python version support ~~~~~~~~~~~~~~~~~~~~~~ -Officially Python 2.5 to 2.7. I will aim for Python 3.x support in the next -release. Python 2.4 support is being phased out since the userbase has shrunk -significantly. Continuing Python 2.4 support will require either monetary +Officially Python 2.5 to 2.7 and Python 3.1+, although Python 3 support is less +well tested. Python 2.4 support is being phased out since the userbase has +shrunk significantly. Continuing Python 2.4 support will require either monetary development support or someone contributing to the project to maintain compatibility. 
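As a quick, self-contained illustration of the level-name conveniences documented
in PATCH 121 above (grouping by a named ``MultiIndex`` level, and recovering flat
label vectors with ``get_level_values``), here is a minimal sketch; the index
contents and level names are invented for the example, and the calls assume the
API exactly as the docs above describe it::

    import numpy as np
    from pandas import Series, MultiIndex

    # a small two-level index with named levels, mirroring the groupby.rst example
    index = MultiIndex.from_tuples([('bar', 'one'), ('bar', 'two'),
                                    ('baz', 'one'), ('baz', 'two')],
                                   names=['first', 'second'])
    s = Series(np.random.randn(4), index=index)

    # group by level name rather than level number (equivalent to level=1)
    print(s.groupby(level='second').sum())

    # recover the flat label vector at each level of the hierarchy
    print(index.get_level_values(0))
    print(index.get_level_values(1))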
From 1078fc372e8e382baa4b78bfa3a34f7cd932af20 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 13:12:45 -0400 Subject: [PATCH 123/161] ENH: print index name in Series.__repr__ --- pandas/core/series.py | 11 ++++++++--- setup.py | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c804e4dfa34ba..2763e5216bcc3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -369,11 +369,16 @@ def __setslice__(self, i, j, value): def __repr__(self): """Clean string representation of a Series""" if len(self.index) > 500: - return self._tidy_repr(30) + result = self._tidy_repr(30) elif len(self.index) > 0: - return self._get_repr(name=True) + result = self._get_repr(name=True) else: - return '%s' % ndarray.__repr__(self) + result = '%s' % ndarray.__repr__(self) + + if self.index.name is not None: + result = '%s\n%s' % (self.index.name, result) + + return result def _tidy_repr(self, max_vals=20): num = max_vals // 2 diff --git a/setup.py b/setup.py index 76a629025f7dd..bf997d8c8ecac 100755 --- a/setup.py +++ b/setup.py @@ -308,6 +308,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): packages=['pandas', 'pandas.core', 'pandas.io', + 'pandas.rpy', 'pandas.sandbox', 'pandas.stats', 'pandas.util', From f838ff9734e21d16a967860f4fda5c6af3cc57f9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 13:56:45 -0400 Subject: [PATCH 124/161] BUG: can pass level name to DataFrame.stack --- RELEASE.rst | 1 + TODO.rst | 6 +++--- pandas/core/frame.py | 9 +++++++-- pandas/core/index.py | 6 ++++++ pandas/core/reshape.py | 2 ++ pandas/tests/test_multilevel.py | 12 +++++++++--- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index e3db0d17189b0..be988e82158fa 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -188,6 +188,7 @@ feedback on the library. - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream issue GH #262 - Can pass list of tuples to `Series` (GH #270) + - Can pass level name to `DataFrame.stack` Thanks ------ diff --git a/TODO.rst b/TODO.rst index 90a59e3749e17..4b0d05632aed2 100644 --- a/TODO.rst +++ b/TODO.rst @@ -30,19 +30,19 @@ TODO docs - DONE combine_first - DONE groupby with level name - DONE MultiIndex get_level_values +- DONE & and | for intersection / union +- DONE Update to reflect Python 3 support in intro +- DONE Index / MultiIndex names -- Index / MultiIndex names - Unstack / stack by level name - name attribute on Series - Inner join on key - Multi-key joining -- Update to reflect Python 3 support in intro - align functions - df[col_list] - Panel.rename_axis -- & and | for intersection / union Performance blog ---------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 183ba6d89a5b3..5532d2a91812e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1739,6 +1739,11 @@ def stack(self, level=-1, dropna=True): Convert DataFrame to Series with multi-level Index. 
Columns become the second level of the resulting hierarchical index + Parameters + ---------- + level : int or string, default last level + Level to stack, can pass level name + Returns ------- stacked : Series @@ -1752,8 +1757,8 @@ def unstack(self, level=-1): Parameters ---------- - level : int, default last level - Level to unstack + level : int or string, default last level + Level to unstack, can pass level name Examples -------- diff --git a/pandas/core/index.py b/pandas/core/index.py index a4b28ecf2d33e..ba46d4ddda129 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -97,6 +97,12 @@ def indexMap(self): raise Exception('Index cannot contain duplicate values!') return self._indexMap + def _get_level_number(self, level): + if not isinstance(level, int): + assert(level == self.name) + level = 0 + return level + def _verify_integrity(self): if self._indexMap is None: try: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index ebaa82d061a35..a746f1286781a 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -287,6 +287,8 @@ def stack(frame, level=-1, dropna=True): stacked : Series """ N, K = frame.shape + level = frame.columns._get_level_number(level) + if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level=level, dropna=True) elif isinstance(frame.index, MultiIndex): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index ed15d49e0c2d5..53f1d286e962d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -5,7 +5,7 @@ from numpy.random import randn import numpy as np -from pandas.core.index import MultiIndex +from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull from pandas.util.testing import (assert_almost_equal, @@ -23,7 +23,7 @@ def setUp(self): [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.frame = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + columns=Index(['A', 'B', 'C'], name='exp')) self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], @@ -376,7 +376,7 @@ def test_unstack_bug(self): def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() self.assertEquals(unstacked.index.name, 'first') - self.assertEquals(unstacked.columns.names, [None, 'second']) + self.assertEquals(unstacked.columns.names, ['exp', 'second']) restacked = unstacked.stack() self.assertEquals(restacked.index.names, self.frame.index.names) @@ -386,6 +386,12 @@ def test_unstack_level_name(self): expected = self.frame.unstack(level=1) assert_frame_equal(result, expected) + def test_stack_level_name(self): + unstacked = self.frame.unstack('second') + result = unstacked.stack('exp') + expected = self.frame.unstack().stack(0) + assert_frame_equal(result, expected) + def test_groupby_transform(self): s = self.frame['A'] grouper = s.index.get_level_values(0) From 8ec1c977c07f97dd39dcf361c9fa6ad7089443db Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 14:13:26 -0400 Subject: [PATCH 125/161] DOC: more miscellaneous docs about new 0.5 features --- RELEASE.rst | 1 + TODO.rst | 6 ++-- doc/source/dsintro.rst | 14 ++++++++ doc/source/indexing.rst | 73 ++++++++++++++++++++++++++++++++-------- doc/source/reshaping.rst | 22 ++++++++---- pandas/core/common.py | 1 + 6 files changed, 92 insertions(+), 25 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index be988e82158fa..a10daa819e8d0 100644 --- a/RELEASE.rst +++ 
b/RELEASE.rst @@ -197,6 +197,7 @@ Thanks - Daniel Fortunov - Aman Thakral - Luca Beltrame +- Wouter Overmeire pandas 0.4.3 ============ diff --git a/TODO.rst b/TODO.rst index 4b0d05632aed2..015c41edb8254 100644 --- a/TODO.rst +++ b/TODO.rst @@ -33,13 +33,11 @@ TODO docs - DONE & and | for intersection / union - DONE Update to reflect Python 3 support in intro - DONE Index / MultiIndex names - -- Unstack / stack by level name -- name attribute on Series +- DONE Unstack / stack by level name +- DONE name attribute on Series - Inner join on key - Multi-key joining - - align functions - df[col_list] - Panel.rename_axis diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 15192f2f5eac1..c0e5cf073d40e 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -181,6 +181,20 @@ tools for working with labeled data. of course have the option of dropping labels with missing data via the **dropna** function. +Name attribute +~~~~~~~~~~~~~~ + +Series can also have a ``name`` attribute: + +.. ipython:: python + + s = Series(np.random.randn(5), name='something') + s + s.name + +The Series ``name`` will be assigned automatically in many cases, in particular +when taking 1D slices of DataFrame as you will see below. + .. _basics.dataframe: DataFrame diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e51dae00bdc37..214c74a740a6d 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -302,6 +302,58 @@ values, though setting arbitrary vectors is not yet supported: df2.ix[3] = np.nan df2 +.. _indexing.class: + +Index objects +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create one directly +is to pass a list or other sequence to ``Index``: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(range(5), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + .. _indexing.hierarchical: Hierarchical indexing (MultiIndex) @@ -558,14 +610,15 @@ attribute. These will get automatically assigned in various places where Some gory internal details ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Internally, the ``MultiIndex`` consists of two things: the **levels** and the -**labels**: +Internally, the ``MultiIndex`` consists of a few things: the **levels**, the +integer **labels**, and the level **names**: .. ipython:: python index index.levels index.labels + index.names You can probably guess that the labels determine which unique element is identified with that location at each layer of the index. 
It's important to @@ -584,16 +637,6 @@ To do this, use the ``swaplevels`` function: df df.swaplevels(0, 1) -Index methods -------------- - -The pandas Index class and its subclasses can be viewed as implementing an -*ordered set* in addition to providing the support infrastructure necessary for -lookups, data alignment, and reindexing. - -Set operations on Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Indexing internal details ------------------------- @@ -603,13 +646,15 @@ Indexing internal details codebase. And the source code is still the best place to look at the specifics of how things are implemented. -In pandas there are 3 distinct objects which can serve as valid containers for -the axis labels: +In pandas there are a few objects implemented which can serve as valid +containers for the axis labels: - ``Index``: the generic "ordered set" object, an ndarray of object dtype assuming nothing about its contents. The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do :math:`O(1)` lookups. + - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer + data, such as time stamps - ``MultiIndex``: the standard hierarchical index object - ``DateRange``: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 332424c4e4b05..cfa6bdf2267c3 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -11,9 +11,9 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) -********************** -Reshaping fundamentals -********************** +************************** +Reshaping and Pivot Tables +************************** Reshaping by pivoting DataFrame objects --------------------------------------- @@ -113,7 +113,7 @@ take a prior example data set from the hierarchical indexing section: 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]) - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) df2 = df[:4] df2 @@ -142,6 +142,13 @@ unstacks the **last level**: stacked.unstack(1) stacked.unstack(0) +If the indexes have names, you can use the level names instead of specifying +the level numbers: + +.. ipython:: python + + stacked.unstack('second') + These functions are very intelligent about handling missing data and do not expect each subgroup within the hierarchical index to have the same set of labels. They also can handle the index being unsorted (but you can make it @@ -150,7 +157,8 @@ sorted by calling ``sortlevel``, of course). Here is a more complex example: .. ipython:: python columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')]) + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) df = DataFrame(randn(8, 4), index=index, columns=columns) df2 = df.ix[[0, 1, 2, 4, 5, 7]] df2 @@ -160,8 +168,8 @@ which level in the columns to stack: .. 
ipython:: python - df2.stack(1) - df2.stack(0) + df2.stack('exp') + df2.stack('animal') Unstacking when the columns are a ``MultiIndex`` is also careful about doing the right thing: diff --git a/pandas/core/common.py b/pandas/core/common.py index fcbb2abf27838..01d8073c88e14 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -16,6 +16,7 @@ # XXX: HACK for NumPy 1.5.1 to suppress warnings try: np.seterr(all='ignore') + np.set_printoptions(suppress=True) except Exception: # pragma: no cover pass From 44a15a67eb1c36b371b9a9b37828fbdec2e529a9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 17:43:25 -0400 Subject: [PATCH 126/161] DOC: more docs holes on joining, etc. --- TODO.rst | 4 ++-- doc/source/merging.rst | 43 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/TODO.rst b/TODO.rst index 015c41edb8254..5a9723b853fa2 100644 --- a/TODO.rst +++ b/TODO.rst @@ -35,9 +35,9 @@ TODO docs - DONE Index / MultiIndex names - DONE Unstack / stack by level name - DONE name attribute on Series +- DONE Multi-key joining +- DONE Inner join on key -- Inner join on key -- Multi-key joining - align functions - df[col_list] - Panel.rename_axis diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 21ff4dfd3ca45..a5b639f3d18f2 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -89,8 +89,9 @@ Joining on a key ~~~~~~~~~~~~~~~~ ``join`` takes an optional ``on`` argument which should be a column name in the -calling DataFrame which will be used to "align" the passed DataFrame. This is -best illustrated by example: +calling DataFrame which will be used to "align" the passed DataFrame. The +joining currently aligns the calling DataFrame's column (or columns) on the +passed DataFrame's index. This is best illustrated by example: .. ipython:: python @@ -101,6 +102,44 @@ best illustrated by example: to_join df.join(to_join, on='key') +To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + data + to_join + + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2']) + +This is by default a "many-to-one" or "VLOOKUP"-style left join operation. An +inner join is also supported: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2'], how='inner') + +This drops any rows where there was no match. 
+ Merging ordered records ~~~~~~~~~~~~~~~~~~~~~~~ From 5a9dca049865dfb39a74b7c8eefc0a8c46041f5f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 21:40:47 -0400 Subject: [PATCH 127/161] DOC: mostly finished with doc updates --- TODO.rst | 7 +- doc/source/api.rst | 161 ++++++++++++++++++++++------------------ doc/source/basics.rst | 35 ++++++++- doc/source/indexing.rst | 11 +++ 4 files changed, 137 insertions(+), 77 deletions(-) diff --git a/TODO.rst b/TODO.rst index 5a9723b853fa2..be67694659e98 100644 --- a/TODO.rst +++ b/TODO.rst @@ -37,10 +37,9 @@ TODO docs - DONE name attribute on Series - DONE Multi-key joining - DONE Inner join on key - -- align functions -- df[col_list] -- Panel.rename_axis +- DONE align functions +- DONE df[col_list] +- DONE Panel.rename_axis Performance blog ---------------- diff --git a/doc/source/api.rst b/doc/source/api.rst index 8760df4608e38..610afb99141b7 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -5,6 +5,88 @@ API Reference ************* +.. _api.functions: + +General functions +----------------- + +Data manipulations +~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.tools.pivot + +.. autosummary:: + :toctree: generated/ + + pivot_table + +Pickling +~~~~~~~~ + +.. currentmodule:: pandas.core.common + +.. autosummary:: + :toctree: generated/ + + load + save + +File IO +~~~~~~~ + +.. currentmodule:: pandas.io.parsers + +.. autosummary:: + :toctree: generated/ + + read_table + read_csv + ExcelFile.parse + +HDFStore: PyTables (HDF5) +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.io.pytables + +.. autosummary:: + :toctree: generated/ + + HDFStore.put + HDFStore.get + +Standard moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: pandas.stats.moments + +.. autosummary:: + :toctree: generated/ + + rolling_count + rolling_sum + rolling_mean + rolling_median + rolling_var + rolling_std + rolling_corr + rolling_cov + rolling_skew + rolling_kurt + rolling_apply + rolling_quantile + +Exponentially-weighted moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + ewma + ewmstd + ewmvar + ewmcorr + ewmcov + +.. currentmodule:: pandas + .. _api.series: Series @@ -20,6 +102,8 @@ Attributes and underlying data Series.values Series.dtype + Series.isnull + Series.notnull Conversion / Constructors ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,12 +173,14 @@ Computations / Descriptive Stats Series.std Series.sum Series.var + Series.value_counts Reindexing / Selection / Label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ + Series.align Series.drop Series.reindex Series.reindex_like @@ -178,6 +264,8 @@ Attributes and underlying data :toctree: generated/ DataFrame.as_matrix + DataFrame.dtypes + DataFrame.get_dtype_counts DataFrame.values DataFrame.axes DataFrame.ndim @@ -267,6 +355,7 @@ Reindexing / Selection / Label manipulation DataFrame.add_prefix DataFrame.add_suffix + DataFrame.align DataFrame.drop DataFrame.filter DataFrame.reindex @@ -355,75 +444,3 @@ Panel Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Input / Output --------------- - -.. currentmodule:: pandas.io.parsers - -File IO -~~~~~~~ - -.. autosummary:: - :toctree: generated/ - - read_table - read_csv - ExcelFile.parse - -HDFStore: PyTables (HDF5) -~~~~~~~~~~~~~~~~~~~~~~~~~ -.. currentmodule:: pandas.io.pytables - -.. autosummary:: - :toctree: generated/ - - HDFStore.put - HDFStore.get - -GroupBy -------- - -.. 
currentmodule:: pandas.core.groupby
-
-.. autosummary::
-   :toctree: generated/
-
-   groupby
-
-Moving window statistics
-------------------------
-
-Standard moving window functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. currentmodule:: pandas.stats.moments
-
-.. autosummary::
-   :toctree: generated/
-
-   rolling_count
-   rolling_sum
-   rolling_mean
-   rolling_median
-   rolling_var
-   rolling_std
-   rolling_corr
-   rolling_cov
-   rolling_skew
-   rolling_kurt
-   rolling_apply
-   rolling_quantile
-
-Exponentially-weighted moving window functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autosummary::
-   :toctree: generated/
-
-   ewma
-   ewmstd
-   ewmvar
-   ewmcorr
-   ewmcov
-
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index ebfeabb2021fe..03c663565dc34 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -467,7 +467,7 @@ Reindexing to align with another object
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 You may wish to take an object and reindex its axes to be labeled the same as
-another object. While the syntax for this is straightforwad albeit verbose, it
+another object. While the syntax for this is straightforward albeit verbose, it
 is a common enough operation that the ``reindex_like`` method is available to
 make this simpler:
 
@@ -484,6 +484,36 @@ make this simpler:
    df2
    df.reindex_like(df2)
 
+.. _basics.align:
+
+Aligning objects with each other with ``align``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``align`` method is the fastest way to simultaneously align two objects. It
+supports a ``join`` argument (related to :ref:`joining and merging <merging>`):
+
+  - ``join='outer'``: take the union of the indexes
+  - ``join='left'``: use the calling object's index
+  - ``join='right'``: use the passed object's index
+  - ``join='inner'``: intersect the indexes
+
+It returns a tuple with both of the reindexed Series:
+
+.. ipython:: python
+
+   s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e'])
+   s1 = s[:4]
+   s2 = s[1:]
+   s1.align(s2)
+   s1.align(s2, join='inner')
+   s1.align(s2, join='left')
+
+For DataFrames, the join method will be applied to both the index and the columns:
+
+.. ipython:: python
+
+   df.align(df2, join='inner')
+
 .. _basics.reindex_fill:
 
 Filling while reindexing
@@ -573,6 +603,9 @@ Series, it need only contain a subset of the labels as keys:
    df.rename(columns={'one' : 'foo', 'two' : 'bar'},
              index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'})
 
+The Panel class has a related ``rename_axis`` method which can rename any of
+its three axes.
+
 Iteration
 ---------
 
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 214c74a740a6d..bdab2ce154df6 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -187,6 +187,17 @@ As we will see later on, the same operation could be accomplished by
 reindexing. However, the syntax would be more verbose; hence, the inclusion of
 this indexing method.
 
+Selecting DataFrame columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can pass a list of columns to ``[]`` to select columns in that order:
+
+.. ipython:: python
+
+   df[['C', 'A', 'B']]
+
+If a column is not contained in the DataFrame, an exception will be raised.
+
 ..
_indexing.advanced: Advanced indexing with labels From a324d8dd95fcc252b03043511b97737ace56e57f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Oct 2011 21:42:46 -0400 Subject: [PATCH 128/161] REF: renamed xby/yby to rows/cols in pivot_table --- doc/source/reshaping.rst | 12 ++++++------ pandas/tools/pivot.py | 20 ++++++++++---------- pandas/tools/tests/test_pivot.py | 28 ++++++++++++++-------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index cfa6bdf2267c3..c2227511d9d40 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -208,8 +208,8 @@ tables. It takes a number of arguments - ``data``: A DataFrame object - ``values``: column to aggregate -- ``xby``: list of columns to group by on the `x`-axis -- ``yby``: list of columns to group by on the `y`-axis +- ``rows``: list of columns to group by on the table rows +- ``cols``: list of columns to group by on the table columns - ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` Consider a data set like this: @@ -227,8 +227,8 @@ We can produce pivot tables from this data very easily: .. ipython:: python - pivot_table(df, values='D', xby=['A', 'B'], yby=['C']) - pivot_table(df, values='D', xby=['B'], yby=['A', 'C'], aggfunc=np.sum) + pivot_table(df, values='D', rows=['A', 'B'], cols=['C']) + pivot_table(df, values='D', rows=['B'], cols=['A', 'C'], aggfunc=np.sum) The result object is a DataFrame having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -237,12 +237,12 @@ hierarchy in the columns: .. ipython:: python - pivot_table(df, xby=['A', 'B'], yby=['C']) + pivot_table(df, rows=['A', 'B'], cols=['C']) You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: .. ipython:: python - table = pivot_table(df, xby=['A', 'B'], yby=['C']) + table = pivot_table(df, rows=['A', 'B'], cols=['C']) print table.to_string(na_rep='') diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 858d02d5a5f03..070fddb8f9e15 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,7 +1,7 @@ from pandas import DataFrame import numpy as np -def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean, +def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean, fill_value=None): """ Create a spreadsheet-style pivot table as a DataFrame. 
The levels in the
@@ -12,9 +12,9 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean,
     ----------
     data : DataFrame
     values : column to aggregate, optional
-    xby : list
+    rows : list
         Columns to group on the rows of the pivot table
-    yby : list
+    cols : list
        Columns to group on the columns of the pivot table
     aggfunc : function, default numpy.mean
     fill_value : scalar, default None
@@ -34,8 +34,8 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean,
     7   bar  two  small  6
     8   bar  two  large  7

-    >>> table = pivot_table(df, values='D', xby=['A, 'B'],
-                            yby=['C'], aggfunc=np.sum)
+    >>> table = pivot_table(df, values='D', rows=['A', 'B'],
+                            cols=['C'], aggfunc=np.sum)
     >>> table
               small  large
     foo  one  1      4
@@ -47,10 +47,10 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean,
     -------
     table : DataFrame
     """
-    xby = _convert_by(xby)
-    yby = _convert_by(yby)
+    rows = _convert_by(rows)
+    cols = _convert_by(cols)

-    keys = xby + yby
+    keys = rows + cols
     grouped = data.groupby(keys)

     if values is not None:
@@ -59,7 +59,7 @@ def pivot_table(data, values=None, xby=None, yby=None, aggfunc=np.mean,
         agged = grouped.agg(aggfunc)

     table = agged
-    for k in yby:
+    for k in cols:
         table = table.unstack(level=k)

     if fill_value is not None:
@@ -100,5 +100,5 @@ def _sample(values, n):
     data = DataFrame(data)

     table = pivot_table(data, values='values',
-                        xby=['k1', 'k2'], yby=['k3', 'k4'])
+                        rows=['k1', 'k2'], cols=['k3', 'k4'])
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
index ad34b3f697303..b1cf2546817b2 100644
--- a/pandas/tools/tests/test_pivot.py
+++ b/pandas/tools/tests/test_pivot.py
@@ -22,28 +22,28 @@ def setUp(self):
                                'E' : np.random.randn(11)})

     def test_pivot_table(self):
-        xby = ['A', 'B']
-        yby= 'C'
-        table = pivot_table(self.data, values='D', xby=xby, yby=yby)
+        rows = ['A', 'B']
+        cols = 'C'
+        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

-        if len(xby) > 1:
-            self.assertEqual(table.index.names, xby)
+        if len(rows) > 1:
+            self.assertEqual(table.index.names, rows)
         else:
-            self.assertEqual(table.index.name, xby[0])
+            self.assertEqual(table.index.name, rows[0])

-        if len(yby) > 1:
-            self.assertEqual(table.columns.names, yby)
+        if len(cols) > 1:
+            self.assertEqual(table.columns.names, cols)
         else:
-            self.assertEqual(table.columns.name, yby[0])
+            self.assertEqual(table.columns.name, cols[0])

-        expected = self.data.groupby(xby + [yby])['D'].agg(np.mean).unstack()
+        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
         assert_frame_equal(table, expected)

     def test_pivot_table_multiple(self):
-        xby = ['A', 'B']
-        yby= 'C'
-        table = pivot_table(self.data, xby=xby, yby=yby)
-        expected = self.data.groupby(xby + [yby]).agg(np.mean).unstack()
+        rows = ['A', 'B']
+        cols = 'C'
+        table = pivot_table(self.data, rows=rows, cols=cols)
+        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
         assert_frame_equal(table, expected)

 if __name__ == '__main__':
From eddd5c9a997c712808513cfb37e2168357d995de Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 23 Oct 2011 22:41:54 -0400
Subject: [PATCH 129/161] TST: frame.py test coverage

---
 pandas/core/common.py      |  6 +++--
 pandas/core/frame.py       | 46 ++++++--------------------------------
 pandas/tests/test_frame.py | 32 +++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 01d8073c88e14..b321b827c5dcf 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -4,7 +4
@@ import cPickle try: from io import BytesIO -except ImportError: # Python < 2.6 +except ImportError: # pragma: no cover + # Python < 2.6 from cStringIO import StringIO as BytesIO import itertools @@ -486,7 +487,8 @@ def __init__(self, seq, key=lambda x:x): self.setdefault(k, []).append(value) try: __iter__ = dict.iteritems - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 def __iter__(self): return iter(dict.items(self)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5532d2a91812e..7f1beca73304e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -279,7 +279,7 @@ def iteritems(self): return ((k, self[k]) for k in self.columns) iterkv = iteritems - if py3compat.PY3: + if py3compat.PY3: # pragma: no cover items = iteritems def __len__(self): @@ -726,25 +726,6 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover self._data = dm._data - #---------------------------------------------------------------------- - # Private helper methods - - def _intersect_index(self, other): - common_index = self.index - - if not common_index.equals(other.index): - common_index = common_index.intersection(other.index) - - return common_index - - def _intersect_columns(self, other): - common_cols = self.columns - - if not common_cols.equals(other.columns): - common_cols = common_cols.intersection(other.columns) - - return common_cols - #---------------------------------------------------------------------- # Array interface @@ -981,13 +962,9 @@ def xs(self, key, axis=0, copy=True): return Series(new_values, index=self.columns, name=key) else: new_data = self._data.xs(key, axis=1, copy=copy) - if new_data.ndim == 1: - return Series(new_data.as_matrix(), index=self.columns, - name=key) - else: - result = DataFrame(new_data) - result.index = _maybe_droplevels(result.index, key) - return result + result = DataFrame(new_data) + result.index = _maybe_droplevels(result.index, key) + return result #---------------------------------------------------------------------- # Reindexing and alignment @@ -1465,15 +1442,6 @@ def _combine_frame(self, other, func, fill_value=None): this, other = self.align(other, join='outer', copy=False) new_index, new_columns = this.index, this.columns - # some shortcuts - if fill_value is None: - if not self and not other: - return self._constructor(index=new_index) - elif not self: - return other * nan - elif not other: - return self * nan - this_vals = this.values other_vals = other.values @@ -2174,7 +2142,7 @@ def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): return self._join_index(other, how, lsuffix, rsuffix) def _join_on(self, other, on, how, lsuffix, rsuffix): - if how not in ['left', 'inner']: + if how not in ('left', 'inner'): # pragma: no cover raise Exception('Only inner / left joins currently supported') if isinstance(other, Series): @@ -3295,7 +3263,7 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) -def install_ipython_completers(): +def install_ipython_completers(): # pragma: no cover """Register the DataFrame type with IPython's tab completion machinery, so that it knows about accessing column names as attributes.""" from IPython.utils.generics import complete_object @@ -3307,7 +3275,7 @@ def complete_dataframe(obj, prev_completions): # Importing IPython brings in about 200 modules, so we want to avoid it unless # we're in IPython (when those modules are loaded anyway). 
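The comment closing the hunk above captures a reusable lazy-registration idiom; a rough standalone sketch, under the assumption that IPython's ``complete_object`` hook is available (the dict completer below is a made-up stand-in, not pandas code)::

    import sys

    def install_completers():
        # deferred import: pulling IPython in eagerly would load ~200 modules
        from IPython.utils.generics import complete_object

        @complete_object.when_type(dict)  # dict stands in for DataFrame here
        def complete_mapping(obj, prev_completions):
            return prev_completions + [str(k) for k in obj.keys()]

    # register only if the hosting interpreter already imported IPython
    if "IPython" in sys.modules:
        try:
            install_completers()
        except Exception:
            pass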
-if "IPython" in sys.modules: +if "IPython" in sys.modules: # pragma: no cover try: install_ipython_completers() except Exception: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index bb99e55a7cb6d..a763dc8cac458 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1095,6 +1095,14 @@ def test_constructor_Series_copy_bug(self): df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) df.copy() + def test_constructor_mixed_dict_and_Series(self): + data = {} + data['A'] = {'foo' : 1, 'bar' : 2, 'baz' : 3} + data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo']) + + result = DataFrame(data) + self.assert_(result.index.is_monotonic) + def test_astype(self): casted = self.frame.astype(int) expected = DataFrame(self.frame.values.astype(int), @@ -1398,6 +1406,13 @@ def test_arith_flex_frame(self): const_add = self.frame.add(1) assert_frame_equal(const_add, self.frame + 1) + # corner cases + result = self.frame.add(self.frame[:0]) + assert_frame_equal(result, self.frame * np.nan) + + result = self.frame[:0].add(self.frame) + assert_frame_equal(result, self.frame * np.nan) + def test_arith_flex_series(self): df = self.simple @@ -2160,8 +2175,13 @@ def test_reindex_columns(self): newFrame = self.frame.reindex(columns=[]) self.assert_(not newFrame) - def test_reindex_mixed(self): - pass + def test_align(self): + + af, bf = self.frame.align(self.frame) + self.assert_(af._data is not self.frame._data) + + af, bf = self.frame.align(self.frame, copy=False) + self.assert_(af._data is self.frame._data) #---------------------------------------------------------------------- # Transposing @@ -3195,6 +3215,11 @@ def test_join_on_multikey(self): # TODO: columns aren't in the same order yet assert_frame_equal(joined, expected.ix[:, joined.columns]) + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + def test_join_index_mixed(self): df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, @@ -3234,9 +3259,6 @@ def test_join_index_mixed(self): expected = _join_by_hand(df2, df1, how=kind) assert_frame_equal(joined, expected) - def test_join_on_series(self): - pass - def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() From 394bb0d9d84fb6628d212c6a5ad0c38de9d06a7c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 00:59:28 -0400 Subject: [PATCH 130/161] ENH: add Panel.take, implement set ops between MultiIndex and Index. plus test coverage --- RELEASE.rst | 2 + pandas/__init__.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 24 +++++++++- pandas/core/index.py | 80 +++++++++++++++------------------ pandas/core/internals.py | 29 +++++------- pandas/core/panel.py | 6 ++- pandas/core/reshape.py | 5 ++- pandas/core/series.py | 17 +++---- pandas/tests/test_index.py | 42 ++++++++++++++--- pandas/tests/test_multilevel.py | 8 ++++ pandas/tests/test_panel.py | 33 ++++++++++++-- 12 files changed, 161 insertions(+), 89 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index a10daa819e8d0..9e49899be125f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -128,6 +128,7 @@ feedback on the library. 
- Added `pivot_table` convenience function to pandas namespace (GH #234) - Implemented `Panel.rename_axis` function (GH #243) - DataFrame will show index level names in console output + - Implemented `Panel.take` **Improvements to existing features** @@ -189,6 +190,7 @@ feedback on the library. issue GH #262 - Can pass list of tuples to `Series` (GH #270) - Can pass level name to `DataFrame.stack` + - Support set operations between MultiIndex and Index Thanks ------ diff --git a/pandas/__init__.py b/pandas/__init__.py index ae69b6f7a907f..fb7f14c522daa 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -8,7 +8,7 @@ try: import pandas._tseries as lib -except Exception, e: +except Exception, e: # pragma: no cover if 'No module named' in e.message: raise ImportError('C extensions not built: if you installed already ' 'verify that you are not importing from the source ' diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f1beca73304e..834e1f03ac468 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2308,7 +2308,7 @@ def count(self, axis=0, level=None, numeric_only=False): else: frame = self - result = frame.apply(Series.count, axis=axis) + result = DataFrame.apply(frame, Series.count, axis=axis) # what happens with empty DataFrame if isinstance(result, DataFrame): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8689542035107..f4ff2ab0936d5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -204,7 +204,7 @@ def sort_index(self, axis=0, ascending=True): def ix(self): raise NotImplementedError - def reindex(self, **kwds): + def reindex(self, *args, **kwds): raise NotImplementedError class NDFrame(PandasObject): @@ -486,3 +486,25 @@ def rename_axis(self, mapper, axis=0, copy=True): new_data = new_data.copy() return self._constructor(new_data) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + + Parameters + ---------- + indices : list / array of ints + axis : int, default 0 + + Returns + ------- + taken : type of caller + """ + if axis == 0: + labels = self._get_axis(axis) + new_items = labels.take(indices) + new_data = self._data.reindex_items(new_items) + else: + new_data = self._data.take(indices, axis=axis) + return self._constructor(new_data) + diff --git a/pandas/core/index.py b/pandas/core/index.py index ba46d4ddda129..914ac9fd6d543 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -39,6 +39,11 @@ class Index(np.ndarray): ---- An Index instance can **only** contain hashable objects """ + _map_indices = lib.map_indices_object + _is_monotonic = lib.is_monotonic_object + _groupby = lib.groupby_object + _arrmap = lib.arrmap_object + name = None def __new__(cls, data, dtype=None, copy=False, name=None): if isinstance(data, np.ndarray): @@ -67,6 +72,10 @@ def dtype(self): def nlevels(self): return 1 + @property + def _constructor(self): + return Index + def summary(self): if len(self) > 0: index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) @@ -82,15 +91,16 @@ def values(self): @cache_readonly def is_monotonic(self): - return lib.is_monotonic_object(self) + return self._is_monotonic(self) _indexMap = None _integrity = False + @property def indexMap(self): "{label -> location}" if self._indexMap is None: - self._indexMap = lib.map_indices_object(self) + self._indexMap = self._map_indices(self) self._integrity = len(self._indexMap) == len(self) if not self._integrity: @@ -185,7 +195,7 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ taken = 
self.view(np.ndarray).take(*args, **kwargs) - return Index(taken, name=self.name) + return self._constructor(taken, name=self.name) def format(self, name=False): """ @@ -305,7 +315,7 @@ def union(self, other): return _ensure_index(other) if self.is_monotonic and other.is_monotonic: - result = lib.outer_join_indexer_object(self, other)[0] + result = lib.outer_join_indexer_object(self, other.values)[0] else: indexer = self.get_indexer(other) indexer = (indexer == -1).nonzero()[0] @@ -356,9 +366,10 @@ def intersection(self, other): other = other.astype(object) if self.is_monotonic and other.is_monotonic: - return Index(lib.inner_join_indexer_object(self, other)[0]) + return Index(lib.inner_join_indexer_object(self, + other.values)[0]) else: - indexer = self.get_indexer(other) + indexer = self.get_indexer(other.values) indexer = indexer.take((indexer != -1).nonzero()[0]) return self.take(indexer) @@ -446,10 +457,10 @@ def get_indexer(self, target, method=None): return indexer def groupby(self, to_groupby): - return lib.groupby_object(self.values, to_groupby) + return self._groupby(self.values, to_groupby) def map(self, mapper): - return lib.arrmap_object(self.values, mapper) + return self._arrmap(self.values, mapper) def _get_method(self, method): if method: @@ -621,6 +632,11 @@ def copy(self, order='C'): class Int64Index(Index): + _map_indices = lib.map_indices_int64 + _is_monotonic = lib.is_monotonic_int64 + _groupby = lib.groupby_int64 + _arrmap = lib.arrmap_int64 + def __new__(cls, data, dtype=None, copy=False, name=None): if not isinstance(data, np.ndarray): if np.isscalar(data): @@ -648,6 +664,10 @@ def __new__(cls, data, dtype=None, copy=False, name=None): subarr.name = name return subarr + @property + def _constructor(self): + return Int64Index + def astype(self, dtype): return Index(self.values.astype(dtype)) @@ -655,22 +675,6 @@ def astype(self, dtype): def dtype(self): return np.dtype('int64') - @cache_readonly - def is_monotonic(self): - return lib.is_monotonic_int64(self) - - @property - def indexMap(self): - "{label -> location}" - if self._indexMap is None: - self._indexMap = lib.map_indices_int64(self) - self._integrity = len(self._indexMap) == len(self) - - if not self._integrity: - raise Exception('Index cannot contain duplicate values!') - - return self._indexMap - def is_all_dates(self): """ Checks that all the labels are datetime objects @@ -771,19 +775,6 @@ def union(self, other): return Int64Index(result) union.__doc__ = Index.union.__doc__ - def groupby(self, to_groupby): - return lib.groupby_int64(self, to_groupby) - - def map(self, mapper): - return lib.arrmap_int64(self, mapper) - - def take(self, *args, **kwargs): - """ - Analogous to ndarray.take - """ - taken = self.values.take(*args, **kwargs) - return Int64Index(taken, name=self.name) - class DateIndex(Index): pass @@ -1267,16 +1258,9 @@ def get_indexer(self, target, method=None): """ method = self._get_method(method) + target_index = target if isinstance(target, MultiIndex): target_index = target.get_tuple_index() - else: - if len(target) > 0: - val = target[0] - if not isinstance(val, tuple) or len(val) != self.nlevels: - raise ValueError('can only pass MultiIndex or ' - 'array of tuples') - - target_index = target self_index = self.get_tuple_index() @@ -1509,6 +1493,9 @@ def union(self, other): ------- Index """ + if not isinstance(other, MultiIndex): + return other.union(self) + self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other): @@ -1533,6 +1520,9 @@ def intersection(self, other): 
        -------
        Index
        """
+        if not isinstance(other, MultiIndex):
+            return other.intersection(self)
+
         self._assert_can_do_setop(other)

         if self.equals(other):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 05a5526bbbb2b..27fc245bf0547 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -176,31 +176,19 @@ def should_store(self, value):
         # unnecessarily
         return issubclass(value.dtype.type, np.floating)

-    def can_store(self, value):
-        return issubclass(value.dtype.type, (np.integer, np.floating))
-
 class IntBlock(Block):

     def should_store(self, value):
-        return self.can_store(value)
-
-    def can_store(self, value):
         return issubclass(value.dtype.type, np.integer)

 class BoolBlock(Block):

     def should_store(self, value):
-        return self.can_store(value)
-
-    def can_store(self, value):
         return issubclass(value.dtype.type, np.bool_)

 class ObjectBlock(Block):

     def should_store(self, value):
-        return self.can_store(value)
-
-    def can_store(self, value):
         return not issubclass(value.dtype.type,
                               (np.integer, np.floating, np.bool_))

@@ -676,21 +664,24 @@ def reindex_items(self, new_items):

         return BlockManager(new_blocks, new_axes)

-    def take(self, indexer, axis=1, pandas_indexer=False):
+    def take(self, indexer, axis=1):
         if axis == 0:
             raise NotImplementedError

-        if pandas_indexer:
-            take_f = lambda arr: common.take_fast(arr, indexer,
-                                                  None, False, axis=axis)
-        else:
-            take_f = lambda arr: arr.take(indexer, axis=axis)
+        indexer = np.asarray(indexer, dtype='i4')
+
+        n = len(self.axes[axis])
+        if ((indexer == -1) | (indexer >= n)).any():
+            raise Exception('Indices must be nonnegative and less than '
+                            'the axis length')

         new_axes = list(self.axes)
         new_axes[axis] = self.axes[axis].take(indexer)
         new_blocks = []
         for blk in self.blocks:
-            newb = make_block(take_f(blk.values), blk.items, self.items)
+            new_values = common.take_fast(blk.values, indexer,
+                                          None, False, axis=axis)
+            newb = make_block(new_values, blk.items, self.items)
             new_blocks.append(newb)

         return BlockManager(new_blocks, new_axes)
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
index bc591c530f3ca..95bba59e77cd3 100644
--- a/pandas/core/panel.py
+++ b/pandas/core/panel.py
@@ -665,7 +665,8 @@ def fillna(self, value=None, method='pad'):

     try:
         divide = div = _panel_arith_method(operator.div, 'divide')
-    except AttributeError: # Python 3
+    except AttributeError: # pragma: no cover
+        # Python 3
         divide = div = _panel_arith_method(operator.truediv, 'divide')

     def major_xs(self, key, copy=True):
@@ -1235,7 +1236,8 @@ def _combine_panel_frame(self, other, func, axis='items'):

     try:
         divide = div = _panel_arith_method(operator.div, 'divide')
-    except AttributeError: # Python 3
+    except AttributeError: # pragma: no cover
+        # Python 3
         divide = div = _panel_arith_method(operator.truediv, 'divide')

     def to_wide(self):
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index a746f1286781a..533deef603a6d 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -287,6 +287,9 @@ def stack(frame, level=-1, dropna=True):
     stacked : Series
     """
     N, K = frame.shape
+    if isinstance(level, int) and level < 0:
+        level += frame.columns.nlevels
+
     level = frame.columns._get_level_number(level)

     if isinstance(frame.columns, MultiIndex):
@@ -318,8 +321,6 @@ def _stack_multi_columns(frame, level=-1, dropna=True):
     this = frame.copy()

-    if level < 0:
-        level += frame.columns.nlevels
-
     # this makes life much simpler
     if level != frame.columns.nlevels - 1:
diff --git a/pandas/core/series.py
b/pandas/core/series.py index 2763e5216bcc3..e6648c677070b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -9,12 +9,6 @@ import itertools import operator -try: - from collections import Counter -except ImportError: - # For Python < 2.7, we include a local copy of this: - from pandas.util.counter import Counter - from numpy import nan, ndarray import numpy as np @@ -444,7 +438,7 @@ def iteritems(self): return itertools.izip(iter(self.index), iter(self)) iterkv = iteritems - if py3compat.PY3: + if py3compat.PY3: # pragma: no cover items = iteritems #---------------------------------------------------------------------- @@ -908,6 +902,12 @@ def describe(self): ------- desc : Series """ + try: + from collections import Counter + except ImportError: # pragma: no cover + # For Python < 2.7, we include a local copy of this: + from pandas.util.counter import Counter + if self.dtype == object: names = ['count', 'unique', 'top', 'freq'] @@ -1094,7 +1094,8 @@ def _binop(self, other, func, fill_value=None): mul = _flex_method(operator.mul, 'multiply') try: div = _flex_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 div = _flex_method(operator.truediv, 'divide') def combine(self, other, func, fill_value=nan): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index d2308f4e5e126..e82bb58acd0fc 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -34,6 +34,7 @@ def test_deepcopy(self): def test_duplicates(self): idx = Index([0, 0, 0]) self.assert_(not idx._verify_integrity()) + self.assertRaises(Exception, getattr, idx, 'indexMap') def test_sort(self): self.assertRaises(Exception, self.strIndex.sort) @@ -582,6 +583,13 @@ def test_constructor_single_level(self): self.assert_(not isinstance(single_level, MultiIndex)) self.assert_(single_level.name == 'first') + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]]) + self.assert_(single_level.name is None) + + def test_constructor_no_levels(self): + self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + def test_from_arrays(self): arrays = [] for lev, lab in zip(self.index.levels, self.index.labels): @@ -832,9 +840,17 @@ def test_equals(self): self.assert_(not self.index.equals(self.index.get_tuple_index())) # different number of levels - index = MultiIndex(levels=self.index.levels[:-1], - labels=self.index.labels[:-1]) - self.assert_(not self.index.equals(index)) + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + index2 = MultiIndex(levels=index.levels[:-1], + labels=index.labels[:-1]) + self.assert_(not index.equals(index2)) + self.assert_(not index.equal_levels(index2)) # levels are different major_axis = Index(range(4)) @@ -877,8 +893,19 @@ def test_union(self): the_union = self.index.union(self.index[:0]) self.assert_(the_union is self.index) - self.assertRaises(TypeError, self.index.union, - self.index.get_tuple_index()) + tuples = self.index.get_tuple_index() + result = self.index[:4] | tuples[4:] + self.assert_(result.equals(tuples)) + + def test_union_with_regular_index(self): + other = Index(['A', 'B', 'C']) + + result = other.union(self.index) + self.assert_(('foo', 'one') in result) + self.assert_('B' in result) + + result2 = self.index.union(other) + self.assert_(result.equals(result2)) def 
test_intersection(self): piece1 = self.index[:5][::-1] @@ -893,8 +920,9 @@ def test_intersection(self): the_int = self.index.intersection(self.index) self.assert_(the_int is self.index) - self.assertRaises(TypeError, self.index.intersection, - self.index.get_tuple_index()) + tuples = self.index.get_tuple_index() + result = self.index & tuples + self.assert_(result.equals(tuples)) def test_diff(self): first = self.index diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 53f1d286e962d..1587aa205c6d3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -350,6 +350,10 @@ def test_stack(self): ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + # stack with negative number + result = self.ymd.unstack(0).stack(-2) + expected = self.ymd.unstack(0).stack(0) + def test_stack_mixed_dtype(self): df = self.frame.T df['foo', 'four'] = 'foo' @@ -392,6 +396,10 @@ def test_stack_level_name(self): expected = self.frame.unstack().stack(0) assert_frame_equal(result, expected) + result = self.frame.stack('exp') + expected = self.frame.stack() + assert_series_equal(result, expected) + def test_groupby_transform(self): s = self.frame['A'] grouper = s.index.get_level_values(0) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 387839c09506a..e54485f2e2059 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -419,6 +419,11 @@ def test_xs(self): itemA_view.values[:] = np.nan self.assert_(np.isnan(self.panel['ItemA'].values).all()) + # mixed-type + self.panel['strings'] = 'foo' + self.assertRaises(Exception, self.panel.xs, 'D', axis=2, + copy=False) + def test_getitem_fancy_labels(self): p = self.panel @@ -670,6 +675,20 @@ def test_reindex_like(self): smaller_like = self.panel.reindex_like(smaller) assert_panel_equal(smaller, smaller_like) + def test_take(self): + # axis == 0 + result = self.panel.take([2, 0, 1], axis=0) + expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + assert_panel_equal(result, expected) + + # axis >= 1 + result = self.panel.take([3, 0, 1, 2], axis=2) + expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + assert_panel_equal(result, expected) + + self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) + self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + def test_sort_index(self): import random @@ -985,6 +1004,17 @@ def test_combine_scalar(self): expected = DataFrame(self.panel._data) * 2 assert_frame_equal(result, expected) + def test_combine_series(self): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) + + s = self.panel.ix[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) + def test_operators(self): wp = self.panel.to_wide() result = (self.panel + 1).to_wide() @@ -1000,9 +1030,6 @@ def is_sorted(arr): sorted_major = sorted_minor.sortlevel(level=0) self.assert_(is_sorted(sorted_major.major_labels)) - def test_to_wide(self): - pass - def test_toCSV(self): self.panel.toCSV('__tmp__') os.remove('__tmp__') From f6aa7cae1a28604f98b6297b315e9fd6190df31d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 10:22:42 -0400 Subject: [PATCH 131/161] TST: groupby unit tests --- pandas/core/groupby.py | 10 ++-------- pandas/tests/test_groupby.py | 5 +++++ pandas/tests/test_index.py | 3 +++ 3 files changed, 
10 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 216669c1dbf29..eccdafeaf1fcf 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -690,13 +690,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs): except Exception: result = self._aggregate_named(func_or_funcs, *args, **kwargs) - if len(result) > 0: - if isinstance(result.values()[0], Series): - ret = DataFrame(result).T - else: - ret = Series(result) - else: - ret = Series({}) + ret = Series(result) if not self.as_index: # pragma: no cover print 'Warning, ignoring as_index=True' @@ -761,7 +755,7 @@ def _aggregate_simple(self, func, *args, **kwargs): result = {} for k, v in self.primary.indices.iteritems(): agged = func(values.take(v), *args, **kwargs) - if isinstance(output, np.ndarray): + if isinstance(agged, np.ndarray): raise Exception('Must produce aggregated value') result[k] = agged diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 13204044df9f1..67ecbd6fd81da 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -111,6 +111,11 @@ def test_agg_regression1(self): expected = grouped.mean() assert_frame_equal(result, expected) + def test_agg_must_add(self): + grouped = self.df.groupby('A')['C'] + self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) + self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) + def test_get_group(self): wp = tm.makePanel() grouped = wp.groupby(lambda x: x.month, axis='major') diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index e82bb58acd0fc..f48b058770590 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -950,6 +950,9 @@ def test_diff(self): result = self.index - self.index.sortlevel(1)[0] self.assert_(len(result) == 0) + # raise Exception called with non-MultiIndex + self.assertRaises(Exception, first.diff, first.get_tuple_index()) + def test_from_tuples(self): self.assertRaises(Exception, MultiIndex.from_tuples, []) From 6141961de7908490bf550aa5075627386236854a Mon Sep 17 00:00:00 2001 From: lodagro Date: Mon, 3 Oct 2011 14:09:45 +0200 Subject: [PATCH 132/161] Adding set_eng_float_format(), which controls default float format for DataFrame. --- pandas/__init__.py | 1 + pandas/core/common.py | 106 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/pandas/__init__.py b/pandas/__init__.py index fb7f14c522daa..7d5ecf84fddf0 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -21,6 +21,7 @@ from pandas.core.api import * from pandas.core.common import set_printoptions +from pandas.core.common import set_eng_float_format from pandas.io.parsers import read_csv, read_table, ExcelFile from pandas.io.pytables import HDFStore from pandas.stats.api import * diff --git a/pandas/core/common.py b/pandas/core/common.py index b321b827c5dcf..b800bf3aa7172 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -12,6 +12,9 @@ from numpy.lib.format import read_array, write_array import numpy as np +import decimal +import math + import pandas._tseries as lib # XXX: HACK for NumPy 1.5.1 to suppress warnings @@ -355,6 +358,109 @@ def set_printoptions(precision=None, column_space=None): if column_space is not None: _column_space = column_space +class EngFormatter(object): + """ + Formats float values according to engineering format. 
+ + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y" + } + + def __init__(self, precision=None, use_eng_prefix=False): + self.precision = precision + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num): + """ Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. Some examples: + + >>> format_eng(0) for self.precision = 0 + '0' + + >>> format_eng(1000000) for self.precision = 1, + self.use_eng_prefix = True + '1.0M' + + >>> format_eng("-1e-6") for self.precision = 2 + self.use_eng_prefix = False + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + + dnum = decimal.Decimal(str(num)) + + sign = 1 + + if dnum < 0: + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = decimal.Decimal(int(math.floor(dnum.log10()/3)*3)) + else: + pow10 = decimal.Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + else: + if int_pow10 < 0: + prefix = 'E-%02d' % (-int_pow10) + else: + prefix = 'E+%02d' % int_pow10 + + mant = sign*dnum/(10**pow10) + + if self.precision is None: + format_str = u"%g%s" + elif self.precision == 0: + format_str = u"%i%s" + elif self.precision > 0: + format_str = (u"%%.%if%%s" % self.precision) + + formatted = format_str % (mant, prefix) + + return formatted.strip() + +def set_eng_float_format(precision=3, use_eng_prefix=False): + """ + Alter default behavior on how float is formatted in DataFrame. + Format float in engineering format. + + See also EngFormatter. + """ + global _float_format, _column_space + _float_format = EngFormatter(precision, use_eng_prefix) + _column_space = max(12, precision + 9) + _float_format = lambda x: '%.4g' % x _column_space = 12 From b57e87bef71b3061ea76ddb38fbce358b91b20f4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 10:25:49 -0400 Subject: [PATCH 133/161] MSC: Ignore vi .swp files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 564f27c5cd9c2..7699d72823d22 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.swp build dist MANIFEST From f587bd1ca123d76529e64528868cd8916e57e8da Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 11:20:14 -0400 Subject: [PATCH 134/161] DOC: release notes --- RELEASE.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 9e49899be125f..5ab212d8c3e0a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -129,6 +129,8 @@ feedback on the library. 
- Implemented `Panel.rename_axis` function (GH #243) - DataFrame will show index level names in console output - Implemented `Panel.take` + - Add `set_eng_float_format` function for setting alternate DataFrame + floating point string formatting **Improvements to existing features** From 3b920ae30e1aa98718009635df64c32301664817 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 11:25:07 -0400 Subject: [PATCH 135/161] BUG: corner cases in MultiIndex set operations --- RELEASE.rst | 1 + pandas/core/index.py | 17 ++++++++++++++--- pandas/tests/test_index.py | 21 ++++++++++++++++++--- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 5ab212d8c3e0a..4428553c948ad 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -193,6 +193,7 @@ feedback on the library. - Can pass list of tuples to `Series` (GH #270) - Can pass level name to `DataFrame.stack` - Support set operations between MultiIndex and Index + - Fix many corner cases in MultiIndex set operations Thanks ------ diff --git a/pandas/core/index.py b/pandas/core/index.py index 914ac9fd6d543..62a141378ec0e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1501,12 +1501,15 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self + result_names = self.names if self.names == other.names else None + # TODO: optimize / make less wasteful self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() uniq_tuples = lib.fast_unique_multiple([self_tuples, other_tuples]) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0) + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) def intersection(self, other): """ @@ -1528,11 +1531,19 @@ def intersection(self, other): if self.equals(other): return self + result_names = self.names if self.names == other.names else None + # TODO: optimize / make less wasteful self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0) + if len(uniq_tuples) == 0: + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) + else: + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) def diff(self, other): """ @@ -1553,7 +1564,7 @@ def diff(self, other): difference = sorted(set(self.values) - set(other.values)) - if not difference: + if len(difference) == 0: return MultiIndex(levels=[[]]*self.nlevels, labels=[[]]*self.nlevels, names=result_names) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f48b058770590..5db73bdd35068 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -920,6 +920,11 @@ def test_intersection(self): the_int = self.index.intersection(self.index) self.assert_(the_int is self.index) + # empty intersection: disjoint + empty = self.index[:2] & self.index[2:] + expected = self.index[:0] + self.assert_(empty.equals(expected)) + tuples = self.index.get_tuple_index() result = self.index & tuples self.assert_(result.equals(tuples)) @@ -935,9 +940,19 @@ def test_diff(self): self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) - # empty difference - result = first - first - expected = first[:0] + # empty difference: reflexive + result = self.index - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + + # empty difference: superset + result = self.index[-3:] - 
self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + + # empty difference: degenerate + result = self.index[:0] - self.index + expected = self.index[:0] self.assert_(result.equals(expected)) # names not the same From 203f4115fe3bc024f0019d9633c58443d21b80f7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 12:19:43 -0400 Subject: [PATCH 136/161] BUG: GroupBy.apply bug with differently indexed MultiIndex objects, test coverage --- RELEASE.rst | 2 ++ pandas/core/common.py | 4 ++-- pandas/core/groupby.py | 42 +++++++++++++++++++++--------------- pandas/tests/test_frame.py | 19 +++++++++++++++- pandas/tests/test_groupby.py | 25 +++++++++++++++++++++ pandas/tests/test_index.py | 3 +++ 6 files changed, 75 insertions(+), 20 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 4428553c948ad..95d226c9708ee 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -194,6 +194,8 @@ feedback on the library. - Can pass level name to `DataFrame.stack` - Support set operations between MultiIndex and Index - Fix many corner cases in MultiIndex set operations + - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not + indexed the same Thanks ------ diff --git a/pandas/core/common.py b/pandas/core/common.py index b800bf3aa7172..13b80c1d8b1ef 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -416,7 +416,7 @@ def __call__(self, num): sign = 1 - if dnum < 0: + if dnum < 0: # pragma: no cover sign = -1 dnum = -dnum @@ -439,7 +439,7 @@ def __call__(self, num): mant = sign*dnum/(10**pow10) - if self.precision is None: + if self.precision is None: # pragma: no cover format_str = u"%g%s" elif self.precision == 0: format_str = u"%i%s" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index eccdafeaf1fcf..99eaad9ca2400 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -412,8 +412,13 @@ def _python_apply_general(self, func, *args, **kwargs): not_indexed_same = False for key, group in self: group.name = key + + # group might be modified + group_axes = _get_axes(group) + res = func(group, *args, **kwargs) - if not _is_indexed_like(res, group): + + if not _is_indexed_like(res, group_axes): not_indexed_same = True result_keys.append(key) @@ -460,18 +465,19 @@ def groupby(obj, by, **kwds): return klass(obj, by, **kwds) groupby.__doc__ = GroupBy.__doc__ -def _is_indexed_like(obj, other): +def _get_axes(group): + if isinstance(group, Series): + return [group.index] + else: + return group.axes + +def _is_indexed_like(obj, axes): if isinstance(obj, Series): - if not isinstance(other, Series): + if len(axes) > 1: return False - return obj.index.equals(other.index) + return obj.index.equals(axes[0]) elif isinstance(obj, DataFrame): - if isinstance(other, Series): - return obj.index.equals(other.index) - - # deal with this when a case arises - assert(isinstance(other, DataFrame)) - return obj._indexed_same(other) + return obj.index.equals(axes[0]) return False @@ -1093,11 +1099,7 @@ def _concat_frames(frames, index, columns=None, axis=0): return result.reindex(index=index, columns=columns) def _concat_indexes(indexes): - if len(indexes) == 1: - new_index = indexes[0] - else: - new_index = indexes[0].append(indexes[1:]) - return new_index + return indexes[0].append(indexes[1:]) def _concat_frames_hierarchical(frames, keys, groupings, axis=0): if axis == 0: @@ -1135,8 +1137,14 @@ def _make_concat_multiindex(indexes, keys, groupings): to_concat.append(np.repeat(k, len(index))) label_list.append(np.concatenate(to_concat)) - # 
these go in the last level - label_list.append(np.concatenate(indexes)) + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + for level in range(concat_index.nlevels): + label_list.append(concat_index.get_level_values(level)) + else: + label_list.append(concat_index.values) return MultiIndex.from_arrays(label_list) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a763dc8cac458..39a55f29ac278 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11,6 +11,7 @@ from numpy.random import randn import numpy as np +import pandas.core.common as common import pandas.core.datetools as datetools from pandas.core.index import NULL_INDEX from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, @@ -1245,10 +1246,26 @@ def test_repr(self): index=np.arange(50)) foo = repr(unsortable) - import pandas.core.common as common common.set_printoptions(precision=3, column_space=10) repr(self.frame) + def test_eng_float_formatter(self): + self.frame.ix[5] = 0 + + common.set_eng_float_format() + + repr(self.frame) + + common.set_eng_float_format(use_eng_prefix=True) + + repr(self.frame) + + common.set_eng_float_format(precision=0) + + repr(self.frame) + + common.set_printoptions(precision=4) + def test_repr_tuples(self): buf = StringIO() diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 67ecbd6fd81da..f17d853df93fb 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -913,6 +913,31 @@ def f(group): assert_frame_equal(result, expected) + def test_apply_corner(self): + result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = self.tsframe * 2 + assert_frame_equal(result, expected) + + def test_transform_mixed_type(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + self.assert_(result['d'].dtype == np.float64) + + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) + class TestPanelGroupBy(unittest.TestCase): def setUp(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 5db73bdd35068..a4f11bbfddc05 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -944,16 +944,19 @@ def test_diff(self): result = self.index - self.index expected = self.index[:0] self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) # empty difference: superset result = self.index[-3:] - self.index expected = self.index[:0] self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) # empty difference: degenerate result = self.index[:0] - self.index expected = self.index[:0] self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) # names not the same chunklet = self.index[-3:] From f1ab24ce27862be70629a1c4424bda434aa7876b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 12:23:24 -0400 Subject: [PATCH 137/161] TST: Index.append with empty list, GH #283 --- pandas/tests/test_index.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index a4f11bbfddc05..edd477e38caf8 100644 --- 
a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -175,7 +175,7 @@ def test_add(self): secondCat = self.strIndex + self.strIndex self.assert_(tm.equalContents(np.append(self.strIndex, - self.dateIndex), firstCat)) + self.dateIndex), firstCat)) self.assert_(tm.equalContents(secondCat, self.strIndex)) tm.assert_contains_all(self.strIndex, firstCat.indexMap) tm.assert_contains_all(self.strIndex, secondCat.indexMap) @@ -184,6 +184,17 @@ def test_add(self): # this is valid too shifted = self.dateIndex + timedelta(1) + def test_append_multiple(self): + index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + + foos = [index[:2], index[2:4], index[4:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(index)) + + # empty + result = index.append([]) + self.assert_(result.equals(index)) + def test_add_string(self): # from bug report index = Index(['a', 'b', 'c']) @@ -602,6 +613,14 @@ def test_append(self): result = self.index[:3].append(self.index[3:]) self.assert_(result.equals(self.index)) + foos = [self.index[:1], self.index[1:3], self.index[3:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(self.index)) + + # empty + result = self.index.append([]) + self.assert_(result.equals(self.index)) + def test_get_level_values(self): result = self.index.get_level_values(0) expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux'] From e4b66dbd45526444208eedddc05da74f3562c5b7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 13:41:52 -0400 Subject: [PATCH 138/161] BUG: more proper handling of no rows / no columns in DataFrame.apply --- RELEASE.rst | 1 + pandas/core/frame.py | 4 ++-- pandas/tests/test_frame.py | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 95d226c9708ee..549dfdf742dc2 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -196,6 +196,7 @@ feedback on the library. 
- Fix many corner cases in MultiIndex set operations - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not indexed the same + - Fix corner case bugs in DataFrame.apply Thanks ------ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 834e1f03ac468..1ca66aa659523 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1922,7 +1922,7 @@ def apply(self, func, axis=0, broadcast=False): ------- applied : Series or DataFrame """ - if not len(self.columns): + if len(self.columns) == 0 and len(self.index) == 0: return self if isinstance(func, np.ufunc): @@ -2902,7 +2902,7 @@ def _write_to_buffer(self): to_write = [] if len(frame.columns) == 0 or len(frame.index) == 0: - to_write.append('Empty %s' % type(self).__name__) + to_write.append('Empty %s\n' % type(self.frame).__name__) to_write.append(repr(frame.index)) else: # may include levels names also diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 39a55f29ac278..417ad1641fb98 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2347,6 +2347,15 @@ def test_apply(self): applied = self.empty.apply(np.mean) self.assert_(not applied) + no_rows = self.frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=self.frame.columns) + assert_series_equal(result, expected) + + no_cols = self.frame.ix[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=self.frame.index) + assert_series_equal(result, expected) def test_apply_broadcast(self): broadcasted = self.frame.apply(np.mean, broadcast=True) From e5193c0a726b4497e3f7e3e58d67827319b0d9c4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 13:58:17 -0400 Subject: [PATCH 139/161] ENH: check for duplicate level names, GH #280 --- pandas/core/index.py | 5 +++++ pandas/tests/test_index.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/pandas/core/index.py b/pandas/core/index.py index 62a141378ec0e..0ce3f33bf9730 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -889,6 +889,11 @@ def __iter__(self): def _get_level_number(self, level): if not isinstance(level, int): + count = self.names.count(level) + if count > 1: + raise Exception('The name %s occurs multiple times, use a ' + 'level number' % level) + level = self.names.index(level) elif level < 0: level += self.nlevels diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index edd477e38caf8..f4c8764b8689f 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -601,6 +601,10 @@ def test_constructor_single_level(self): def test_constructor_no_levels(self): self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + def test_duplicate_names(self): + self.index.names = ['foo', 'foo'] + self.assertRaises(Exception, self.index._get_level_number, 'foo') + def test_from_arrays(self): arrays = [] for lev, lab in zip(self.index.levels, self.index.labels): From 837b7dc4d19464b168dff1f2758f74693ddd045e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 14:28:37 -0400 Subject: [PATCH 140/161] ENH: DataFrame.set_index function, and bugfix when setting DataFrame index, GH #266 --- RELEASE.rst | 5 ++- pandas/core/frame.py | 53 ++++++++++++++++++++++++++++++-- pandas/tests/test_frame.py | 62 +++++++++++++++++++++++++++++++++++--- 3 files changed, 112 insertions(+), 8 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 549dfdf742dc2..3afa141627fe2 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -131,6 +131,8 @@ 
feedback on the library. - Implemented `Panel.take` - Add `set_eng_float_format` function for setting alternate DataFrame floating point string formatting + - Add convenience `set_index` function for creating a DataFrame index from + its existing columns **Improvements to existing features** @@ -194,9 +196,10 @@ feedback on the library. - Can pass level name to `DataFrame.stack` - Support set operations between MultiIndex and Index - Fix many corner cases in MultiIndex set operations - - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not + - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not indexed the same - Fix corner case bugs in DataFrame.apply + - Setting DataFrame index did not cause Series cache to get cleared Thanks ------ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1ca66aa659523..dd6bc1d706fe2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -643,8 +643,13 @@ def _set_columns(self, value): self._series_cache.clear() columns = property(fset=_set_columns, fget=_get_columns) - # reference underlying BlockManager - index = AxisProperty(1) + def _get_index(self): + return self._data.axes[1] + + def _set_index(self, value): + self._data.set_axis(1, value) + self._series_cache.clear() + index = property(fset=_set_index, fget=_get_index) def as_matrix(self, columns=None): """ @@ -1099,6 +1104,50 @@ def reindex_like(self, other, method=None, copy=True): return self.reindex(index=other.index, columns=other.columns, method=method, copy=copy) + def set_index(self, col_or_cols, drop=True, inplace=False): + """ + Set the DataFrame index (row labels) using one or more existing + columns. By default yields a new object. + + Parameters + ---------- + col_or_cols : column label or list of column labels + drop : boolean, default True + Delete columns to be used as the new index + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + + Returns + ------- + dataframe : DataFrame + """ + cols = col_or_cols + if not isinstance(col_or_cols, (list, tuple)): + cols = [col_or_cols] + + if inplace: + frame = self + + else: + frame = self.copy() + + arrays = [] + for col in cols: + level = frame[col] + if drop: + del frame[col] + arrays.append(level) + + index = MultiIndex.from_arrays(arrays, names=cols) + + if not index._verify_integrity(): + duplicates = index._get_duplicates() + raise Exception('Index has duplicate keys: %s' % duplicates) + + frame.index = index + + return frame + def take(self, indices, axis=0): """ Analogous to ndarray.take, return DataFrame corresponding to requested diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 417ad1641fb98..f7a4c7fa892d7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2101,11 +2101,6 @@ def test_pivot_duplicates(self): data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo', 'foo'], 'b' : ['one', 'two', 'one', 'one', 'two'], 'c' : [1., 2., 3., 3., 4.]}) - # expected = DataFrame([[1., 2.], [3., 4.]], index=['bar', 'foo'], - # columns=['one', 'two']) - # result = data.pivot('a', 'b', 'c') - # assert_frame_equal(result, expected) - self.assertRaises(Exception, data.pivot, 'a', 'b', 'c') def test_reindex(self): @@ -2192,6 +2187,63 @@ def test_reindex_columns(self): newFrame = self.frame.reindex(columns=[]) self.assert_(not newFrame) + def test_add_index(self): + df = DataFrame({'A' : ['foo', 'foo', 'foo', 'bar', 'bar'], + 'B' : ['one', 'two', 'three', 'one', 'two'], + 'C' : ['a', 'b', 'c', 'd', 
'e'], + 'D' : np.random.randn(5), + 'E' : np.random.randn(5)}) + + # new object, single-column + result = df.set_index('C') + result_nodrop = df.set_index('C', drop=False) + + index = Index(df['C'], name='C') + + expected = df.ix[:, ['A', 'B', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.name, index.name) + + # inplace, single + df2 = df.copy() + df2.set_index('C', inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index('C', drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # create new object, multi-column + result = df.set_index(['A', 'B']) + result_nodrop = df.set_index(['A', 'B'], drop=False) + + index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) + + expected = df.ix[:, ['C', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.names, index.names) + + # inplace + df2 = df.copy() + df2.set_index(['A', 'B'], inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index(['A', 'B'], drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + def test_align(self): af, bf = self.frame.align(self.frame) From 21520bd46ad8d0e3bd4d8eeea39121f6d0a3bd38 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 14:34:00 -0400 Subject: [PATCH 141/161] TST: corner case in set_index --- pandas/tests/test_frame.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index f7a4c7fa892d7..ebc27fd83ce74 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2244,6 +2244,9 @@ def test_add_index(self): df3.set_index(['A', 'B'], drop=False, inplace=True) assert_frame_equal(df3, expected_nodrop) + # corner case + self.assertRaises(Exception, df.set_index, 'A') + def test_align(self): af, bf = self.frame.align(self.frame) From 2174131ba8262f7236e8322983f4ae8e098cb586 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 14:57:16 -0400 Subject: [PATCH 142/161] TST: 32-bit use 64-bit integer --- pandas/core/internals.py | 2 ++ pandas/io/tests/test_parsers.py | 13 +++++++++++++ setup.py | 2 -- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 27fc245bf0547..8bad5a77efff3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -199,6 +199,8 @@ def make_block(values, items, ref_items, do_integrity_check=False): if issubclass(vtype, np.floating): klass = FloatBlock elif issubclass(vtype, np.integer): + if vtype != np.int64: + values = values.astype('i8') klass = IntBlock elif dtype == np.bool_: klass = BoolBlock diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 11b47dfb45991..02e989515a17a 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -334,6 +334,15 @@ def test_multi_index_no_level_names(self): bar,one,12,13,14,15 bar,two,12,13,14,15 """ + + data2 = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') no_header = '\n'.join(lines[1:]) names = ['A', 'B', 'C', 'D'] @@ -341,6 +350,10 @@ def 
test_multi_index_no_level_names(self): expected = read_csv(StringIO(data), index_col=[0, 1]) assert_frame_equal(df, expected) + # 2 implicit first cols + df2 = read_csv(StringIO(data2)) + assert_frame_equal(df2, df) + class TestParseSQL(unittest.TestCase): def test_convert_sql_column_floats(self): diff --git a/setup.py b/setup.py index bf997d8c8ecac..538212defe8fb 100755 --- a/setup.py +++ b/setup.py @@ -285,8 +285,6 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): else: tseries_depends = None -print tseries_depends - tseries_ext = Extension('pandas._tseries', depends=tseries_depends, sources=[srcpath('tseries', suffix=suffix)], From 842790bd0c1da1788594daace3e77cb3b4a67808 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 14:58:09 -0400 Subject: [PATCH 143/161] TST: int64 fixes --- pandas/tests/test_internals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 4603a07d294e1..f29fbad9d790b 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -227,7 +227,7 @@ def test_set_change_dtype(self): self.assert_(mgr2.get('baz').dtype == np.object_) mgr2.set('quux', randn(N).astype(int)) - self.assert_(mgr2.get('quux').dtype == np.int_) + self.assert_(mgr2.get('quux').dtype == np.int64) mgr2.set('quux', randn(N)) self.assert_(mgr2.get('quux').dtype == np.float_) @@ -249,7 +249,7 @@ def test_as_matrix_int_bool(self): blocks = [get_int_ex(['a']), get_int_ex(['b'])] mgr = BlockManager.from_blocks(blocks, np.arange(index_sz)) - self.assert_(mgr.as_matrix().dtype == np.int_) + self.assert_(mgr.as_matrix().dtype == np.int64) def test_xs(self): pass From 0fa4f1e813b1660c38923cf19039cdde53a8decc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 15:03:25 -0400 Subject: [PATCH 144/161] ENH: -> int64 everywhere --- RELEASE.rst | 1 + pandas/core/frame.py | 8 ++++---- pandas/core/generic.py | 4 ++-- pandas/core/internals.py | 3 +-- pandas/core/series.py | 8 ++++---- pandas/io/tests/test_parsers.py | 4 ++-- pandas/tests/test_frame.py | 34 ++++++++++++++++----------------- pandas/tests/test_ndframe.py | 6 +++--- pandas/tests/test_series.py | 2 +- pandas/tests/test_sparse.py | 4 ++-- 10 files changed, 37 insertions(+), 37 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 3afa141627fe2..633f3bdd72cd7 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -200,6 +200,7 @@ feedback on the library. 
indexed the same - Fix corner case bugs in DataFrame.apply - Setting DataFrame index did not cause Series cache to get cleared + - Various int32 -> int64 platform-specific issues Thanks ------ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dd6bc1d706fe2..747d271b6d342 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2455,7 +2455,7 @@ def sum(self, axis=0, numeric_only=False, skipna=True): mask = np.isfinite(y) if skipna: - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): np.putmask(y, -mask, 0) the_sum = y.sum(axis) @@ -2484,7 +2484,7 @@ def min(self, axis=0, skipna=True): min : Series """ values = self.values.copy() - if skipna and not issubclass(values.dtype.type, np.int_): + if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), np.inf) return Series(values.min(axis), index=self._get_agg_axis(axis)) @@ -2505,7 +2505,7 @@ def max(self, axis=0, skipna=True): max : Series """ values = self.values.copy() - if skipna and not issubclass(values.dtype.type, np.int_): + if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), -np.inf) return Series(values.max(axis), index=self._get_agg_axis(axis)) @@ -2527,7 +2527,7 @@ def prod(self, axis=0, skipna=True): """ y = np.array(self.values, subok=True) if skipna: - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): y[np.isnan(y)] = 1 result = y.prod(axis) count = self.count(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f4ff2ab0936d5..90a3b1c2c20b2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -321,7 +321,7 @@ def cumsum(self, axis=None, skipna=True): axis = self._get_axis_number(axis) y = self.values.copy() - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): mask = np.isnan(self.values) if skipna: @@ -360,7 +360,7 @@ def cumprod(self, axis=None, skipna=True): axis = self._get_axis_number(axis) y = self.values.copy() - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): mask = np.isnan(self.values) if skipna: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8bad5a77efff3..cee8b51c635cc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -815,7 +815,6 @@ def block_id_vector(self): assert((result >= 0).all()) return result -_data_types = [np.float_, np.int_] def form_blocks(data, axes): # pre-filter out items if we passed it items = axes[0] @@ -847,7 +846,7 @@ def form_blocks(data, axes): blocks.append(float_block) if len(int_dict): - int_block = _simple_blockify(int_dict, items, np.int_) + int_block = _simple_blockify(int_dict, items, np.int64) blocks.append(int_block) if len(bool_dict): diff --git a/pandas/core/series.py b/pandas/core/series.py index e6648c677070b..2ef4e1ecd13f6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -709,7 +709,7 @@ def min(self, axis=None, out=None, skipna=True): """ arr = self.values.copy() if skipna: - if not issubclass(arr.dtype.type, np.int_): + if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), np.inf) return arr.min() @@ -728,7 +728,7 @@ def max(self, axis=None, out=None, skipna=True): """ arr = self.values.copy() if skipna: - if not issubclass(arr.dtype.type, np.int_): + if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), -np.inf) return arr.max() @@ -822,7 +822,7 @@ def cumsum(self, 
axis=0, dtype=None, out=None, skipna=True): """ arr = self.values.copy() - do_mask = skipna and not issubclass(self.dtype.type, np.int_) + do_mask = skipna and not issubclass(self.dtype.type, np.integer) if do_mask: mask = isnull(arr) np.putmask(arr, mask, 0.) @@ -851,7 +851,7 @@ def cumprod(self, axis=0, dtype=None, out=None, skipna=True): """ arr = self.values.copy() - do_mask = skipna and not issubclass(self.dtype.type, np.int_) + do_mask = skipna and not issubclass(self.dtype.type, np.integer) if do_mask: mask = isnull(arr) np.putmask(arr, mask, 1.) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 02e989515a17a..b5496bbcb774c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -221,8 +221,8 @@ def test_int_conversion(self): 3.0,3 """ data = read_csv(StringIO(data)) - self.assert_(data['A'].dtype == np.float_) - self.assert_(data['B'].dtype == np.int_) + self.assert_(data['A'].dtype == np.float64) + self.assert_(data['B'].dtype == np.int64) def test_infer_index_col(self): data = """A,B,C diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index ebc27fd83ce74..4b55a426024b3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -198,7 +198,7 @@ def test_setitem_corner(self): self.assertEqual(dm.values.dtype, np.object_) dm['C'] = 1 - self.assertEqual(dm['C'].dtype, np.int_) + self.assertEqual(dm['C'].dtype, np.int64) # set existing column dm['A'] = 'bar' @@ -371,7 +371,7 @@ def test_setitem_fancy_2d(self): def test_fancy_getitem_slice_mixed(self): sliced = self.mixed_frame.ix[:, -3:] - self.assert_(sliced['D'].dtype == np.float_) + self.assert_(sliced['D'].dtype == np.float64) # get view with single block sliced = self.frame.ix[:, -3:] @@ -935,13 +935,13 @@ def test_constructor_dict_cast(self): } frame = DataFrame(test_data, dtype=float) self.assertEqual(len(frame), 3) - self.assert_(frame['B'].dtype == np.float_) - self.assert_(frame['A'].dtype == np.float_) + self.assert_(frame['B'].dtype == np.float64) + self.assert_(frame['A'].dtype == np.float64) frame = DataFrame(test_data) self.assertEqual(len(frame), 3) self.assert_(frame['B'].dtype == np.object_) - self.assert_(frame['A'].dtype == np.float_) + self.assert_(frame['A'].dtype == np.float64) # can't cast to float test_data = { @@ -951,7 +951,7 @@ def test_constructor_dict_cast(self): frame = DataFrame(test_data, dtype=float) self.assertEqual(len(frame), 20) self.assert_(frame['A'].dtype == np.object_) - self.assert_(frame['B'].dtype == np.float_) + self.assert_(frame['B'].dtype == np.float64) def test_constructor_dict_dont_upcast(self): d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} @@ -973,7 +973,7 @@ def test_constructor_ndarray(self): # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=int) - self.assert_(frame.values.dtype == np.int_) + self.assert_(frame.values.dtype == np.int64) # 1-D input frame = DataFrame(np.zeros(3), columns=['A'], index=[1, 2, 3]) @@ -1022,16 +1022,16 @@ def test_constructor_corner(self): # does not error but ends up float df = DataFrame(index=range(10), columns=['a','b'], dtype=int) - self.assert_(df.values.dtype == np.float_) + self.assert_(df.values.dtype == np.float64) def test_constructor_scalar_inference(self): data = {'int' : 1, 'bool' : True, 'float' : 3., 'object' : 'foo'} df = DataFrame(data, index=np.arange(10)) - self.assert_(df['int'].dtype == np.int_) + self.assert_(df['int'].dtype == np.int64) self.assert_(df['bool'].dtype == np.bool_) - 
self.assert_(df['float'].dtype == np.float_) + self.assert_(df['float'].dtype == np.float64) self.assert_(df['object'].dtype == np.object_) def test_constructor_DataFrame(self): @@ -1039,7 +1039,7 @@ def test_constructor_DataFrame(self): assert_frame_equal(df, self.frame) df_casted = DataFrame(self.frame, dtype=int) - self.assert_(df_casted.values.dtype == np.int_) + self.assert_(df_casted.values.dtype == np.int64) def test_constructor_more(self): # used to be in test_matrix.py @@ -1079,7 +1079,7 @@ def test_constructor_more(self): index=np.arange(10)) self.assertEqual(len(dm.columns), 2) - self.assert_(dm.values.dtype == np.float_) + self.assert_(dm.values.dtype == np.float64) def test_constructor_ragged(self): data = {'A' : randn(10), @@ -2162,13 +2162,13 @@ def test_reindex(self): def test_reindex_int(self): smaller = self.intframe.reindex(self.intframe.index[::2]) - self.assert_(smaller['A'].dtype == np.int_) + self.assert_(smaller['A'].dtype == np.int64) bigger = smaller.reindex(self.intframe.index) - self.assert_(bigger['A'].dtype == np.float_) + self.assert_(bigger['A'].dtype == np.float64) smaller = self.intframe.reindex(columns=['A', 'B']) - self.assert_(smaller['A'].dtype == np.int_) + self.assert_(smaller['A'].dtype == np.int64) def test_reindex_like(self): other = self.frame.reindex(index=self.frame.index[:10], @@ -2966,7 +2966,7 @@ def test_reindex_corner(self): # ints are weird smaller = self.intframe.reindex(columns=['A', 'B', 'E']) - self.assert_(smaller['E'].dtype == np.float_) + self.assert_(smaller['E'].dtype == np.float64) def test_rename_objects(self): renamed = self.mixed_frame.rename(columns=str.upper) @@ -3306,7 +3306,7 @@ def test_join_index_mixed(self): df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) - self.assert_(df1['B'].dtype == np.int_) + self.assert_(df1['B'].dtype == np.int64) self.assert_(df1['D'].dtype == np.bool_) df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, diff --git a/pandas/tests/test_ndframe.py b/pandas/tests/test_ndframe.py index e09d78b177a23..70a5d79d2c428 100644 --- a/pandas/tests/test_ndframe.py +++ b/pandas/tests/test_ndframe.py @@ -13,15 +13,15 @@ def setUp(self): def test_constructor(self): # with cast - ndf = NDFrame(self.ndf._data, dtype=int) - self.assert_(ndf.values.dtype == np.int_) + ndf = NDFrame(self.ndf._data, dtype=np.int64) + self.assert_(ndf.values.dtype == np.int64) def test_ndim(self): self.assertEquals(self.ndf.ndim, 2) def test_astype(self): casted = self.ndf.astype(int) - self.assert_(casted.values.dtype == np.int_) + self.assert_(casted.values.dtype == np.int64) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 07ada8ae1fb37..b555cae378fde 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1137,7 +1137,7 @@ def test_reindex_int(self): # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - self.assert_(reindexed_int.dtype == np.int_) + self.assert_(reindexed_int.dtype == np.int64) def test_reindex_bool(self): diff --git a/pandas/tests/test_sparse.py b/pandas/tests/test_sparse.py index 3ac04eb4dc385..a361dc70ef39f 100644 --- a/pandas/tests/test_sparse.py +++ b/pandas/tests/test_sparse.py @@ -250,7 +250,7 @@ def test_copy_astype(self): self.assert_((self.bseries.sp_values[:5] == 5).all()) def test_astype(self): - self.assertRaises(Exception, self.bseries.astype, np.int_) + self.assertRaises(Exception, self.bseries.astype, np.int64) def 
test_kind(self): self.assertEquals(self.bseries.kind, 'block') @@ -958,7 +958,7 @@ def test_applymap(self): self.assert_(isinstance(result, SparseDataFrame)) def test_astype(self): - self.assertRaises(Exception, self.frame.astype, np.int_) + self.assertRaises(Exception, self.frame.astype, np.int64) def test_fillna(self): self.assertRaises(NotImplementedError, self.frame.fillna, 0) From 360a99fe0ca65f672fbf8b7982d9428465b142b9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 15:16:54 -0400 Subject: [PATCH 145/161] TST: more 32-bit integer fussiness --- pandas/tests/test_groupby.py | 3 ++- pandas/tests/test_series.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index f17d853df93fb..d6041b8e80998 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -743,7 +743,8 @@ def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) result = grouped.apply(len) expected = grouped.count()['C'] - assert_series_equal(result, expected) + self.assert_(result.index.equals(expected.index)) + self.assert_(np.array_equal(result.values, expected.values)) def test_apply_transform(self): grouped = self.ts.groupby(lambda x: x.month) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index b555cae378fde..07ada8ae1fb37 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1137,7 +1137,7 @@ def test_reindex_int(self): # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - self.assert_(reindexed_int.dtype == np.int64) + self.assert_(reindexed_int.dtype == np.int_) def test_reindex_bool(self): From 172d66dba983aae83b809de5d81e8b6855774e61 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 16:41:45 -0400 Subject: [PATCH 146/161] TST: tuples and strings aren't comparable in python 3 --- pandas/core/index.py | 8 -------- pandas/tests/test_index.py | 29 ++++++++++++++++------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 0ce3f33bf9730..bcf37e2d0ce76 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1498,9 +1498,6 @@ def union(self, other): ------- Index """ - if not isinstance(other, MultiIndex): - return other.union(self) - self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other): @@ -1508,7 +1505,6 @@ def union(self, other): result_names = self.names if self.names == other.names else None - # TODO: optimize / make less wasteful self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() @@ -1528,9 +1524,6 @@ def intersection(self, other): ------- Index """ - if not isinstance(other, MultiIndex): - return other.intersection(self) - self._assert_can_do_setop(other) if self.equals(other): @@ -1538,7 +1531,6 @@ def intersection(self, other): result_names = self.names if self.names == other.names else None - # TODO: optimize / make less wasteful self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f4c8764b8689f..38365ddfaa1b4 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -916,19 +916,21 @@ def test_union(self): the_union = self.index.union(self.index[:0]) self.assert_(the_union is self.index) - tuples = self.index.get_tuple_index() - result = self.index[:4] | tuples[4:] - 
self.assert_(result.equals(tuples)) + # won't work in python 3 + # tuples = self.index.get_tuple_index() + # result = self.index[:4] | tuples[4:] + # self.assert_(result.equals(tuples)) - def test_union_with_regular_index(self): - other = Index(['A', 'B', 'C']) + # not valid for python 3 + # def test_union_with_regular_index(self): + # other = Index(['A', 'B', 'C']) - result = other.union(self.index) - self.assert_(('foo', 'one') in result) - self.assert_('B' in result) + # result = other.union(self.index) + # self.assert_(('foo', 'one') in result) + # self.assert_('B' in result) - result2 = self.index.union(other) - self.assert_(result.equals(result2)) + # result2 = self.index.union(other) + # self.assert_(result.equals(result2)) def test_intersection(self): piece1 = self.index[:5][::-1] @@ -948,9 +950,10 @@ def test_intersection(self): expected = self.index[:0] self.assert_(empty.equals(expected)) - tuples = self.index.get_tuple_index() - result = self.index & tuples - self.assert_(result.equals(tuples)) + # can't do in python 3 + # tuples = self.index.get_tuple_index() + # result = self.index & tuples + # self.assert_(result.equals(tuples)) def test_diff(self): first = self.index From 57c8a058809accaacbc0808b0f2a802ffdc20c0e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 21:08:56 -0400 Subject: [PATCH 147/161] BUG: workaround not being able to use cast=True with boolean dtype in Python 2.5 --- pandas/src/generate_code.py | 4 ++-- pandas/src/generated.pyx | 20 ++++++++++---------- pandas/src/parsing.pyx | 12 ++++++------ pandas/src/reindex.pyx | 6 +++--- pandas/src/tseries.pyx | 4 ++-- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 28bc8f2caa985..30624bdff52a6 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -312,13 +312,13 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): @cython.boundscheck(False) def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 9d191e0a563b3..f851d9c1ddd8f 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -938,13 +938,13 @@ def is_monotonic_bool(ndarray[uint8_t] arr): @cython.boundscheck(False) def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -964,13 +964,13 @@ def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_object(ndarray[object] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -990,13 +990,13 @@ def groupby_object(ndarray[object] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_int32(ndarray[int32_t] index, 
ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -1016,13 +1016,13 @@ def groupby_int32(ndarray[int32_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -1042,13 +1042,13 @@ def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_bool(ndarray[uint8_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[uint8_t, cast=True] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 486c2550c40c3..cfc81b1a30b23 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -99,7 +99,7 @@ def convert_sql_column(ndarray[object] objects): Py_ssize_t i, n ndarray[float64_t] floats ndarray[int64_t] ints - ndarray[uint8_t, cast=True] bools + ndarray[uint8_t] bools bint seen_float = 0 bint seen_int = 0 bint seen_bool = 0 @@ -111,7 +111,7 @@ def convert_sql_column(ndarray[object] objects): floats = np.empty(n, dtype='f8') ints = np.empty(n, dtype='i8') - bools = np.empty(n, dtype=bool) + bools = np.empty(n, dtype=np.uint8) onan = np.nan fnan = np.nan @@ -153,7 +153,7 @@ def convert_sql_column(ndarray[object] objects): elif seen_float: return floats elif seen_bool: - return bools + return bools.view(np.bool_) else: return objects @@ -206,11 +206,11 @@ def sanitize_objects(ndarray[object] values): def maybe_convert_bool(ndarray[object] arr): cdef: Py_ssize_t i, n - ndarray[uint8_t, cast=True] result + ndarray[uint8_t] result object val n = len(arr) - result = np.empty(n, dtype=bool) + result = np.empty(n, dtype=np.uint8) for i from 0 <= i < n: val = arr[i] @@ -222,4 +222,4 @@ def maybe_convert_bool(ndarray[object] arr): else: return arr - return result + return result.view(np.bool_) diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx index 447e04d059cc9..4ca19f77f5c50 100644 --- a/pandas/src/reindex.pyx +++ b/pandas/src/reindex.pyx @@ -91,7 +91,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): cdef: Py_ssize_t i, j, k, n ndarray[int32_t] indexer - ndarray[uint8_t, cast=True] mask + ndarray[uint8_t] mask object val i = 0 @@ -100,7 +100,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): k = len(right) indexer = np.zeros(n, dtype=np.int32) - mask = np.ones(n, dtype=np.bool) + mask = np.ones(n, dtype=np.uint8) for i from 0 <= i < n: val = left[i] @@ -115,7 +115,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): indexer[i] = j mask[i] = 0 - return indexer, mask + return indexer, mask.view(np.bool_) @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index 23a9278e6c265..f5029ddc87249 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -232,7 +232,7 @@ cpdef 
checknull(object val): def isnullobj(ndarray[object] arr): cdef Py_ssize_t i, n cdef object val - cdef ndarray[uint8_t, cast=True] result + cdef ndarray[uint8_t] result n = len(arr) result = np.zeros(n, dtype=bool) @@ -240,7 +240,7 @@ def isnullobj(ndarray[object] arr): val = arr[i] if _checknull(val): result[i] = 1 - return result + return result.view(np.bool_) def list_to_object_array(list obj): ''' From 84477de90ec7b8ae5f3a4b87c0c649d9f2e76776 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 21:11:17 -0400 Subject: [PATCH 148/161] BUG: missed one --- pandas/src/tseries.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index f5029ddc87249..3caab8eb413bc 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -235,7 +235,7 @@ def isnullobj(ndarray[object] arr): cdef ndarray[uint8_t] result n = len(arr) - result = np.zeros(n, dtype=bool) + result = np.zeros(n, dtype=np.uint8) for i from 0 <= i < n: val = arr[i] if _checknull(val): From ef6a7b372497f147f078fd459f8763538a93270e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 21:31:15 -0400 Subject: [PATCH 149/161] BUG: don't be too aggressive with int conversion parsing MultiIndex, GH #285 --- pandas/io/parsers.py | 7 +++++-- pandas/io/tests/test_parsers.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8ac9ed5697e16..d685724398200 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -349,8 +349,11 @@ def get_chunk(self, rows=None): index = Index(_convert_types(index, self.na_values), name=self.index_name) else: - arrays = _maybe_convert_int_mindex(index, self.parse_dates, - self.date_parser) + arrays = [] + for arr in index: + if self.parse_dates: + arr = lib.try_parse_dates(arr, parser=self.date_parser) + arrays.append(_convert_types(arr, self.na_values)) index = MultiIndex.from_arrays(arrays, names=self.index_name) else: index = Index(np.arange(len(content))) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index b5496bbcb774c..1ae0876512ab9 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -354,6 +354,21 @@ def test_multi_index_no_level_names(self): df2 = read_csv(StringIO(data2)) assert_frame_equal(df2, df) + def test_multi_index_parse_dates(self): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + df = read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) + self.assert_(isinstance(df.index.levels[0][0], datetime)) + class TestParseSQL(unittest.TestCase): def test_convert_sql_column_floats(self): From bb52401e544d2dbb6912810b0d4d778d586777c3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 21:34:24 -0400 Subject: [PATCH 150/161] TST: fix test case broken by last change --- RELEASE.rst | 2 ++ pandas/tests/test_frame.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 633f3bdd72cd7..4e3f0e3c9c554 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -201,6 +201,8 @@ feedback on the library. 
- Fix corner case bugs in DataFrame.apply - Setting DataFrame index did not cause Series cache to get cleared - Various int32 -> int64 platform-specific issues + - Don't be too aggressive converting to integer when parsing file with + MultiIndex (GH #285) Thanks ------ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4b55a426024b3..5d34d96ea2e6f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1619,7 +1619,7 @@ def test_to_csv_multiindex(self): # round trip frame.to_csv(path) - df = DataFrame.from_csv(path, index_col=[0,1]) + df = DataFrame.from_csv(path, index_col=[0,1], parse_dates=False) assert_frame_equal(frame, df) self.assertEqual(frame.index.names, df.index.names) From 42f36298ecd51a91f3feeeeed4bc80307f6eb8e8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 22:03:44 -0400 Subject: [PATCH 151/161] BUG: handle negative indices extending before beginning of Series --- pandas/core/series.py | 9 +++++++-- pandas/tests/test_series.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2ef4e1ecd13f6..9015b5ee7d497 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -257,6 +257,7 @@ def __getitem__(self, key): if isinstance(self.index, MultiIndex): return self._multilevel_index(key) else: + hash(key) values = self.values try: return values[self.index.get_loc(key)] @@ -312,8 +313,12 @@ def _multilevel_index(self, key): _get_val_at = ndarray.__getitem__ def __getslice__(self, i, j): - return self._constructor(self.values[i:j], index=self.index[i:j], - name=self.name) + if i < 0: + i -= len(self) + if j < 0: + j -= len(self) + slobj = slice(i, j) + return self.__getitem__(slobj) def __setitem__(self, key, value): values = self.values diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 07ada8ae1fb37..6c909f6d40ab3 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -249,6 +249,17 @@ def test_getitem_regression(self): result = s[range(5)] assert_series_equal(result, s) + def test_getitem_slice_bug(self): + s = Series(range(10), range(10)) + result = s[-12:] + assert_series_equal(result, s) + + result = s[-7:] + assert_series_equal(result, s[3:]) + + result = s[:-12] + assert_series_equal(result, s[:0]) + def test_getitem_int64(self): idx = np.int64(5) self.assertEqual(self.ts[idx], self.ts[5]) From c7e3cdbb66ad73518b93e63852971cdee96b59ab Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 22:20:10 -0400 Subject: [PATCH 152/161] BLD: docstring fixes to suppress 2to3 warnings --- RELEASE.rst | 1 + pandas/core/common.py | 10 +++++----- pandas/core/frame.py | 4 ++-- pandas/core/groupby.py | 2 +- pandas/tools/pivot.py | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 4e3f0e3c9c554..8492aa873db0b 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -203,6 +203,7 @@ feedback on the library. 
- Various int32 -> int64 platform-specific issues - Don't be too aggressive converting to integer when parsing file with MultiIndex (GH #285) + - Fix bug when slicing Series with negative indices before beginning Thanks ------ diff --git a/pandas/core/common.py b/pandas/core/common.py index 13b80c1d8b1ef..329648855b362 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -394,15 +394,15 @@ def __call__(self, num): """ Formats a number in engineering notation, appending a letter representing the power of 1000 of the original number. Some examples: - >>> format_eng(0) for self.precision = 0 + >>> format_eng(0) # for self.precision = 0 '0' - >>> format_eng(1000000) for self.precision = 1, - self.use_eng_prefix = True + >>> format_eng(1000000) # for self.precision = 1, + # self.use_eng_prefix = True '1.0M' - >>> format_eng("-1e-6") for self.precision = 2 - self.use_eng_prefix = False + >>> format_eng("-1e-6") # for self.precision = 2 + # self.use_eng_prefix = False '-1.00E-06' @param num: the value to represent diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 747d271b6d342..062b18e20a4a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -133,7 +133,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, >>> df = DataFrame(data=d, index=index) >>> df2 = DataFrame(np.random.randn(10, 5)) >>> df3 = DataFrame(np.random.randn(10, 5), - columns=['a', 'b', 'c', 'd', 'e']) + ... columns=['a', 'b', 'c', 'd', 'e']) """ if data is None: @@ -1959,7 +1959,7 @@ def apply(self, func, axis=0, broadcast=False): Examples -------- - >>> df.apply(numpy.sqrt) --> DataFrame + >>> df.apply(numpy.sqrt) # returns DataFrame >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 99eaad9ca2400..78d2c9f4ecbfb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -669,7 +669,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs): q 3.5 0.5 7 >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), - 'total' : np.sum}) + ... 'total' : np.sum}) result total b 2.121 3 q 4.95 7 diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 070fddb8f9e15..98040596925e9 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -35,7 +35,7 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean, 8 bar two large 7 >>> table = pivot_table(df, values='D', rows=['A, 'B'], - cols=['C'], aggfunc=np.sum) + ... cols=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 From 2ce209a43fd593f1acbe0528dca169e6e9cf7f37 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 22:22:05 -0400 Subject: [PATCH 153/161] BLD: another 2to3 fix --- pandas/tools/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index 98040596925e9..9090e8c09eed4 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -34,7 +34,7 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean, 7 bar two small 6 8 bar two large 7 - >>> table = pivot_table(df, values='D', rows=['A, 'B'], + >>> table = pivot_table(df, values='D', rows=['A', 'B'], ... 
cols=['C'], aggfunc=np.sum) >>> table small large From 8f79f7c6e01874bbf7f3b9bc38af01551c90121e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 22:32:12 -0400 Subject: [PATCH 154/161] RLS: Version 0.5.0 --- RELEASE.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8492aa873db0b..9b8e0515ae4b6 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -25,7 +25,7 @@ Where to get it pandas 0.5.0 ============ -**Release date:** not yet released +**Release date:** 10/24/2011 This release of pandas includes a number of API changes (see below) and cleanup of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, new diff --git a/setup.py b/setup.py index 538212defe8fb..8d6edd8a00e50 100755 --- a/setup.py +++ b/setup.py @@ -130,7 +130,7 @@ MAJOR = 0 MINOR = 5 MICRO = 0 -ISRELEASED = False +ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) FULLVERSION = VERSION From 21dbda46793e4c0853a16fbb43556481cfc7a9ce Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 25 Oct 2011 21:11:58 -0400 Subject: [PATCH 155/161] BUG: fix DataFrame.to_csv bug described in GH #290 --- pandas/core/frame.py | 9 +++++---- pandas/tests/test_frame.py | 13 +++++++++++++ setup.py | 4 ++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 062b18e20a4a3..ea704b2766b00 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -521,10 +521,11 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True, name = 'level_%d' % i index_label.append(name) else: - if self.index.name is None: - index_label = self.index.name - if index_label is None: - index_label = ['index'] + index_label = self.index.name + if index_label is None: + index_label = ['index'] + else: + index_label = [index_label] elif not isinstance(index_label, (list, tuple, np.ndarray)): # given a string for a DF with Index index_label = [index_label] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5d34d96ea2e6f..6eafe4661634d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1676,6 +1676,19 @@ def test_to_csv_withcommas(self): os.remove(path) + def test_to_csv_bug(self): + from pandas import read_csv + path = '__tmp__.csv' + f1 = StringIO('a,1.0\nb,2.0') + df = DataFrame.from_csv(f1,header=None) + newdf = DataFrame({'t': df[df.columns[0]]}) + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + assert_frame_equal(recons, newdf) + + os.remove(path) + def test_info(self): io = StringIO() self.frame.info(buf=io) diff --git a/setup.py b/setup.py index 8d6edd8a00e50..e28cfbd026ed6 100755 --- a/setup.py +++ b/setup.py @@ -129,8 +129,8 @@ MAJOR = 0 MINOR = 5 -MICRO = 0 -ISRELEASED = True +MICRO = 1 +ISRELEASED = False VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) FULLVERSION = VERSION From 23f68e3db89bf57e217533be8fccc31c8d562eb4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 25 Oct 2011 21:13:35 -0400 Subject: [PATCH 156/161] DOC: update release note --- RELEASE.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 9b8e0515ae4b6..92570b70c48c1 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,6 +22,16 @@ Where to get it * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.sourceforge.net +pandas 0.5.1 +============ + +**Release date:** Not yet released + +**Bug fixes** + + - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index + 
name (GH #290) + pandas 0.5.0 ============ From c99f78a0dde03d6fc76fb1f332b31d29f14896d7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Oct 2011 16:01:33 -0400 Subject: [PATCH 157/161] ENH: cython count_level function, cleanup and tests --- pandas/core/index.py | 2 ++ pandas/core/series.py | 33 +++++++-------------------------- pandas/src/groupby.pyx | 17 +++++++++++++++++ pandas/tests/test_multilevel.py | 18 ++++++++++++++++++ 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index bcf37e2d0ce76..e70d8a36b55db 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1217,6 +1217,8 @@ def sortlevel(self, level=0, ascending=True): ------- sorted_index : MultiIndex """ + # TODO: check if lexsorted when level=0 + labels = list(self.labels) level = self._get_level_number(level) primary = labels.pop(level) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9015b5ee7d497..295fc126e7618 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -573,36 +573,17 @@ def count(self, level=None): def _count_level(self, level): # TODO: GENERALIZE CODE OVERLAP WITH DATAFRAME - # TODO: deal with sortedness?? - obj = self.sortlevel(level) - mask = notnull(obj.values) - - level_index = obj.index.levels[level] + mask = notnull(self.values) + level_index = self.index.levels[level] if len(self) == 0: return Series(0, index=level_index) - n = len(level_index) - locs = obj.index.labels[level].searchsorted(np.arange(n)) - - # WORKAROUND: reduceat fusses about the endpoints. should file ticket? - start = locs.searchsorted(0, side='right') - 1 - end = locs.searchsorted(len(mask), side='left') - - result = np.zeros((n), dtype=int) - out = result[start:end] - np.add.reduceat(mask, locs[start:end], out=out) - - # WORKAROUND: to see why, try this - # arr = np.ones((10, 4), dtype=bool) - # np.add.reduceat(arr, [0, 3, 3, 7, 9], axis=0) - - # this stinks - if len(locs) > 1: - workaround_mask = locs[:-1] == locs[1:] - result[:-1][workaround_mask] = 0 - - return Series(result, index=level_index) + # call cython function + max_bin = len(level_index) + counts = lib.count_level_1d(mask.view(np.uint8), + self.index.labels[level], max_bin) + return Series(counts, index=level_index) def value_counts(self): """ diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index 7f56e11c37fc0..b89a18e0f8c42 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -453,6 +453,23 @@ def _bucket_locs(index, buckets, inclusive=False): return locs +def count_level_1d(ndarray[uint8_t, cast=True] mask, + ndarray[int32_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, n + ndarray[int64_t] counts + + counts = np.zeros(max_bin, dtype='i8') + + n = len(mask) + + for i from 0 <= i < n: + if mask[i]: + counts[labels[i]] += 1 + + return counts + + ''' def ts_upsample_mean(ndarray[object] indices, diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1587aa205c6d3..69ad9f6996b65 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -281,6 +281,24 @@ def _check_counts(frame, axis=0): df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) + def test_count_level_series(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz'], + ['one', 'two', 'three', 'four']], + labels=[[0, 0, 0, 2, 2], + [2, 0, 1, 1, 2]]) + + s = Series(np.random.randn(len(index)), index=index) + + result = s.count(level=0) + expected = s.groupby(level=0).count() + 
assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + result = s.count(level=1) + expected = s.groupby(level=1).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + def test_count_level_corner(self): s = self.frame['A'][:0] result = s.count(level=0) From 2758eea1a1cb632f9078e3f6dcfd48d816a30fe3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Oct 2011 11:00:20 -0400 Subject: [PATCH 158/161] DOC: fix exceptions in docs --- doc/source/stats.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/stats.rst b/doc/source/stats.rst index 037bd9734ca3c..8ed0de04f740f 100755 --- a/doc/source/stats.rst +++ b/doc/source/stats.rst @@ -200,7 +200,7 @@ Let's pull in some sample data: data = dict((sym, DataReader(sym, "yahoo")) for sym in symbols) panel = Panel(data).swapaxes('items', 'minor') - close_px = panel['close'] + close_px = panel['Close'] # convert closing prices to returns rets = close_px / close_px.shift(1) - 1 @@ -289,7 +289,7 @@ actually quite easy: .. ipython:: python # make the units somewhat comparable - volume = panel['volume'] / 1e8 + volume = panel['Volume'] / 1e8 model = ols(y=volume, x={'return' : np.abs(rets)}) model From 0bd296acaf8e7bb1b831c109af84663da8775265 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Oct 2011 11:11:00 -0400 Subject: [PATCH 159/161] BUG: clear Series caches on consolidation, address GH #304 --- pandas/core/frame.py | 15 +++++++++++---- pandas/tests/test_frame.py | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea704b2766b00..d341d038fb901 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -641,7 +641,7 @@ def _get_columns(self): def _set_columns(self, value): self._data.set_axis(0, value) - self._series_cache.clear() + self._clear_caches() columns = property(fset=_set_columns, fget=_get_columns) def _get_index(self): @@ -649,9 +649,16 @@ def _get_index(self): def _set_index(self, value): self._data.set_axis(1, value) - self._series_cache.clear() + self._clear_caches() index = property(fset=_set_index, fget=_get_index) + def _clear_caches(self): + self._series_cache.clear() + + def _consolidate_inplace(self): + self._clear_caches() + NDFrame._consolidate_inplace(self) + def as_matrix(self, columns=None): """ Convert the frame to its Numpy-array matrix representation. 
Columns @@ -1479,11 +1486,11 @@ def rename(self, index=None, columns=None, copy=True): def _rename_index_inplace(self, mapper): self._data = self._data.rename_axis(mapper, axis=1) - self._series_cache.clear() + self._clear_caches() def _rename_columns_inplace(self, mapper): self._data = self._data.rename_items(mapper, copydata=False) - self._series_cache.clear() + self._clear_caches() #---------------------------------------------------------------------- # Arithmetic / combination related diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6eafe4661634d..dc5bca73e5749 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -238,6 +238,21 @@ def test_setitem_ambig(self): # self.assert_(dm.objects is not None) self.assert_(dm[2].dtype == np.object_) + def test_setitem_clear_caches(self): + # GH #304 + df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, + index=[0,1,2,3]) + df.insert(2, 'z', np.nan) + + # cache it + foo = df['z'] + + df.ix[2:, 'z'] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index) + self.assert_(df['z'] is not foo) + assert_series_equal(df['z'], expected) + def test_delitem_corner(self): f = self.frame.copy() del f['D'] From e34c8a9d45fe8a3b6d459ad205d76dc3f6b12ccf Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Oct 2011 11:12:25 -0400 Subject: [PATCH 160/161] DOC: release notes re: GH #304 --- RELEASE.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 92570b70c48c1..3876c572ae351 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -31,6 +31,13 @@ pandas 0.5.1 - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index name (GH #290) + - DataFrame should clear its Series caches on consolidation, was causing + "stale" Series to be returned in some corner cases (GH #304) + +Thanks +------ + +- Kieran O'Mahony pandas 0.5.0 ============ From cf32be202dff38f92f10c792cbcae5c2f2ebaaf1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 30 Oct 2011 20:59:51 -0400 Subject: [PATCH 161/161] ENH: add melt function, speed up DataFrame.apply --- RELEASE.rst | 8 +++++++ pandas/core/frame.py | 22 +++++++++++-------- pandas/core/reshape.py | 49 ++++++++++++++++++++++++++++++++++++++++++ pandas/core/sparse.py | 4 ++-- pandas/rpy/common.py | 10 +++++++++ 5 files changed, 82 insertions(+), 11 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 3876c572ae351..e65ad6fef3455 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -27,6 +27,14 @@ pandas 0.5.1 **Release date:** Not yet released +**New features / modules** + + - Add `melt` function to `pandas.core.reshape` + +**Improvements to existing features** + + - Sped up `DataFrame.apply` performance in most cases + **Bug fixes** - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d341d038fb901..2c18da8e56428 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12,6 +12,7 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0212,W0231,W0703,W0622 +from itertools import izip from StringIO import StringIO import csv import operator @@ -1994,26 +1995,29 @@ def apply(self, func, axis=0, broadcast=False): def _apply_standard(self, func, axis): if axis == 0: - target = self - agg_index = self.columns + series_gen = ((c, self[c]) for c in self.columns) + res_index = self.columns + res_columns = self.index elif axis == 1: - target = self.T - agg_index = self.index + res_index = self.index + res_columns = self.columns + 
series_gen = ((i, Series(v, self.columns))
+                          for i, v in izip(self.index, self.values))
 
         results = {}
-        for k in target.columns:
-            results[k] = func(target[k])
+        for k, v in series_gen:
+            results[k] = func(v)
 
         if hasattr(results.values()[0], '__iter__'):
-            result = self._constructor(data=results, index=target.index,
-                                       columns=target.columns)
+            result = self._constructor(data=results, index=res_columns,
+                                       columns=res_index)
 
             if axis == 1:
                 result = result.T
 
             return result
         else:
-            return Series(results, index=agg_index)
+            return Series(results, index=res_index)
 
     def _apply_broadcast(self, func, axis):
         if axis == 0:
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 533deef603a6d..4e13737a76f60 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -391,3 +391,52 @@ def _stack_multi_columns(frame, level=-1, dropna=True):
 
     return result
 
+
+def melt(frame, id_vars=None, value_vars=None):
+    """
+    "Unpivots" a DataFrame from wide format to long format, optionally leaving
+    id variables set
+
+    Parameters
+    ----------
+    frame : DataFrame
+    id_vars : column label or list of column labels to use as id variables
+    value_vars : not yet implemented
+
+    Examples
+    --------
+    >>> df
+    A B C
+    a 1 2
+    b 3 4
+    c 5 6
+
+    >>> melt(df, ['A'])
+    A variable value
+    a B 1
+    b B 3
+    c B 5
+    a C 2
+    b C 4
+    c C 6
+    """
+    # TODO: what about the existing index?
+
+    N, K = frame.shape
+
+    mdata = {}
+
+    if id_vars is not None:
+        idvars = list(id_vars)
+        frame = frame.copy()
+        K -= len(idvars)
+        for col in idvars:
+            mdata[col] = np.tile(frame.pop(col).values, K)
+    else:
+        idvars = []
+
+    mcolumns = idvars + ['variable', 'value']
+
+    mdata['value'] = frame.values.ravel('F')
+    mdata['variable'] = np.asarray(frame.columns).repeat(N)
+    return DataFrame(mdata, columns=mcolumns)
diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py
index 0df5880a6580f..78f894b476ac3 100644
--- a/pandas/core/sparse.py
+++ b/pandas/core/sparse.py
@@ -1162,7 +1162,7 @@ def transpose(self):
     T = property(transpose)
 
     def count(self, axis=0, **kwds):
-        return self.apply(SparseSeries.count, axis=axis)
+        return self.apply(lambda x: x.count(), axis=axis)
     count.__doc__ = DataFrame.count.__doc__
 
     def cumsum(self, axis=0):
@@ -1178,7 +1178,7 @@ def cumsum(self, axis=0):
         -------
         y : SparseDataFrame
         """
-        return self.apply(SparseSeries.cumsum, axis=axis)
+        return self.apply(lambda x: x.cumsum(), axis=axis)
 
     def shift(self, periods, offset=None, timeRule=None):
         """
diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py
index 534d8f58cf969..afd1f57306b54 100644
--- a/pandas/rpy/common.py
+++ b/pandas/rpy/common.py
@@ -74,6 +74,9 @@ def _convert_array(obj):
 def _convert_vector(obj):
     if isinstance(obj, robj.IntVector):
         return _convert_int_vector(obj)
+    elif isinstance(obj, robj.StrVector):
+        return _convert_str_vector(obj)
+
     return list(obj)
 
 NA_INTEGER = -2147483648
@@ -86,6 +89,13 @@ def _convert_int_vector(obj):
     arr[mask] = np.nan
     return arr
 
+def _convert_str_vector(obj):
+    arr = np.asarray(obj, dtype=object)
+    mask = arr == robj.NA_Character
+    if mask.any():
+        arr[mask] = np.nan
+    return arr
+
 def _convert_DataFrame(rdf):
     columns = list(rdf.colnames)
     rows = np.array(rdf.rownames)
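
For reference, a minimal sketch of the `DataFrame.set_index` behavior added
earlier in this series; the frame and column names are illustrative, and the
expected outcomes mirror the `test_add_index` cases:

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'A': ['foo', 'foo', 'bar'],
                    'B': ['one', 'two', 'one'],
                    'C': ['x', 'y', 'z'],
                    'D': np.random.randn(3)})

    indexed = df.set_index('C')             # 'C' becomes the index, dropped as a column
    nodrop = df.set_index('C', drop=False)  # keep 'C' as a column as well
    multi = df.set_index(['A', 'B'])        # MultiIndex built from two columns
    df.set_index('C', inplace=True)         # modify df itself instead of copying

    # duplicate keys raise: df.set_index('A') would fail here because 'foo'
    # appears twice in column 'A' (the corner case added in patch 141)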
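
The negative-slice fix from patch 151 can be summarized with a short sketch;
the assertions restate the cases from `test_getitem_slice_bug`, assuming the
patched `Series.__getslice__`:

    from pandas import Series

    s = Series(range(10), index=range(10))

    assert (s[-12:] == s).all()      # start beyond the beginning is clipped
    assert (s[-7:] == s[3:]).all()   # ordinary negative slicing is unchanged
    assert len(s[:-12]) == 0         # empty result instead of wrapping around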
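
A behavior sketch for the cython-backed `Series.count(level=...)` from patch
157, reusing the hypothetical MultiIndex from `test_count_level_series`; the
import path is an assumption of this series' layout:

    import numpy as np
    from pandas import Series
    from pandas.core.index import MultiIndex

    index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                               ['one', 'two', 'three', 'four']],
                       labels=[[0, 0, 0, 2, 2],
                               [2, 0, 1, 1, 2]])
    s = Series(np.random.randn(len(index)), index=index)

    counts = s.count(level=0)   # non-null count per label of level 0
    # equivalent, but slower, formulation:
    # s.groupby(level=0).count()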
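
The GH #304 cache fix from patch 159 is easiest to see as a sketch following
`test_setitem_clear_caches`; the data is illustrative:

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1],
                    'y': [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3])
    df.insert(2, 'z', np.nan)

    foo = df['z']          # caches the 'z' Series
    df.ix[2:, 'z'] = 42    # assignment consolidates blocks internally

    assert df['z'] is not foo           # stale cached Series was discarded
    assert (df['z'][2:] == 42).all()    # new values are visible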
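
Finally, a minimal usage sketch for the new `melt` function from patch 161,
mirroring its docstring example; `id_vars` columns are tiled once per
remaining value column, and `value_vars` is not yet used:

    from pandas import DataFrame
    from pandas.core.reshape import melt

    df = DataFrame({'A': ['a', 'b', 'c'],
                    'B': [1, 3, 5],
                    'C': [2, 4, 6]})

    molten = melt(df, id_vars=['A'])
    # columns: ['A', 'variable', 'value']; 'variable' holds the former
    # column names ('B', 'C') and 'value' the corresponding cell values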