Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

CLN: revisit & simplify Block/BlockManager, remove axes #6745

Merged
merged 2 commits into from Apr 25, 2014
Jump to file or symbol
Failed to load files and symbols.
+2,844 −2,586
Split
View
@@ -1024,9 +1024,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
# preallocate data 2d list
self.blocks = self.obj._data.blocks
- ncols = sum(len(b.items) for b in self.blocks)
+ ncols = sum(b.shape[0] for b in self.blocks)
self.data = [None] * ncols
- self.column_map = self.obj._data.get_items_map(use_cached=False)
if chunksize is None:
chunksize = (100000 / (len(self.cols) or 1)) or 1
@@ -1293,10 +1292,9 @@ def _save_chunk(self, start_i, end_i):
float_format=self.float_format,
date_format=self.date_format)
- for i, item in enumerate(b.items):
-
+ for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
- self.data[self.column_map[b][i]] = d[i]
+ self.data[col_loc] = col
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
View
@@ -1043,9 +1043,11 @@ def to_panel(self):
new_blocks = []
for block in selfsorted._data.blocks:
- newb = block2d_to_blocknd(block.values.T, block.items, shape,
- [major_labels, minor_labels],
- ref_items=selfsorted.columns)
+ newb = block2d_to_blocknd(
+ values=block.values.T,
+ placement=block.mgr_locs, shape=shape,
+ labels=[major_labels, minor_labels],
+ ref_items=selfsorted.columns)
new_blocks.append(newb)
# preserve names, if any
@@ -1934,7 +1936,9 @@ def _ensure_valid_index(self, value):
raise ValueError('Cannot set a frame with no defined index '
'and a value that cannot be converted to a '
'Series')
- self._data.set_axis(1, value.index.copy(), check_axis=False)
+
+ self._data = self._data.reindex_axis(value.index.copy(), axis=1,
+ fill_value=np.nan)
# we are a scalar
# noop
@@ -2039,7 +2043,11 @@ def _sanitize_column(self, key, value):
@property
def _series(self):
- return self._data.get_series_dict()
+ result = {}
+ for idx, item in enumerate(self.columns):
+ result[item] = Series(self._data.iget(idx), index=self.index,
+ name=item)
+ return result
def lookup(self, row_labels, col_labels):
"""Label-based "fancy indexing" function for DataFrame.
@@ -2629,16 +2637,14 @@ def trans(v):
indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)
+ bm_axis = self._get_block_manager_axis(axis)
+ new_data = self._data.take(indexer, axis=bm_axis,
+ convert=False, verify=False)
+
if inplace:
- if axis == 1:
- new_data = self._data.reindex_items(
- self._data.items[indexer],
- copy=False)
- elif axis == 0:
- new_data = self._data.take(indexer)
- self._update_inplace(new_data)
+ return self._update_inplace(new_data)
else:
- return self.take(indexer, axis=axis, convert=False, is_copy=False)
+ return self._constructor(new_data).__finalize__(self)
def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
"""
@@ -2673,16 +2679,13 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
else:
return self.take(indexer, axis=axis, convert=False)
+ bm_axis = self._get_block_manager_axis(axis)
+ new_data = self._data.take(indexer, axis=bm_axis,
+ convert=False, verify=False)
if inplace:
- if axis == 1:
- new_data = self._data.reindex_items(
- self._data.items[indexer],
- copy=False)
- elif axis == 0:
- new_data = self._data.take(indexer)
- self._update_inplace(new_data)
+ return self._update_inplace(new_data)
else:
- return self.take(indexer, axis=axis, convert=False, is_copy=False)
+ return self._constructor(new_data).__finalize__(self)
def swaplevel(self, i, j, axis=0):
"""
View
@@ -565,7 +565,7 @@ def f(x):
f = _get_rename_function(v)
baxis = self._get_block_manager_axis(axis)
- result._data = result._data.rename(f, axis=baxis, copy=copy)
+ result._data = result._data.rename_axis(f, axis=baxis, copy=copy)
result._clear_item_cache()
if inplace:
@@ -1217,21 +1217,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
taken : type of caller
"""
- # check/convert indicies here
- if convert:
- axis = self._get_axis_number(axis)
- indices = _maybe_convert_indices(
- indices, len(self._get_axis(axis)))
-
- baxis = self._get_block_manager_axis(axis)
- if baxis == 0:
- labels = self._get_axis(axis)
- new_items = labels.take(indices)
- new_data = self._data.reindex_axis(new_items, indexer=indices,
- axis=baxis)
- else:
- new_data = self._data.take(indices, axis=baxis)
-
+ new_data = self._data.take(indices,
+ axis=self._get_block_manager_axis(axis),
+ convert=True, verify=True)
result = self._constructor(new_data).__finalize__(self)
# maybe set copy if we didn't actually change the index
@@ -1701,7 +1689,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
labels, method, level, limit=limit, copy_if_needed=True)
return self._reindex_with_indexers(
{axis: [new_index, indexer]}, method=method, fill_value=fill_value,
- limit=limit, copy=copy).__finalize__(self)
+ limit=limit, copy=copy)
def _reindex_with_indexers(self, reindexers, method=None,
fill_value=np.nan, limit=None, copy=False,
@@ -1716,30 +1704,16 @@ def _reindex_with_indexers(self, reindexers, method=None,
if index is None:
continue
- index = _ensure_index(index)
- # reindex the axis
- if method is not None:
- new_data = new_data.reindex_axis(
- index, indexer=indexer, method=method, axis=baxis,
- fill_value=fill_value, limit=limit, copy=copy)
-
- elif indexer is not None:
- # TODO: speed up on homogeneous DataFrame objects
+ index = _ensure_index(index)
+ if indexer is not None:
indexer = com._ensure_int64(indexer)
- new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
- fill_value=fill_value,
- allow_dups=allow_dups)
-
- elif (baxis == 0 and index is not None and
- index is not new_data.axes[baxis]):
- new_data = new_data.reindex_items(index, copy=copy,
- fill_value=fill_value)
-
- elif (baxis > 0 and index is not None and
- index is not new_data.axes[baxis]):
- new_data = new_data.copy(deep=copy)
- new_data.set_axis(baxis, index)
+
@jreback

jreback Apr 18, 2014

Contributor

Is copy handled in BlockManager.reindex_indexer?

I don't think this is well tested, and most of the time it should probably copy unless the indexes are identical.

@immerrr

immerrr Apr 19, 2014

Contributor

A rule of thumb I try to follow is that reindex should copy unless there's an "inplace=True" kwarg somewhere.

But point taken; I need to double-check that.

+ # TODO: speed up on homogeneous DataFrame objects
+ new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
+ fill_value=fill_value,
+ allow_dups=allow_dups,
+ copy=copy)
if copy and new_data is self._data:
new_data = new_data.copy()
View
@@ -2196,10 +2196,10 @@ def _iterate_slices(self):
yield val, slicer(val)
def _cython_agg_general(self, how, numeric_only=True):
- new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
- return self._wrap_agged_blocks(new_blocks)
+ new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
+ return self._wrap_agged_blocks(new_items, new_blocks)
- def _wrap_agged_blocks(self, blocks):
+ def _wrap_agged_blocks(self, items, blocks):
obj = self._obj_with_exclusions
new_axes = list(obj._data.axes)
@@ -2210,6 +2210,10 @@ def _wrap_agged_blocks(self, blocks):
else:
new_axes[self.axis] = self.grouper.result_index
+ # Make sure block manager integrity check passes.
+ assert new_axes[0].equals(items)
+ new_axes[0] = items
+
mgr = BlockManager(blocks, new_axes)
new_obj = type(obj)(mgr)
@@ -2223,14 +2227,14 @@ def _cython_agg_blocks(self, how, numeric_only=True):
new_blocks = []
+ if numeric_only:
+ data = data.get_numeric_data(copy=False)
+
for block in data.blocks:
values = block.values
is_numeric = is_numeric_dtype(values.dtype)
- if numeric_only and not is_numeric:
- continue
-
if is_numeric:
values = com.ensure_float(values)
@@ -2239,13 +2243,13 @@ def _cython_agg_blocks(self, how, numeric_only=True):
# see if we can cast the block back to the original dtype
result = block._try_cast_result(result)
- newb = make_block(result, block.items, block.ref_items)
+ newb = make_block(result, placement=block.mgr_locs)
new_blocks.append(newb)
if len(new_blocks) == 0:
raise DataError('No numeric types to aggregate')
- return new_blocks
+ return data.items, new_blocks
def _get_data_to_aggregate(self):
obj = self._obj_with_exclusions
@@ -2837,28 +2841,10 @@ def _wrap_aggregated_output(self, output, names=None):
return result.convert_objects()
- def _wrap_agged_blocks(self, blocks):
- obj = self._obj_with_exclusions
-
- if self.axis == 0:
- agg_labels = obj.columns
- else:
- agg_labels = obj.index
-
- if sum(len(x.items) for x in blocks) == len(agg_labels):
- output_keys = agg_labels
- else:
- all_items = []
- for b in blocks:
- all_items.extend(b.items)
- output_keys = agg_labels[agg_labels.isin(all_items)]
-
- for blk in blocks:
- blk.set_ref_items(output_keys, maybe_rename=False)
-
+ def _wrap_agged_blocks(self, items, blocks):
if not self.as_index:
index = np.arange(blocks[0].values.shape[1])
- mgr = BlockManager(blocks, [output_keys, index])
+ mgr = BlockManager(blocks, [items, index])
result = DataFrame(mgr)
group_levels = self.grouper.get_group_levels()
@@ -2869,7 +2855,7 @@ def _wrap_agged_blocks(self, blocks):
result = result.consolidate()
else:
index = self.grouper.result_index
- mgr = BlockManager(blocks, [output_keys, index])
+ mgr = BlockManager(blocks, [items, index])
result = DataFrame(mgr)
if self.axis == 1:
Oops, something went wrong.