Skip to content

Commit

Permalink
Fixing bug in drop where col_partitions were set incorrectly (#53)
Browse files: browse the repository at this point in the history
  • Loading branch information
devin-petersohn authored and simon-mo committed Jul 20, 2018
1 parent d019142 commit cfc6b65
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 48 deletions.
83 changes: 35 additions & 48 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1775,62 +1775,49 @@ def drop_helper(obj, axis, label):
if axis == 'index':
try:
coords = obj._row_metadata[label]
if isinstance(coords, pandas.DataFrame):
partitions = list(coords['partition'])
indexes = list(coords['index_within_partition'])
else:
partitions, indexes = coords
partitions = [partitions]
indexes = [indexes]

for part, index in zip(partitions, indexes):
x = _deploy_func.remote(
lambda df: df.drop(labels=index, axis=axis,
errors='ignore'),
obj._row_partitions[part])
obj._row_partitions = \
[obj._row_partitions[i] if i != part
else x
for i in range(len(obj._row_partitions))]

# The decrement here is because we're dropping one at a
# time and the index is automatically updated when we
# convert back to blocks.
obj._row_metadata.squeeze(part, index)

obj._row_metadata.drop(labels=label)
object_partitions = obj._row_partitions
except KeyError:
return obj
else:
try:
coords = obj._col_metadata[label]
if isinstance(coords, pandas.DataFrame):
partitions = list(coords['partition'])
indexes = list(coords['index_within_partition'])
else:
partitions, indexes = coords
partitions = [partitions]
indexes = [indexes]

for part, index in zip(partitions, indexes):
x = _deploy_func.remote(
lambda df: df.drop(labels=index, axis=axis,
errors='ignore'),
obj._col_partitions[part])
obj._col_partitions = \
[obj._col_partitions[i] if i != part
else x
for i in range(len(obj._col_partitions))]

# The decrement here is because we're dropping one at a
# time and the index is automatically updated when we
# convert back to blocks.
obj._col_metadata.squeeze(part, index)

obj._col_metadata.drop(labels=label)
object_partitions = obj._col_partitions
except KeyError:
return obj

if isinstance(coords, pandas.DataFrame):
drop_map = {part: list(df['index_within_partition'])
for part, df in
coords.copy().groupby('partition')}
else:
partitions, indexes = coords
drop_map = {partitions: indexes}

new_partitions = {}

for part in drop_map:
index = drop_map[part]

new_partitions[part] = _deploy_func.remote(
lambda df: df.drop(labels=index, axis=axis,
errors='ignore'),
object_partitions[part])

if axis == 'index':
obj._row_partitions = \
[object_partitions[i] if i not in new_partitions
else new_partitions[i]
for i in range(len(object_partitions))]

obj._row_metadata.drop(labels=label)
else:
obj._col_partitions = \
[object_partitions[i] if i not in new_partitions
else new_partitions[i]
for i in range(len(object_partitions))]

obj._col_metadata.drop(labels=label)

return obj

for axis, labels in axes.items():
Expand Down
3 changes: 3 additions & 0 deletions modin/pandas/index_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,9 @@ def drop(self, labels, errors='raise'):
new_coord_df['partition'][new_coord_df['partition'] == i] \
-= num_dropped

new_coord_df['index_within_partition'] = [i for l in self._lengths
for i in range(l)]

self._coord_df = new_coord_df
return dropped

Expand Down

0 comments on commit cfc6b65

Please sign in to comment.