Skip to content

Commit

Permalink
Improve __repr__ performance (#41)
Browse files Browse the repository at this point in the history
* Fixing repr performance

* Fix head/tail component of repr

* Fixing performance

* Cleaning up implementation and adding docs inline

* Adding more inline docs

* Adding test and fixing bugs

* Setting copy=False for concat
  • Loading branch information
devin-petersohn authored and simon-mo committed Jul 15, 2018
1 parent 59c9436 commit 2870de7
Show file tree
Hide file tree
Showing 2 changed files with 245 additions and 76 deletions.
280 changes: 204 additions & 76 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,102 +229,230 @@ def _set_col_partitions(self, new_col_partitions):
def __str__(self):
# str() of the frame is the same text as repr(): delegate to __repr__.
return repr(self)

def _repr_helper_(self):
if len(self._row_metadata) <= 60 and \
len(self._col_metadata) <= 20:
return to_pandas(self)
def _repr_pandas_builder(self):
"""Creates a pandas DataFrame of appropriate size from this DataFrame.
def head(df, n, get_local_head=False):
"""Compute the head for this without creating a new DataFrame"""
if get_local_head:
return df.head(n)
Note: Currently the values for the sizes are hard-coded, but eventually
we will need to have an options module for these to be changed.
new_dfs = _map_partitions(lambda df: df.head(n),
df)
Returns:
A new pandas DataFrame. repr() will be called on this DataFrame.
"""

index = self.index[:n]
pandas_head = pandas.concat(ray.get(new_dfs), axis=1, copy=False)
pandas_head.index = index
pandas_head.columns = self.columns
return pandas_head
def front_block_builder(blocks, n, index):
"""Get first n columns from the blocks provided.
def tail(df, n, get_local_tail=False):
"""Compute the tail for this without creating a new DataFrame"""
if get_local_tail:
return df.tail(n)
Note: This is called after we obtain the head/tail blocks. We do
not extract the n columns for each row, only for the head/tail.
new_dfs = _map_partitions(lambda df: df.tail(n),
df)
Args:
blocks: A numpy array of OIDs containing block partitions
n: The number of columns to extract
index: The pandas index to assign to the resulting DataFrame.
index = self.index[-n:]
pandas_tail = pandas.concat(ray.get(new_dfs), axis=1, copy=False)
pandas_tail.index = index
pandas_tail.columns = self.columns
return pandas_tail
Returns:
A pandas DataFrame containing the first n columns extracted
from the blocks provided.
"""
cum_col_lengths = self._col_metadata._lengths.cumsum()
idx = np.digitize(n, cum_col_lengths)

if idx > 0:
# This value will be what we need to get from the last block
remaining = n - cum_col_lengths[idx - 1]
# These are the blocks that we will take (all the blocks before
# the cutoff n)
full_blocks = \
pandas.concat([pandas.concat(ray.get(df.tolist()),
axis=1, copy=False)
for df in blocks[:, :idx]],
copy=False)
else:
remaining = n
full_blocks = pandas.DataFrame()

def front(df, n):
"""Get first n columns without creating a new Dataframe"""
if remaining == 0:
full_blocks.index = index
return full_blocks

cum_col_lengths = self._col_metadata._lengths.cumsum()
index = np.argmax(cum_col_lengths >= 10)
pandas_front = pandas.concat(ray.get(x[:index + 1]),
axis=1, copy=False)
pandas_front = pandas_front.iloc[:, :n]
pandas_front.index = self.index
pandas_front.columns = self.columns[:n]
return pandas_front

def back(df, n):
"""Get last n columns without creating a new Dataframe"""

cum_col_lengths = np.flip(self._col_metadata._lengths,
axis=0).cumsum()
index = np.argmax(cum_col_lengths >= 10)
pandas_back = pandas.concat(ray.get(x[-(index + 1):]),
axis=1, copy=False)
pandas_back = pandas_back.iloc[:, -n:]
pandas_back.index = self.index
pandas_back.columns = self.columns[-n:]
return pandas_back

x = self._col_partitions
get_local_head = False
# These are the blocks that we need to extract the remaining (not
# already extracted from full_blocks) columns from.
partial_blocks = \
pandas.concat(ray.get([_deploy_func.remote(
lambda df: df.iloc[:, :remaining], df)
for df in blocks[:, idx]]), copy=False)

all_n_columns = \
pandas.concat([full_blocks, partial_blocks],
axis=1, copy=False)
all_n_columns.index = index
return all_n_columns

def back_block_builder(blocks, n, index):
    """Assemble the trailing n columns of the block partitions provided.

    Note: This is called after we obtain the head/tail blocks. We do
        not extract the n columns for each row, only for the head/tail.

    Args:
        blocks: A numpy array of OIDs containing block partitions
        n: The number of columns to extract
        index: The pandas index to assign to the resulting DataFrame.

    Returns:
        A pandas DataFrame containing the last n columns extracted
            from the blocks provided.
    """
    # The partition count lets us address column partitions from the end.
    num_col_parts = len(self._col_metadata._lengths)
    # Widths are accumulated starting at the last partition because the
    # trailing columns are gathered back-to-front.
    reversed_widths = self._col_metadata._lengths[::-1].cumsum()
    # Number of trailing partitions that fit entirely within n columns.
    whole_count = np.digitize(n, reversed_widths)

    if whole_count > 0:
        # Columns still missing once every whole partition is taken.
        leftover = n - reversed_widths[whole_count - 1]
        # Materialize each block row of the whole partitions, then stack
        # the rows on top of each other.
        row_frames = []
        for block_row in blocks[:, num_col_parts - whole_count:]:
            row_frames.append(
                pandas.concat(ray.get(block_row.tolist()),
                              axis=1, copy=False))
        whole_part = pandas.concat(row_frames, copy=False)
    else:
        leftover = n
        whole_part = pandas.DataFrame()

    if leftover == 0:
        whole_part.index = index
        return whole_part

    # These are the blocks that we need to extract the remaining (not
    # already taken as whole partitions) columns from.
    pending = [
        _deploy_func.remote(lambda part: part.iloc[:, -leftover:], oid)
        for oid in blocks[:, -whole_count - 1]
    ]
    sliced_part = pandas.concat(ray.get(pending), copy=False)

    # The sliced partition sits to the left of the whole partitions.
    last_n_columns = pandas.concat([sliced_part, whole_part],
                                   axis=1, copy=False)
    last_n_columns.index = index
    return last_n_columns

def row_dots_builder(full_head, full_tail):
    """Inserts a row of dots between head and tail DataFrames.

    Args:
        full_head: The head pandas DataFrame for the repr.
        full_tail: The tail pandas DataFrame for the repr.

    Returns:
        A new DataFrame combining full_head and full_tail with a row
            of dots inserted between.
    """
    row_dots = pandas.Series(["..."] * len(full_head.columns),
                             index=full_head.columns,
                             name="...")

    # DataFrame.append was removed in pandas 2.0. Turning the Series into a
    # one-row frame (labelled "...") and concatenating gives the same
    # result on both old and new pandas versions.
    return pandas.concat([full_head, row_dots.to_frame().T, full_tail])

def col_dots_builder(full_front, full_back):
    """Inserts a column of dots between the front and back DataFrames.

    Args:
        full_front: The front DataFrame for the repr.
        full_back: The back DataFrame for the repr.

    Returns:
        A new DataFrame combining full_front and full_back with a
            column of dots inserted between.
    """
    # Index the dots with full_front's own index so that the axis=1 concat
    # lines them up row-for-row with the frames being joined. Using a fixed
    # outer index would misalign the rows whenever the frames passed in
    # carry a different index (e.g. the tail rows).
    col_dots = pandas.Series(["..."] * len(full_front),
                             index=full_front.index,
                             name="...")
    return pandas.concat([full_front, col_dots, full_back],
                         axis=1, copy=False)

# If we don't exceed the maximum number of values on either dimension
if len(self.index) <= 60 and len(self.columns) <= 20:
return to_pandas(self)

if len(self.index) >= 60:
head_blocks = self._head_block_builder(30)
tail_blocks = self._tail_block_builder(30)
length_of_index = 30
else:
head_blocks = self._block_partitions
# We set this to None so we know later that there are no tail rows to build
tail_blocks = None
length_of_index = len(self.index)

# Get first and last 10 columns if there are more than 20 columns
if len(self._col_metadata) >= 20:
get_local_head = True
front = front(x, 10)
back = back(x, 10)

col_dots = pandas.Series(["..." for _ in range(len(self.index))])
col_dots.index = self.index
col_dots.name = "..."
x = pandas.concat([front, col_dots, back], axis=1, copy=False)
index_of_head = self.index[:length_of_index]

# If less than 60 rows, x is already in the correct format.
if len(self._row_metadata) < 60:
return x
# Building the front blocks from head_blocks
front_blocks = \
front_block_builder(head_blocks, 10, index_of_head)
front_blocks.columns = self.columns[:10]

head = head(x, 30, get_local_head)
tail = tail(x, 30, get_local_head)
# Building the back blocks from head_blocks
back_blocks = back_block_builder(head_blocks, 10, index_of_head)
back_blocks.columns = self.columns[-10:]

# Make the dots in between the head and tail
row_dots = pandas.Series(["..." for _ in range(len(head.columns))])
row_dots.index = head.columns
row_dots.name = "..."
full_head = col_dots_builder(front_blocks, back_blocks)

# We have to do it this way or convert dots to a dataframe and
# transpose. This seems better.
result = head.append(row_dots).append(tail)
return result
# True only if we have at least 60 rows in the DataFrame
if tail_blocks is not None:
index_of_tail = self.index[-30:]
# Building the front blocks from tail_blocks
front_blocks = \
front_block_builder(tail_blocks, 10, index_of_tail)
front_blocks.columns = self.columns[:10]

# Building the back blocks from tail_blocks
back_blocks = \
back_block_builder(tail_blocks, 10, index_of_tail)
back_blocks.columns = self.columns[-10:]

full_tail = col_dots_builder(front_blocks, back_blocks)

return row_dots_builder(full_head, full_tail)
else:
return full_head

else:
# Convert head_blocks into a pandas DataFrame
list_of_head_rows = [pandas.concat(ray.get(df.tolist()), axis=1)
for df in head_blocks]
full_head = pandas.concat(list_of_head_rows)
full_head.columns = self.columns

# True only if we have at least 60 rows in the DataFrame
if tail_blocks is not None:
# Convert tail_blocks into a pandas DataFrame
list_of_tail_rows = \
[pandas.concat(ray.get(df.tolist()), axis=1)
for df in tail_blocks]
full_tail = pandas.concat(list_of_tail_rows)
full_tail.columns = self.columns

return row_dots_builder(full_head, full_tail)
else:
return full_head

def __repr__(self):
# We use pandas repr so that we match them.
if len(self._row_metadata) <= 60 and \
len(self._col_metadata) <= 20:
return repr(self._repr_helper_())
return repr(self._repr_pandas_builder())
# The split here is so that we don't repr pandas row lengths.
result = self._repr_helper_()
result = self._repr_pandas_builder()
final_result = repr(result).rsplit("\n\n", 1)[0] + \
"\n\n[{0} rows x {1} columns]".format(len(self.index),
len(self.columns))
Expand All @@ -341,9 +469,9 @@ def _repr_html_(self):
# of the dataframe.
if len(self._row_metadata) <= 60 and \
len(self._col_metadata) <= 20:
return self._repr_helper_()._repr_html_()
return self._repr_pandas_builder()._repr_html_()
# We split so that we insert our correct dataframe dimensions.
result = self._repr_helper_()._repr_html_()
result = self._repr_pandas_builder()._repr_html_()
return result.split("<p>")[0] + \
"<p>{0} rows x {1} columns</p>\n</div>".format(len(self.index),
len(self.columns))
Expand Down
41 changes: 41 additions & 0 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3265,6 +3265,47 @@ def test___rsub__():
test_inter_df_math_right_ops("__rsub__")


def test___repr__():
    """Compare repr() of a pandas DataFrame and a `pd.DataFrame` per shape."""
    # Shapes (1000, 101) and (1000, 102) are intentionally left out: they
    # currently fail because the dots do not line up. For some reason only
    # two dots are being added for our DataFrame.
    shapes = [(1000, 100), (1000, 99), (10, 100), (10, 10)]

    for shape in shapes:
        frame_data = np.random.randint(0, 100, size=shape)
        expected = repr(pandas.DataFrame(frame_data))
        actual = repr(pd.DataFrame(frame_data))

        assert expected == actual


@pytest.fixture
def test_loc(ray_df, pd_df):
# Singleton
Expand Down

0 comments on commit 2870de7

Please sign in to comment.