Skip to content

Commit

Permalink
Fixing bug in describe where only certain columns are computed over (#111)
Browse files Browse the repository at this point in the history

* Fixing bug in describe where only certain columns are computed over after a read_csv.

* Fix lint
  • Loading branch information
devin-petersohn authored and simon-mo committed Oct 5, 2018
1 parent 84cf399 commit eaca95c
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
12 changes: 7 additions & 5 deletions modin/data_management/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1295,22 +1295,24 @@ def describe(self, **kwargs):
"""
# Only describe numeric if there are numeric
# Otherwise, describe all
new_index = self.numeric_columns()
if len(new_index) != 0:
columns_for_describe = self.numeric_columns()
if len(columns_for_describe) != 0 and "object" in kwargs["exclude"]:
numeric = True
else:
numeric = False
# If no numeric dtypes, then do all
new_index = self.columns
columns_for_describe = self.columns

def describe_builder(df, **kwargs):
return pandas.DataFrame.describe(df, **kwargs)

# Apply describe and update indices, columns, and dtypes
func = self._prepare_method(describe_builder, **kwargs)
new_data = self.full_axis_reduce_along_select_indices(func, 0, new_index, False)
new_data = self.full_axis_reduce_along_select_indices(
func, 0, columns_for_describe, False
)
new_columns = columns_for_describe
new_index = self.compute_index(0, new_data, False)
new_columns = self.compute_index(1, new_data, True)
if numeric:
new_dtypes = pandas.Series(
[np.float64 for _ in new_columns], index=new_columns
Expand Down
1 change: 1 addition & 0 deletions modin/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, h
to_read = header + bio.read(end - start)
bio.close()
pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
if kwargs.get("index_col", None) is not None:
index = pandas_df.index
# Partitions must have RangeIndex
Expand Down

0 comments on commit eaca95c

Please sign in to comment.