Fixing bug in describe where only certain columns are computed over (#111)

* Fixing bug in describe where only certain columns are computed over after a read_csv.
* Fix lint
modin-project · Oct 5, 2018 · eaca95c · eaca95c
1 parent 84cf399
commit eaca95c
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py
@@ -1295,22 +1295,24 @@ def describe(self, **kwargs):
         """
         # Only describe numeric if there are numeric
         # Otherwise, describe all
-        new_index = self.numeric_columns()
-        if len(new_index) != 0:
+        columns_for_describe = self.numeric_columns()
+        if len(columns_for_describe) != 0 and "object" in kwargs["exclude"]:
             numeric = True
         else:
             numeric = False
             # If no numeric dtypes, then do all
-            new_index = self.columns
+            columns_for_describe = self.columns
 
         def describe_builder(df, **kwargs):
             return pandas.DataFrame.describe(df, **kwargs)
 
         # Apply describe and update indices, columns, and dtypes
         func = self._prepare_method(describe_builder, **kwargs)
-        new_data = self.full_axis_reduce_along_select_indices(func, 0, new_index, False)
+        new_data = self.full_axis_reduce_along_select_indices(
+            func, 0, columns_for_describe, False
+        )
+        new_columns = columns_for_describe
         new_index = self.compute_index(0, new_data, False)
-        new_columns = self.compute_index(1, new_data, True)
         if numeric:
             new_dtypes = pandas.Series(
                 [np.float64 for _ in new_columns], index=new_columns

diff --git a/modin/pandas/io.py b/modin/pandas/io.py
@@ -592,6 +592,7 @@ def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, h
     to_read = header + bio.read(end - start)
     bio.close()
     pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
+    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
     if kwargs.get("index_col", None) is not None:
         index = pandas_df.index
         # Partitions must have RangeIndex