BUG: read_excel return empty dataframe when using usecols and restored

capability of passing column labels for columns to be read - [x] closes #18273 - [x] tests added / passed - [x] passes git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8 - [x] whatsnew entry This commit reimplements usage of 'usecols' as a list of column labels, a list of ints or a callable for the read_excel function. The 'usecols' argument as used in pandas 0.22 is renamed to 'usecols_excel' and enables the feature of receiving column indexes as a list.
pandas-dev · Apr 30, 2018 · 6143a0f · 6143a0f
1 parent 8ddc0fd
commit 6143a0f
Show file tree

Hide file tree

Showing 3 changed files with 164 additions and 36 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -856,6 +856,7 @@ Other API Changes
 - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
 - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
 - A user-defined-function that is passed to :func:`Series.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, :func:`DataFrame.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here <whatsnew_0230.enhancements.window_raw>`. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
+- Renamed the named argument `usecols` of :func:`read_excel` to `usecols_excel`, which receives a list of index numbers or A1-style Excel column letters/ranges to select the columns that must be in the DataFrame, so that the `usecols` argument can serve its purpose of selecting the columns that must be in the DataFrame using column labels (:issue:`18273`)
 
 .. _whatsnew_0230.deprecations:
 
@@ -1166,6 +1167,7 @@ I/O
 - Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`)
 - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
 - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`)
+- Bug in :func:`read_excel` where passing the `usecols_excel` named argument as a list of strings returned an empty DataFrame (:issue:`18273`)
 - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`)
 - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`)
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -85,19 +85,41 @@
     Column (0-indexed) to use as the row labels of the DataFrame.
     Pass None if there is no such column.  If a list is passed,
     those columns will be combined into a ``MultiIndex``.  If a
-    subset of data is selected with ``usecols``, index_col
+    subset of data is selected with ``usecols_excel``, index_col
     is based on the subset.
 parse_cols : int or list, default None
 
     .. deprecated:: 0.21.0
-       Pass in `usecols` instead.
-
-usecols : int or list, default None
+       Pass in `usecols_excel` instead.
+
+usecols : list-like or callable, default None
+    Return a subset of the columns. If list-like, all elements must either
+    be positional (i.e. integer indices into the document columns) or string
+    that correspond to column names provided either by the user in `names` or
+    inferred from the document header row(s). For example, a valid list-like
+    `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element
+    order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]`` and
+    ``usecols=['foo', 'bar']`` is the same as ``['bar', 'foo']``.
+    To instantiate a DataFrame from ``data`` with element order preserved use
+    ``pd.read_excel(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
+    in ``['foo', 'bar']`` order or
+    ``pd.read_excel(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+    for ``['bar', 'foo']`` order.
+
+    If callable, the callable function will be evaluated against the column
+    names, returning names where the callable function evaluates to True. An
+    example of a valid callable argument would be ``lambda x: x.upper() in
+    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+    parsing time and lower memory usage.
+usecols_excel : int or list, default None
     * If None then parse all columns,
     * If int then indicates last column to be parsed
     * If list of ints then indicates list of column numbers to be parsed
     * If string then indicates comma separated list of Excel column letters and
-      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
+      column ranges (e.g. "A:E" or "A,C,E:F") to be parsed. Ranges are
+      inclusive of both sides.
+    * If list of strings each string shall be an Excel column letter or column
+      range (e.g. "A:E" or "A,C,E:F") to be parsed. Ranges are inclusive of
       both sides.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
@@ -278,14 +300,14 @@ def get_writer(engine_name):
 
 
 @Appender(_read_excel_doc)
-@deprecate_kwarg("parse_cols", "usecols")
+@deprecate_kwarg("parse_cols", "usecols_excel")
 @deprecate_kwarg("skip_footer", "skipfooter")
 def read_excel(io,
                sheet_name=0,
                header=0,
                names=None,
                index_col=None,
-               usecols=None,
+               usecols_excel=None,
                squeeze=False,
                dtype=None,
                engine=None,
@@ -320,7 +342,7 @@ def read_excel(io,
         header=header,
         names=names,
         index_col=index_col,
-        usecols=usecols,
+        usecols_excel=usecols_excel,
         squeeze=squeeze,
         dtype=dtype,
         converters=converters,
@@ -413,7 +435,7 @@ def parse(self,
               header=0,
               names=None,
               index_col=None,
-              usecols=None,
+              usecols_excel=None,
               squeeze=False,
               converters=None,
               true_values=None,
@@ -439,7 +461,7 @@ def parse(self,
                                  header=header,
                                  names=names,
                                  index_col=index_col,
-                                 usecols=usecols,
+                                 usecols_excel=usecols_excel,
                                  squeeze=squeeze,
                                  converters=converters,
                                  true_values=true_values,
@@ -455,7 +477,7 @@ def parse(self,
                                  convert_float=convert_float,
                                  **kwds)
 
-    def _should_parse(self, i, usecols):
+    def _should_parse(self, i, usecols_excel):
 
         def _range2cols(areas):
             """
@@ -481,18 +503,26 @@ def _excel2num(x):
                     cols.append(_excel2num(rng))
             return cols
 
-        if isinstance(usecols, int):
-            return i <= usecols
-        elif isinstance(usecols, compat.string_types):
-            return i in _range2cols(usecols)
+        if isinstance(usecols_excel, int):
+            return i <= usecols_excel
+        # check if usecols_excel is a string that indicates a comma separated
+        # list of Excel column letters and column ranges
+        elif isinstance(usecols_excel, compat.string_types):
+                return i in _range2cols(usecols_excel)
+        # check if usecols_excel is a list of strings, each one indicating a
+        # Excel column letter or a column range
+        elif all(isinstance(x, compat.string_types) for x in usecols_excel):
+                usecols_excel_str = ",".join(usecols_excel)
+                return i in _range2cols(usecols_excel_str)
         else:
-            return i in usecols
+            return i in usecols_excel
 
     def _parse_excel(self,
                      sheetname=0,
                      header=0,
                      names=None,
                      index_col=None,
+                     usecols_excel=None,
                      usecols=None,
                      squeeze=False,
                      dtype=None,
@@ -512,6 +542,10 @@ def _parse_excel(self,
 
         _validate_header_arg(header)
 
+        if (usecols is not None) and (usecols_excel is not None):
+            raise TypeError("Cannot specify both `usecols` and `usecols_excel`"
+                            ". Choose one of them.")
+
         if 'chunksize' in kwds:
             raise NotImplementedError("chunksize keyword of read_excel "
                                       "is not implemented")
@@ -615,13 +649,27 @@ def _parse_cell(cell_contents, cell_typ):
                 row = []
                 for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                      sheet.row_types(i))):
-                    if usecols is not None and j not in should_parse:
-                        should_parse[j] = self._should_parse(j, usecols)
+                    if usecols_excel is not None and j not in should_parse:
+                        should_parse[j] = self._should_parse(j, usecols_excel)
 
-                    if usecols is None or should_parse[j]:
+                    if usecols_excel is None or should_parse[j]:
                         row.append(_parse_cell(value, typ))
                 data.append(row)
 
+            # Check if some string in usecols may be interpreted as a Excel
+            # positional column
+            if (usecols is not None) and (not callable(usecols)) and \
+                (not all(isinstance(x, int) for x in usecols)) and \
+                any(isinstance(x, compat.string_types) and x.isalpha()
+                    for x in usecols):
+                warnings.warn("The `usecols` named argument used to refer to "
+                              "Excel column letters or ranges and int "
+                              "positional indexes was renamed to "
+                              "`usecols_excel`. Now `usecols` is used to "
+                              "pass either a list of only string column lables"
+                              " or a list of only integer positional indexes.",
+                              UserWarning, stacklevel=3)
+
             if sheet.nrows == 0:
                 output[asheetname] = DataFrame()
                 continue
@@ -674,6 +722,7 @@ def _parse_cell(cell_contents, cell_typ):
                                     dtype=dtype,
                                     true_values=true_values,
                                     false_values=false_values,
+                                    usecols=usecols,
                                     skiprows=skiprows,
                                     nrows=nrows,
                                     na_values=na_values,

diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -107,13 +107,14 @@ def get_exceldf(self, basename, ext, *args, **kwds):
 class ReadingTestsBase(SharedItems):
     # This is based on ExcelWriterBase
 
-    def test_usecols_int(self, ext):
+    def test_usecols_excel_int(self, ext):
 
         dfref = self.get_csv_refdf('test1')
         dfref = dfref.reindex(columns=['A', 'B', 'C'])
-        df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols=3)
+        df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols_excel=3)
         df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
-                               index_col=0, usecols=3)
+                               index_col=0, usecols_excel=3)
 
         with tm.assert_produces_warning(FutureWarning):
             df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
@@ -124,14 +125,14 @@ def test_usecols_int(self, ext):
         tm.assert_frame_equal(df2, dfref, check_names=False)
         tm.assert_frame_equal(df3, dfref, check_names=False)
 
-    def test_usecols_list(self, ext):
+    def test_usecols_excel_list(self, ext):
 
         dfref = self.get_csv_refdf('test1')
         dfref = dfref.reindex(columns=['B', 'C'])
         df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
-                               usecols=[0, 2, 3])
+                               usecols_excel=[0, 2, 3])
         df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
-                               index_col=0, usecols=[0, 2, 3])
+                               index_col=0, usecols_excel=[0, 2, 3])
 
         with tm.assert_produces_warning(FutureWarning):
             df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
@@ -142,15 +143,15 @@ def test_usecols_list(self, ext):
         tm.assert_frame_equal(df2, dfref, check_names=False)
         tm.assert_frame_equal(df3, dfref, check_names=False)
 
-    def test_usecols_str(self, ext):
+    def test_usecols_excel_str(self, ext):
 
         dfref = self.get_csv_refdf('test1')
 
         df1 = dfref.reindex(columns=['A', 'B', 'C'])
         df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
-                               usecols='A:D')
+                               usecols_excel='A:D')
         df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
-                               index_col=0, usecols='A:D')
+                               index_col=0, usecols_excel='A:D')
 
         with tm.assert_produces_warning(FutureWarning):
             df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
@@ -163,20 +164,96 @@ def test_usecols_str(self, ext):
 
         df1 = dfref.reindex(columns=['B', 'C'])
         df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
-                               usecols='A,C,D')
+                               usecols_excel='A,C,D')
         df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
-                               index_col=0, usecols='A,C,D')
+                               index_col=0, usecols_excel='A,C,D')
         # TODO add index to xls file
         tm.assert_frame_equal(df2, df1, check_names=False)
         tm.assert_frame_equal(df3, df1, check_names=False)
 
         df1 = dfref.reindex(columns=['B', 'C'])
         df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
-                               usecols='A,C:D')
+                               usecols_excel='A,C:D')
+        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                               index_col=0, usecols_excel='A,C:D')
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+
+    @pytest.mark.parametrize("columns,usecols_excel,parse_cols", [
+        (['A', 'B', 'C'], ['A:D'], ['A:D']),
+        (['B', 'C'], ['A', 'C', 'D'], ['A', 'C', 'D']),
+        (['B', 'C'], ['A', 'C:D'], ['A', 'C:D'])
+    ])
+    # GH18273 - read_excel return empty dataframe when using usecols_excel
+    # as a list of strings
+    def test_usecols_excel_str_list(self, ext, columns, usecols_excel,
+                                    parse_cols):
+
+        dfref = self.get_csv_refdf('test1')
+
+        df1 = dfref.reindex(columns=columns)
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols_excel=usecols_excel)
         df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
-                               index_col=0, usecols='A,C:D')
+                               index_col=0, usecols_excel=usecols_excel)
+
+        with tm.assert_produces_warning(FutureWarning):
+            df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                                   index_col=0, parse_cols=parse_cols)
+
+        # TODO add index to xls, read xls ignores index name ?
         tm.assert_frame_equal(df2, df1, check_names=False)
         tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df4, df1, check_names=False)
+
+    def test_usecols_diff_positional_int_columns_order(self, ext):
+        # `usecols` given as integer positions: listing the positions as
+        # [0, 2] or [2, 0] is expected to yield the same frame, which is
+        # compared against columns 'A' and 'C' of the reference CSV.
+
+        dfref = self.get_csv_refdf('test1')
+
+        df1 = dfref.reindex(columns=['A', 'C'])
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', usecols=[0, 2])
+        df3 = self.get_exceldf('test1', ext, 'Sheet1', usecols=[2, 0])
+
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        # df3 is checked against df2, i.e. transitively against the reference.
+        tm.assert_frame_equal(df3, df2, check_names=False)
+
+    def test_usecols_diff_positional_str_columns_order(self, ext):
+        # `usecols` given as column labels: ['B', 'D'] and ['D', 'B'] are both
+        # expected to match columns 'B' and 'D' of the reference CSV, in that
+        # order.
+
+        df1 = self.get_csv_refdf('test1')[['B', 'D']]
+
+        # Each read must emit a UserWarning; presumably this is the warning
+        # about purely alphabetic labels being mistakable for Excel column
+        # letters -- confirm against pandas/io/excel.py.
+        with tm.assert_produces_warning(UserWarning):
+            df2 = self.get_exceldf('test1', ext, 'Sheet1', usecols=['B', 'D'])
+        with tm.assert_produces_warning(UserWarning):
+            df3 = self.get_exceldf('test1', ext, 'Sheet1', usecols=['D', 'B'])
+
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+
+    def test_pass_callble_argument(self, ext):
+        # Reads Sheet1 twice with index_col=0 -- once filtering columns with a
+        # callable `usecols`, once with `usecols_excel` as a list of Excel
+        # column letters -- and expects both to equal columns 'C' and 'D' of
+        # the reference CSV.
+        # NOTE(review): "callble" in the test name looks like a typo for
+        # "callable".
+
+        dfref = self.get_csv_refdf('test1')[['C', 'D']]
+
+        # NOTE(review): dfref already holds only 'C' and 'D', so this reindex
+        # appears to be redundant.
+        df1 = dfref.reindex(columns=['C', 'D'])
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols=lambda x: x > 'B')
+        df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols_excel=['A', 'D', 'E'])
+
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+
+    def test_pass_non_existent_column(self, ext):
+        # Requesting a column label that is not in the sheet must raise a
+        # ValueError naming the missing label; a UserWarning is expected from
+        # the same call.
+        # The two literals are parenthesised so they concatenate into a single
+        # pattern (a bare string on its own line is a no-op statement), and
+        # the brackets are escaped because the pattern is matched as a regex.
+        msg = ("Usecols do not match columns, columns expected but not found: "
+               r"\['E'\]")
+        with tm.assert_raises_regex(ValueError, msg):
+            with tm.assert_produces_warning(UserWarning):
+                self.get_exceldf('test1', ext, usecols=['E'])
+
+    def test_usecols_and_usecols_error(self, ext):
+        # Passing both `usecols` and `usecols_excel` must raise a TypeError.
+        # The two literals are parenthesised so that " of them." is actually
+        # part of the expected message instead of a discarded expression
+        # statement on its own line.
+        msg = ("Cannot specify both `usecols` and `usecols_excel`. Choose one"
+               " of them.")
+        with tm.assert_raises_regex(TypeError, msg):
+            self.get_exceldf('test1', ext, usecols=[0, 2], usecols_excel="A:C")
 
     def test_excel_stop_iterator(self, ext):
 
@@ -421,14 +498,14 @@ def test_read_one_empty_col_no_header(self, ext):
             actual_header_none = read_excel(
                 path,
                 'no_header',
-                usecols=[0],
+                usecols_excel=[0],
                 header=None
             )
 
             actual_header_zero = read_excel(
                 path,
                 'no_header',
-                usecols=[0],
+                usecols_excel=[0],
                 header=0
             )
         expected = DataFrame()
@@ -449,14 +526,14 @@ def test_read_one_empty_col_with_header(self, ext):
             actual_header_none = read_excel(
                 path,
                 'with_header',
-                usecols=[0],
+                usecols_excel=[0],
                 header=None
             )
 
             actual_header_zero = read_excel(
                 path,
                 'with_header',
-                usecols=[0],
+                usecols_excel=[0],
                 header=0
             )
         expected_header_none = DataFrame(pd.Series([0], dtype='int64'))