From 6c6eede94483bc2b14a75adb7e48e8e0fe534956 Mon Sep 17 00:00:00 2001 From: Jackson Souza Date: Mon, 30 Apr 2018 01:33:40 +0000 Subject: [PATCH] BUG: read_excel return empty dataframe when using usecols and restored capability of passing column labels for columns to be read - [x] closes #18273 - [x] tests added / passed - [x] passes git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8 - [x] whatsnew entry Created 'usecols_excel' that receives a string containing comma separated Excel ranges and columns. Changed 'usecols' named argument, now it receives a list of strings containing column labels or a list of integers representing column indexes or a callable for 'read_excel' function. Created and altered tests to reflect the new usage of these named arguments. Previously, the 'index_col' keyword indicated which columns, within the subset selected by 'usecols' or 'usecols_excel', should be the index of the DataFrame read. Now 'index_col' indicates which columns of the DataFrame will be the index even if that column is not in the subset of the selected columns. --- doc/source/io.rst | 42 ++++++-- doc/source/whatsnew/v0.23.0.txt | 1 + doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/io/excel.py | 100 ++++++++++++++++---- pandas/tests/io/test_excel.py | 163 ++++++++++++++++++++++---------- 5 files changed, 234 insertions(+), 74 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index aa2484b0cb5c3f..e0f848ceb58fb4 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2852,23 +2852,53 @@ Parsing Specific Columns It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. ``read_excel`` takes -a ``usecols`` keyword to allow you to specify a subset of columns to parse. +either a ``usecols`` or ``usecols_excel`` keyword to allow you to specify a +subset of columns to parse. 
Note that you cannot use both the ``usecols`` and +``usecols_excel`` named arguments at the same time. + +If ``usecols_excel`` is supplied, then it is assumed to indicate a comma +separated list of Excel column letters and column ranges to be parsed. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', usecols_excel='A:E') + read_excel('path_to_file.xls', 'Sheet1', usecols_excel='A,C,E:F') If ``usecols`` is an integer, then it is assumed to indicate the last column to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=2) + read_excel('path_to_file.xls', 'Sheet1', usecols=2) + +If ``usecols`` is a list of integers, then it is assumed to be the file +column indices to be parsed. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', usecols=[1, 3, 5]) + +Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + +If ``usecols`` is a list of strings, then it is assumed that each string +corresponds to a column name provided either by the user in ``names`` or +inferred from the document header row(s) and those strings define which columns +will be parsed. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + +Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as +``['joe', 'baz']``. -If `usecols` is a list of integers, then it is assumed to be the file column -indices to be parsed. +If ``usecols`` is callable, the callable function will be evaluated against the +column names, returning names where the callable function evaluates to True. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) -Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 
Parsing Dates +++++++++++++ diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a099fb40c35a78..2ef2660002004a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1325,6 +1325,7 @@ I/O - Bug in :func:`DataFrame.to_latex()` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) +- Bug in :func:`read_excel` where passing the ``usecols`` keyword argument as a list of strings returned an empty ``DataFrame`` (:issue:`18273`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6cbc19cca99e1d..e9e66a568858b1 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -35,7 +35,7 @@ Datetimelike API Changes Other API Changes ^^^^^^^^^^^^^^^^^ -- +- :func:`read_excel` has gained the keyword argument ``usecols_excel`` that receives a string containing comma separated Excel ranges and columns. The ``usecols`` keyword argument of :func:`read_excel` no longer supports a string containing comma separated Excel ranges and columns, nor an int indicating the first j columns to be read in a ``DataFrame``. Also, the ``usecols`` keyword argument of :func:`read_excel` now supports a list of strings containing column labels and a callable. 
(:issue:`18273`) - - diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5608c296374479..593ea249c9f8a7 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -10,6 +10,8 @@ import abc import warnings import numpy as np +import string +import re from io import UnsupportedOperation from pandas.core.dtypes.common import ( @@ -85,20 +87,45 @@ Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``usecols``, index_col - is based on the subset. + subset of data is selected with ``usecols_excel`` or ``usecols``, + index_col is based on the subset. parse_cols : int or list, default None .. deprecated:: 0.21.0 Pass in `usecols` instead. -usecols : int or list, default None +usecols : list-like or callable or int, default None + Return a subset of the columns. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or string + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid list-like + `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Note that + you can not give both ``usecols`` and ``usecols_excel`` keyword arguments + at the same time. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. + + .. versionadded:: 0.24.0 + Added support to column labels and now `usecols_excel` is the keyword that + receives separated comma list of excel columns and ranges. +usecols_excel : string or list, default None + Return a subset of the columns from a spreadsheet specified as Excel column + ranges and columns. 
Note that you can not use both ``usecols`` and + ``usecols_excel`` keyword arguments at the same time. + * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed * If string then indicates comma separated list of Excel column letters and - column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. + column ranges (e.g. "A:E" or "A,C,E:F") to be parsed. Ranges are + inclusive of both sides. + * If list of strings each string shall be an Excel column letter or column + range (e.g. ["A:E"] or ["A", "C", "E:F"]) to be parsed. Ranges are + inclusive of both sides. + + .. versionadded:: 0.24.0 + squeeze : boolean, default False If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None @@ -269,6 +296,17 @@ def _get_default_writer(ext): return _default_writers[ext] +def _is_excel_columns_notation(columns): + """Receives a string and check if the string is a comma separated list of + Excel index columns and index ranges. 
An Excel range is a string with two + column indexes separated by ':').""" + if isinstance(columns, compat.string_types) and all( + (x in string.ascii_letters) for x in re.split(r',|:', columns)): + return True + + return False + + def get_writer(engine_name): try: return _writers[engine_name] @@ -286,6 +324,7 @@ def read_excel(io, names=None, index_col=None, usecols=None, + usecols_excel=None, squeeze=False, dtype=None, engine=None, @@ -311,6 +350,7 @@ def read_excel(io, header=header, names=names, index_col=index_col, + usecols_excel=usecols_excel, usecols=usecols, squeeze=squeeze, dtype=dtype, @@ -405,6 +445,7 @@ def parse(self, names=None, index_col=None, usecols=None, + usecols_excel=None, squeeze=False, converters=None, true_values=None, @@ -439,6 +480,7 @@ def parse(self, header=header, names=names, index_col=index_col, + usecols_excel=usecols_excel, usecols=usecols, squeeze=squeeze, converters=converters, @@ -455,7 +497,7 @@ def parse(self, convert_float=convert_float, **kwds) - def _should_parse(self, i, usecols): + def _should_parse(self, i, usecols_excel, usecols): def _range2cols(areas): """ @@ -481,12 +523,12 @@ def _excel2num(x): cols.append(_excel2num(rng)) return cols - if isinstance(usecols, int): - return i <= usecols - elif isinstance(usecols, compat.string_types): - return i in _range2cols(usecols) - else: - return i in usecols + # check if usecols_excel is a string that indicates a comma separated + # list of Excel column letters and column ranges + if isinstance(usecols_excel, compat.string_types): + return i in _range2cols(usecols_excel) + + return True def _parse_excel(self, sheet_name=0, @@ -494,6 +536,7 @@ def _parse_excel(self, names=None, index_col=None, usecols=None, + usecols_excel=None, squeeze=False, dtype=None, true_values=None, @@ -512,6 +555,25 @@ def _parse_excel(self, _validate_header_arg(header) + if (usecols is not None) and (usecols_excel is not None): + raise ValueError("Cannot specify both `usecols` and " + 
"`usecols_excel`. Choose one of them.") + + # Check if some string in usecols may be interpreted as a Excel + # range or positional column + elif _is_excel_columns_notation(usecols): + warnings.warn("The `usecols` keyword argument used to refer to " + "Excel ranges and columns as strings was " + "renamed to `usecols_excel`.", UserWarning, + stacklevel=3) + usecols_excel = usecols + usecols = None + + elif (usecols_excel is not None) and not _is_excel_columns_notation( + usecols_excel): + raise TypeError("`usecols_excel` must be None or a string as a " + "comma separeted Excel ranges and columns.") + if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") @@ -615,10 +677,13 @@ def _parse_cell(cell_contents, cell_typ): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): - if usecols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, usecols) + if ((usecols is not None) or (usecols_excel is not None) or + (j not in should_parse)): + should_parse[j] = self._should_parse(j, usecols_excel, + usecols) - if usecols is None or should_parse[j]: + if (((usecols_excel is None) and (usecols is None)) or + should_parse[j]): row.append(_parse_cell(value, typ)) data.append(row) @@ -674,6 +739,7 @@ def _parse_cell(cell_contents, cell_typ): dtype=dtype, true_values=true_values, false_values=false_values, + usecols=usecols, skiprows=skiprows, nrows=nrows, na_values=na_values, diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 05423474f330ae..439244a9f62623 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -107,54 +107,43 @@ def get_exceldf(self, basename, ext, *args, **kwds): class ReadingTestsBase(SharedItems): # This is based on ExcelWriterBase - def test_usecols_int(self, ext): - - dfref = self.get_csv_refdf('test1') - dfref = dfref.reindex(columns=['A', 'B', 'C']) - df1 = self.get_exceldf('test1', ext, 
'Sheet1', index_col=0, usecols=3) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=3) - - with tm.assert_produces_warning(FutureWarning): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=3) - - # TODO add index to xls file) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df3, dfref, check_names=False) - def test_usecols_list(self, ext): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['B', 'C']) df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) + usecols=[1, 2]) df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) + index_col=0, usecols=[1, 2]) with tm.assert_produces_warning(FutureWarning): df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=[0, 2, 3]) + index_col=0, parse_cols=[1, 2]) # TODO add index to xls file) tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) tm.assert_frame_equal(df3, dfref, check_names=False) - def test_usecols_str(self, ext): + def test_usecols_excel_str(self, ext): dfref = self.get_csv_refdf('test1') df1 = dfref.reindex(columns=['A', 'B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A:D') + usecols_excel='A:D') df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') + index_col=0, usecols_excel='A:D') - with tm.assert_produces_warning(FutureWarning): + # The following code receives two warnings because FutureWarning is + # thrown when parse_cols is passed in read_excel and UserWarning is + # thrown when parse_cols (usecols) receives an comma separated list of + # Excel indexes and ranges + with tm.assert_produces_warning() as w: df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, parse_cols='A:D') + assert 
issubclass(w[0].category, FutureWarning) + assert issubclass(w[1].category, UserWarning) # TODO add index to xls, read xls ignores index name ? tm.assert_frame_equal(df2, df1, check_names=False) @@ -163,21 +152,107 @@ def test_usecols_str(self, ext): df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C,D') + usecols_excel='A,C,D') df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') + index_col=0, usecols_excel='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C:D') + usecols_excel='A,C:D') df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') + index_col=0, usecols_excel='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) + def test_usecols_diff_positional_int_columns_order(self, ext): + + df1 = self.get_csv_refdf('test1')[['A', 'C']] + + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=[0, 2]) + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=[2, 0]) + + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + def test_usecols_diff_positional_str_columns_order(self, ext): + + df1 = self.get_csv_refdf('test1')[['B', 'D']] + + df2 = self.get_exceldf('test1', ext, 'Sheet1', usecols=['B', 'D']) + df3 = self.get_exceldf('test1', ext, 'Sheet1', usecols=['D', 'B']) + + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + def test_read_excel_without_slicing(self, ext): + + df1 = self.get_csv_refdf('test1') + df2 = self.get_exceldf('test1', ext, 'Sheet1') + + tm.assert_frame_equal(df2, df1, check_names=False) + + def 
test_pass_callable_argument(self, ext): + + dfref = self.get_csv_refdf('test1')[['C', 'D']] + + df1 = dfref.reindex(columns=['C', 'D']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=lambda x: x > 'B') + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols_excel='A,D:E') + + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + def test_usecols_deprecated_excel_range_str(self, ext): + + dfref = self.get_csv_refdf('test1')[['B', 'C', 'D']] + + df1 = dfref.reindex(columns=['C', 'D']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=['C', 'D']) + with tm.assert_produces_warning(UserWarning): + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A,D:E') + + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + def test_index_col_label_error(self, ext): + msg = "list indices must be integers.*, not str" + with tm.assert_raises_regex(TypeError, msg): + self.get_exceldf('test1', ext, 'Sheet1', index_col=["A"], + usecols=["", "A", "C"]) + + def test_pass_non_existent_column(self, ext): + msg = "Usecols do not match columns, columns expected but not found: " + "['E']" + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf('test1', ext, usecols=['E']) + + def test_usecols_excel_wrong_type(self, ext): + msg = "`usecols_excel` must be None or a string as a comma separeted " + " Excel ranges and columns." + with tm.assert_raises_regex(TypeError, msg): + self.get_exceldf('test1', ext, usecols_excel=1) + + def test_usecols_wrong_type(self, ext): + msg = "'usecols' must either be list-like of all strings, all unicode," + " all integers or a callable." + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf('test1', ext, usecols='E1') + + def test_usecols_and_usecols_excel_error(self, ext): + msg = "Cannot specify both `usecols` and `usecols_excel`. 
Choose one" + " of them." + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf('test1', ext, usecols=[0, 2], usecols_excel="A:C") + def test_excel_stop_iterator(self, ext): parsed = self.get_exceldf('test2', ext, 'Sheet1') @@ -422,7 +497,8 @@ def test_read_one_empty_col_no_header(self, ext): path, 'no_header', usecols=[0], - header=None + header=None, + nrows=0 ) actual_header_zero = read_excel( @@ -431,9 +507,10 @@ def test_read_one_empty_col_no_header(self, ext): usecols=[0], header=0 ) - expected = DataFrame() - tm.assert_frame_equal(actual_header_none, expected) - tm.assert_frame_equal(actual_header_zero, expected) + expected_header_none = DataFrame(columns=[0]) + tm.assert_frame_equal(actual_header_none, expected_header_none) + expected_header_zero = DataFrame({1: [2, 3, 4]}, index=3 * [np.nan]) + tm.assert_frame_equal(actual_header_zero, expected_header_zero) @td.skip_if_no('openpyxl') @td.skip_if_no('xlwt') @@ -450,7 +527,8 @@ def test_read_one_empty_col_with_header(self, ext): path, 'with_header', usecols=[0], - header=None + header=None, + nrows=1 ) actual_header_zero = read_excel( @@ -461,7 +539,7 @@ def test_read_one_empty_col_with_header(self, ext): ) expected_header_none = DataFrame(pd.Series([0], dtype='int64')) tm.assert_frame_equal(actual_header_none, expected_header_none) - expected_header_zero = DataFrame(columns=[0]) + expected_header_zero = DataFrame(pd.Series(4 * [np.nan])) tm.assert_frame_equal(actual_header_zero, expected_header_zero) @td.skip_if_no('openpyxl') @@ -503,35 +581,20 @@ def test_sheet_name_and_sheetname(self, ext): # GH10559: Minor improvement: Change "sheet_name" to "sheetname" # GH10969: DOC: Consistent var names (sheetname vs sheet_name) # GH12604: CLN GH10559 Rename sheetname variable to sheet_name - # GH20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument dfref = self.get_csv_refdf('test1') - df1 = self.get_exceldf('test1', ext, - sheet_name='Sheet1') # doc + df1 = 
self.get_exceldf('test1', ext, sheet_name='Sheet1') # doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df2 = self.get_exceldf('test1', ext, sheetname='Sheet1') # bkwrd compat - excel = self.get_excelfile('test1', ext) - df1_parse = excel.parse(sheet_name='Sheet1') # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(sheetname='Sheet1') # bkwrd compat - tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df1_parse, dfref, check_names=False) - tm.assert_frame_equal(df2_parse, dfref, check_names=False) def test_sheet_name_both_raises(self, ext): with tm.assert_raises_regex(TypeError, "Cannot specify both"): self.get_exceldf('test1', ext, sheetname='Sheet1', sheet_name='Sheet1') - excel = self.get_excelfile('test1', ext) - with tm.assert_raises_regex(TypeError, "Cannot specify both"): - excel.parse(sheetname='Sheet1', - sheet_name='Sheet1') - @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader(ReadingTestsBase):