From 1222a465265b62da21bd7cc94d8533217e0b3b1b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 23 Dec 2020 17:11:39 -0800 Subject: [PATCH] Backport PR #38571: DEPR: Adjust read excel behavior for xlrd >= 2.0 (#38670) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- ci/deps/azure-38-slow.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 239 +++++++++++++++++--------- pandas/tests/io/__init__.py | 4 + pandas/tests/io/excel/__init__.py | 19 ++ pandas/tests/io/excel/test_readers.py | 34 ++++ pandas/tests/io/excel/test_writers.py | 6 +- pandas/tests/io/excel/test_xlrd.py | 24 ++- 9 files changed, 235 insertions(+), 97 deletions(-) diff --git a/ci/deps/azure-38-slow.yaml b/ci/deps/azure-38-slow.yaml index 2b4339cf12658..9651837f26114 100644 --- a/ci/deps/azure-38-slow.yaml +++ b/ci/deps/azure-38-slow.yaml @@ -30,7 +30,7 @@ dependencies: - moto>=1.3.14 - scipy - sqlalchemy - - xlrd<2.0 + - xlrd>=2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index ad72b9c8577e9..e7ac4c783b855 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -33,7 +33,7 @@ dependencies: - s3fs>=0.4.2 - scipy - sqlalchemy - - xlrd<2.0 + - xlrd>=2.0 - xlsxwriter - xlwt - pyreadstat diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 151b41705f9a5..6dc0d0cfaf80b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -27,7 +27,7 @@ including other versions of pandas. **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** This is no longer supported, switch to using ``openpyxl`` instead. - Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning`` + Attempting to use the ``xlwt`` engine will raise a ``FutureWarning`` unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``. While this option is now deprecated and will also raise a ``FutureWarning``, it can be globally set and the warning suppressed. Users are recommended to diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c72f294bf6ac8..221e8b9ccfb14 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,11 +1,13 @@ import abc import datetime +from distutils.version import LooseVersion import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast +from typing import IO, Any, Dict, Mapping, Optional, Union, cast import warnings +import zipfile from pandas._config import config @@ -13,11 +15,12 @@ from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments +from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( @@ -116,17 +119,15 @@ When ``engine=None``, the following logic will be used to determine the engine: - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will - be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, + ``xlrd`` will be used. + - Otherwise if `openpyxl `_ is installed, + then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This + case will raise a ``ValueError`` in a future version of pandas. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -888,39 +889,92 @@ def close(self): return content -def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: +XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +ZIP_SIGNATURE = b"PK\x03\x04" +PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) + + +@doc(storage_options=_shared_docs["storage_options"]) +def inspect_excel_format( + path: Optional[str] = None, + content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, + storage_options: StorageOptions = None, +) -> str: """ - Check if the stream is an OpenDocument Spreadsheet (.ods) file + Inspect the path or content of an excel file and get its format. + + At least one of path or content must be not None. If both are not None, + content will take precedence. - It uses magic values inside the stream + Adopted from xlrd: https://github.com/python-excel/xlrd. Parameters ---------- - stream : Union[BufferedIOBase, RawIOBase] - IO stream with data which might be an ODS file + path : str, optional + Path to file to inspect. May be a URL. + content : file-like object, optional + Content of file to inspect. + {storage_options} Returns ------- - is_ods : bool - Boolean indication that this is indeed an ODS file or not + str + Format of file. + + Raises + ------ + ValueError + If resulting stream is empty. + BadZipFile + If resulting stream does not have an XLS signature and is not a valid zipfile. """ - stream.seek(0) - is_ods = False - if stream.read(4) == b"PK\003\004": - stream.seek(30) - is_ods = ( - stream.read(54) == b"mimetype" - b"application/vnd.oasis.opendocument.spreadsheet" - ) - stream.seek(0) - return is_ods + content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]] + if isinstance(content, bytes): + content_or_path = BytesIO(content) + else: + content_or_path = content or path + assert content_or_path is not None + + with get_handle( + content_or_path, "rb", storage_options=storage_options, is_text=False + ) as handle: + stream = handle.handle + stream.seek(0) + buf = stream.read(PEEK_SIZE) + if buf is None: + raise ValueError("stream is empty") + else: + assert isinstance(buf, bytes) + peek = buf + stream.seek(0) + + if peek.startswith(XLS_SIGNATURE): + return "xls" + elif not peek.startswith(ZIP_SIGNATURE): + raise ValueError("File is not a recognized excel file") + + # ZipFile typing is overly-strict + # https://github.com/python/typeshed/issues/4212 + zf = zipfile.ZipFile(stream) # type: ignore[arg-type] + + # Workaround for some third party files that use forward slashes and + # lower case names. + component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] + + if "xl/workbook.xml" in component_names: + return "xlsx" + if "xl/workbook.bin" in component_names: + return "xlsb" + if "content.xml" in component_names: + return "ods" + return "zip" class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - See read_excel for more documentation + See read_excel for more documentation. Parameters ---------- @@ -947,12 +1001,13 @@ class ExcelFile: - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` - will be used. + - Otherwise if ``path_or_buffer`` is an xls format, + ``xlrd`` will be used. - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + This case will raise a ``ValueError`` in a future version of pandas. .. warning:: @@ -975,33 +1030,47 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): - if engine is None: - # Determine ext and use odf for ods stream/file - if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = None - if _is_ods_stream(path_or_buffer): - engine = "odf" - else: - ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": - engine = "odf" + if engine is not None and engine not in self._engines: + raise ValueError(f"Unknown engine: {engine}") - if ( - import_optional_dependency( - "xlrd", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - from xlrd import Book + # Could be a str, ExcelFile, Book, etc. + self.io = path_or_buffer + # Always a string + self._io = stringify_path(path_or_buffer) - if isinstance(path_or_buffer, Book): - engine = "xlrd" + # Determine xlrd version if installed + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is None + ): + xlrd_version = None + else: + import xlrd - # GH 35029 - Prefer openpyxl except for xls files - if engine is None: - if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": - engine = "xlrd" - elif ( + xlrd_version = LooseVersion(xlrd.__version__) + + if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + # path_or_buffer is path-like, use stringified path + ext = inspect_excel_format( + path=str(self._io), storage_options=storage_options + ) + + if engine is None: + if ext == "ods": + engine = "odf" + elif ext == "xls": + engine = "xlrd" + else: + # GH 35029 - Prefer openpyxl except for xls files + if ( import_optional_dependency( "openpyxl", raise_on_missing=False, on_version="ignore" ) @@ -1009,37 +1078,39 @@ def __init__( ): engine = "openpyxl" else: - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - "The xlrd engine is no longer maintained and is not " - "supported when using pandas with python >= 3.9. However, " - "the engine xlrd will continue to be allowed for the " - "indefinite future. Beginning with pandas 1.2.0, the " - "openpyxl engine will be used if it is installed and the " - "engine argument is not specified. Either install openpyxl " - "or specify engine='xlrd' to silence this warning.", - FutureWarning, - stacklevel=stacklevel, - ) engine = "xlrd" - if engine not in self._engines: - raise ValueError(f"Unknown engine: {engine}") + + if engine == "xlrd" and ext != "xls" and xlrd_version is not None: + if xlrd_version >= "2": + raise ValueError( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. Install openpyxl instead." + ) + else: + caller = inspect.stack()[1] + if ( + caller.filename.endswith( + os.path.join("pandas", "io", "excel", "_base.py") + ) + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. As a result, the " + f"openpyxl engine will be used if it is installed and the " + f"engine argument is not specified. Install " + f"openpyxl instead.", + FutureWarning, + stacklevel=stacklevel, + ) + assert engine in self._engines, f"Engine {engine} not recognized" self.engine = engine self.storage_options = storage_options - # Could be a str, ExcelFile, Book, etc. - self.io = path_or_buffer - # Always a string - self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py index c5e867f45b92d..39474dedba78c 100644 --- a/pandas/tests/io/__init__.py +++ b/pandas/tests/io/__init__.py @@ -14,4 +14,8 @@ r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead." ":PendingDeprecationWarning" ), + # GH 26552 + pytest.mark.filterwarnings( + "ignore:As the xlwt package is no longer maintained:FutureWarning" + ), ] diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index 384f1006c44df..b7ceb28573484 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -1,5 +1,9 @@ +from distutils.version import LooseVersion + import pytest +from pandas.compat._optional import import_optional_dependency + pytestmark = [ pytest.mark.filterwarnings( # Looks like tree.getiterator is deprecated in favor of tree.iter @@ -13,4 +17,19 @@ pytest.mark.filterwarnings( "ignore:As the xlwt package is no longer maintained:FutureWarning" ), + # GH 38571 + pytest.mark.filterwarnings( + "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning" + ), ] + + +if ( + import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore") + is None +): + xlrd_version = None +else: + import xlrd + + xlrd_version = LooseVersion(xlrd.__version__) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 98a55ae39bd77..df1250cee8b00 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -57,6 +58,13 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if read_ext == ".xlsb" and engine != "pyxlsb": return False + if ( + engine == "xlrd" + and xlrd_version is not None + and xlrd_version >= "2" + and read_ext != ".xls" + ): + return False return True @@ -614,6 +622,19 @@ def test_bad_engine_raises(self, read_ext): with pytest.raises(ValueError, match="Unknown engine: foo"): pd.read_excel("", engine=bad_engine) + def test_missing_file_raises(self, read_ext): + bad_file = f"foo{read_ext}" + # CI tests with zh_CN.utf8, translates to "No such file or directory" + with pytest.raises( + FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)" + ): + pd.read_excel(bad_file) + + def test_corrupt_bytes_raises(self, read_ext, engine): + bad_stream = b"foo" + with pytest.raises(ValueError, match="File is not a recognized excel file"): + pd.read_excel(bad_stream) + @tm.network def test_read_from_http_url(self, read_ext): url = ( @@ -1158,6 +1179,19 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + def test_excel_read_binary_via_read_excel(self, read_ext, engine): + # GH 38424 + if read_ext == ".xlsb" and engine == "pyxlsb": + pytest.xfail("GH 38667 - should default to pyxlsb but doesn't") + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + expected = pd.read_excel("test1" + read_ext, engine=engine) + tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif( + xlrd_version is not None and xlrd_version >= "2", + reason="xlrd no longer supports xlsx", + ) def test_excel_high_surrogate(self, engine): # GH 23809 expected = DataFrame(["\udc88"], columns=["Column1"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 80ebeb4c03d89..197738330efe1 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1195,9 +1195,9 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd - engine = "odf" if path.endswith("ods") else "xlrd" - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) + if path.endswith("xlsx") or path.endswith("xlsm"): + pytest.skip("Defaults to openpyxl and fails - GH #38644") + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index f2fbcbc2e2f04..2a1114a9570f0 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version from pandas.io.excel import ExcelFile @@ -17,6 +18,8 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") if read_ext == ".xlsb": pytest.skip("Not valid for xlrd") + if read_ext in (".xlsx", ".xlsm") and xlrd_version >= "2": + pytest.skip("Not valid for xlrd >= 2.0") def test_read_xlrd_book(read_ext, frame): @@ -66,7 +69,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): pd.read_excel(path, "Sheet1", engine=None) -def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): +def test_read_excel_warning_with_xlsx_file(datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") has_openpyxl = ( @@ -76,12 +79,19 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): is not None ) if not has_openpyxl: - with tm.assert_produces_warning( - FutureWarning, - raise_on_extra_warnings=False, - match="The xlrd engine is no longer maintained", - ): - pd.read_excel(path, "Sheet1", engine=None) + if xlrd_version >= "2": + with pytest.raises( + ValueError, + match="Your version of xlrd is ", + ): + pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) else: with tm.assert_produces_warning(None): pd.read_excel(path, "Sheet1", engine=None)