From 1222a465265b62da21bd7cc94d8533217e0b3b1b Mon Sep 17 00:00:00 2001
From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com>
Date: Wed, 23 Dec 2020 17:11:39 -0800
Subject: [PATCH] Backport PR #38571: DEPR: Adjust read excel behavior for xlrd
 >= 2.0 (#38670)

Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
---
 ci/deps/azure-38-slow.yaml            |   2 +-
 ci/deps/azure-windows-37.yaml         |   2 +-
 doc/source/whatsnew/v1.2.0.rst        |   2 +-
 pandas/io/excel/_base.py              | 239 +++++++++++++++++---------
 pandas/tests/io/__init__.py           |   4 +
 pandas/tests/io/excel/__init__.py     |  19 ++
 pandas/tests/io/excel/test_readers.py |  34 ++++
 pandas/tests/io/excel/test_writers.py |   6 +-
 pandas/tests/io/excel/test_xlrd.py    |  24 ++-
 9 files changed, 235 insertions(+), 97 deletions(-)

diff --git a/ci/deps/azure-38-slow.yaml b/ci/deps/azure-38-slow.yaml
index 2b4339cf12658..9651837f26114 100644
--- a/ci/deps/azure-38-slow.yaml
+++ b/ci/deps/azure-38-slow.yaml
@@ -30,7 +30,7 @@ dependencies:
   - moto>=1.3.14
   - scipy
   - sqlalchemy
-  - xlrd<2.0
+  - xlrd>=2.0
   - xlsxwriter
   - xlwt
   - moto
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index ad72b9c8577e9..e7ac4c783b855 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -33,7 +33,7 @@ dependencies:
   - s3fs>=0.4.2
   - scipy
   - sqlalchemy
-  - xlrd<2.0
+  - xlrd>=2.0
   - xlsxwriter
   - xlwt
   - pyreadstat
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 151b41705f9a5..6dc0d0cfaf80b 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -27,7 +27,7 @@ including other versions of pandas.
    **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
    This is no longer supported, switch to using ``openpyxl`` instead.
 
-   Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning``
+   Attempting to use the ``xlwt`` engine will raise a ``FutureWarning``
    unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``.
    While this option is now deprecated and will also raise a ``FutureWarning``,
    it can be globally set and the warning suppressed. Users are recommended to
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index c72f294bf6ac8..221e8b9ccfb14 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,11 +1,13 @@
 import abc
 import datetime
+from distutils.version import LooseVersion
 import inspect
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
-from typing import Any, Dict, Mapping, Union, cast
+from typing import IO, Any, Dict, Mapping, Optional, Union, cast
 import warnings
+import zipfile
 
 from pandas._config import config
 
@@ -13,11 +15,12 @@
 from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import EmptyDataError
-from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
+from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc
 
 from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
 
 from pandas.core.frame import DataFrame
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
 from pandas.io.excel._util import (
@@ -116,17 +119,15 @@
         When ``engine=None``, the following logic will be
         used to determine the engine:
 
-        - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
-          then `odf <https://pypi.org/project/odfpy/>`_ will be used.
-        - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
-          extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
-          be used.
-        - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
-          then ``openpyxl`` will be used.
-        - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
-
-        Specifying ``engine="xlrd"`` will continue to be allowed for the
-        indefinite future.
+       - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+         then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+       - Otherwise if ``path_or_buffer`` is an xls format,
+         ``xlrd`` will be used.
+       - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+         then ``openpyxl`` will be used.
+       - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
+       - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
+         case will raise a ``ValueError`` in a future version of pandas.
 
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can
@@ -888,39 +889,92 @@ def close(self):
         return content
 
 
-def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
+XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+ZIP_SIGNATURE = b"PK\x03\x04"
+PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def inspect_excel_format(
+    path: Optional[str] = None,
+    content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
+    storage_options: StorageOptions = None,
+) -> str:
     """
-    Check if the stream is an OpenDocument Spreadsheet (.ods) file
+    Inspect the path or content of an excel file and get its format.
+
+    At least one of path or content must be not None. If both are not None,
+    content will take precedence.
 
-    It uses magic values inside the stream
+    Adopted from xlrd: https://github.com/python-excel/xlrd.
 
     Parameters
     ----------
-    stream : Union[BufferedIOBase, RawIOBase]
-        IO stream with data which might be an ODS file
+    path : str, optional
+        Path to file to inspect. May be a URL.
+    content : file-like object, optional
+        Content of file to inspect.
+    {storage_options}
 
     Returns
     -------
-    is_ods : bool
-        Boolean indication that this is indeed an ODS file or not
+    str
+        Format of file.
+
+    Raises
+    ------
+    ValueError
+        If resulting stream is empty.
+    BadZipFile
+        If resulting stream does not have an XLS signature and is not a valid zipfile.
     """
-    stream.seek(0)
-    is_ods = False
-    if stream.read(4) == b"PK\003\004":
-        stream.seek(30)
-        is_ods = (
-            stream.read(54) == b"mimetype"
-            b"application/vnd.oasis.opendocument.spreadsheet"
-        )
-    stream.seek(0)
-    return is_ods
+    content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
+    if isinstance(content, bytes):
+        content_or_path = BytesIO(content)
+    else:
+        content_or_path = content or path
+    assert content_or_path is not None
+
+    with get_handle(
+        content_or_path, "rb", storage_options=storage_options, is_text=False
+    ) as handle:
+        stream = handle.handle
+        stream.seek(0)
+        buf = stream.read(PEEK_SIZE)
+        if buf is None:
+            raise ValueError("stream is empty")
+        else:
+            assert isinstance(buf, bytes)
+            peek = buf
+        stream.seek(0)
+
+        if peek.startswith(XLS_SIGNATURE):
+            return "xls"
+        elif not peek.startswith(ZIP_SIGNATURE):
+            raise ValueError("File is not a recognized excel file")
+
+        # ZipFile typing is overly-strict
+        # https://github.com/python/typeshed/issues/4212
+        zf = zipfile.ZipFile(stream)  # type: ignore[arg-type]
+
+        # Workaround for some third party files that use forward slashes and
+        # lower case names.
+        component_names = [name.replace("\\", "/").lower() for name in zf.namelist()]
+
+        if "xl/workbook.xml" in component_names:
+            return "xlsx"
+        if "xl/workbook.bin" in component_names:
+            return "xlsb"
+        if "content.xml" in component_names:
+            return "ods"
+        return "zip"
 
 
 class ExcelFile:
     """
     Class for parsing tabular excel sheets into DataFrame objects.
 
-    See read_excel for more documentation
+    See read_excel for more documentation.
 
     Parameters
     ----------
@@ -947,12 +1001,13 @@ class ExcelFile:
 
            - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
              then `odf <https://pypi.org/project/odfpy/>`_ will be used.
-           - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
-             extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
-             will be used.
+           - Otherwise if ``path_or_buffer`` is an xls format,
+             ``xlrd`` will be used.
            - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
              then ``openpyxl`` will be used.
+           - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
            - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
+             This case will raise a ``ValueError`` in a future version of pandas.
 
            .. warning::
 
@@ -975,33 +1030,47 @@ class ExcelFile:
     def __init__(
         self, path_or_buffer, engine=None, storage_options: StorageOptions = None
     ):
-        if engine is None:
-            # Determine ext and use odf for ods stream/file
-            if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
-                ext = None
-                if _is_ods_stream(path_or_buffer):
-                    engine = "odf"
-            else:
-                ext = os.path.splitext(str(path_or_buffer))[-1]
-                if ext == ".ods":
-                    engine = "odf"
+        if engine is not None and engine not in self._engines:
+            raise ValueError(f"Unknown engine: {engine}")
 
-            if (
-                import_optional_dependency(
-                    "xlrd", raise_on_missing=False, on_version="ignore"
-                )
-                is not None
-            ):
-                from xlrd import Book
+        # Could be a str, ExcelFile, Book, etc.
+        self.io = path_or_buffer
+        # Always a string
+        self._io = stringify_path(path_or_buffer)
 
-                if isinstance(path_or_buffer, Book):
-                    engine = "xlrd"
+        # Determine xlrd version if installed
+        if (
+            import_optional_dependency(
+                "xlrd", raise_on_missing=False, on_version="ignore"
+            )
+            is None
+        ):
+            xlrd_version = None
+        else:
+            import xlrd
 
-            # GH 35029 - Prefer openpyxl except for xls files
-            if engine is None:
-                if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
-                    engine = "xlrd"
-                elif (
+            xlrd_version = LooseVersion(xlrd.__version__)
+
+        if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)):
+            ext = inspect_excel_format(
+                content=path_or_buffer, storage_options=storage_options
+            )
+        elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
+            ext = "xls"
+        else:
+            # path_or_buffer is path-like, use stringified path
+            ext = inspect_excel_format(
+                path=str(self._io), storage_options=storage_options
+            )
+
+        if engine is None:
+            if ext == "ods":
+                engine = "odf"
+            elif ext == "xls":
+                engine = "xlrd"
+            else:
+                # GH 35029 - Prefer openpyxl except for xls files
+                if (
                     import_optional_dependency(
                         "openpyxl", raise_on_missing=False, on_version="ignore"
                     )
@@ -1009,37 +1078,39 @@ def __init__(
                 ):
                     engine = "openpyxl"
                 else:
-                    caller = inspect.stack()[1]
-                    if (
-                        caller.filename.endswith("pandas/io/excel/_base.py")
-                        and caller.function == "read_excel"
-                    ):
-                        stacklevel = 4
-                    else:
-                        stacklevel = 2
-                    warnings.warn(
-                        "The xlrd engine is no longer maintained and is not "
-                        "supported when using pandas with python >= 3.9. However, "
-                        "the engine xlrd will continue to be allowed for the "
-                        "indefinite future. Beginning with pandas 1.2.0, the "
-                        "openpyxl engine will be used if it is installed and the "
-                        "engine argument is not specified. Either install openpyxl "
-                        "or specify engine='xlrd' to silence this warning.",
-                        FutureWarning,
-                        stacklevel=stacklevel,
-                    )
                     engine = "xlrd"
-        if engine not in self._engines:
-            raise ValueError(f"Unknown engine: {engine}")
+
+        if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
+            if xlrd_version >= "2":
+                raise ValueError(
+                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
+                    f"only the xls format is supported. Install openpyxl instead."
+                )
+            else:
+                caller = inspect.stack()[1]
+                if (
+                    caller.filename.endswith(
+                        os.path.join("pandas", "io", "excel", "_base.py")
+                    )
+                    and caller.function == "read_excel"
+                ):
+                    stacklevel = 4
+                else:
+                    stacklevel = 2
+                warnings.warn(
+                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
+                    f"only the xls format is supported. As a result, the "
+                    f"openpyxl engine will be used if it is installed and the "
+                    f"engine argument is not specified. Install "
+                    f"openpyxl instead.",
+                    FutureWarning,
+                    stacklevel=stacklevel,
+                )
+        assert engine in self._engines, f"Engine {engine} not recognized"
 
         self.engine = engine
         self.storage_options = storage_options
 
-        # Could be a str, ExcelFile, Book, etc.
-        self.io = path_or_buffer
-        # Always a string
-        self._io = stringify_path(path_or_buffer)
-
         self._reader = self._engines[engine](self._io, storage_options=storage_options)
 
     def __fspath__(self):
diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py
index c5e867f45b92d..39474dedba78c 100644
--- a/pandas/tests/io/__init__.py
+++ b/pandas/tests/io/__init__.py
@@ -14,4 +14,8 @@
         r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead."
         ":PendingDeprecationWarning"
     ),
+    # GH 26552
+    pytest.mark.filterwarnings(
+        "ignore:As the xlwt package is no longer maintained:FutureWarning"
+    ),
 ]
diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py
index 384f1006c44df..b7ceb28573484 100644
--- a/pandas/tests/io/excel/__init__.py
+++ b/pandas/tests/io/excel/__init__.py
@@ -1,5 +1,9 @@
+from distutils.version import LooseVersion
+
 import pytest
 
+from pandas.compat._optional import import_optional_dependency
+
 pytestmark = [
     pytest.mark.filterwarnings(
         # Looks like tree.getiterator is deprecated in favor of tree.iter
@@ -13,4 +17,19 @@
     pytest.mark.filterwarnings(
         "ignore:As the xlwt package is no longer maintained:FutureWarning"
     ),
+    # GH 38571
+    pytest.mark.filterwarnings(
+        "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning"
+    ),
 ]
+
+
+if (
+    import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore")
+    is None
+):
+    xlrd_version = None
+else:
+    import xlrd
+
+    xlrd_version = LooseVersion(xlrd.__version__)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 98a55ae39bd77..df1250cee8b00 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -11,6 +11,7 @@
 import pandas as pd
 from pandas import DataFrame, Index, MultiIndex, Series
 import pandas._testing as tm
+from pandas.tests.io.excel import xlrd_version
 
 read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 engine_params = [
@@ -57,6 +58,13 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
         return False
     if read_ext == ".xlsb" and engine != "pyxlsb":
         return False
+    if (
+        engine == "xlrd"
+        and xlrd_version is not None
+        and xlrd_version >= "2"
+        and read_ext != ".xls"
+    ):
+        return False
     return True
 
 
@@ -614,6 +622,19 @@ def test_bad_engine_raises(self, read_ext):
         with pytest.raises(ValueError, match="Unknown engine: foo"):
             pd.read_excel("", engine=bad_engine)
 
+    def test_missing_file_raises(self, read_ext):
+        bad_file = f"foo{read_ext}"
+        # CI tests with zh_CN.utf8, translates to "No such file or directory"
+        with pytest.raises(
+            FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)"
+        ):
+            pd.read_excel(bad_file)
+
+    def test_corrupt_bytes_raises(self, read_ext, engine):
+        bad_stream = b"foo"
+        with pytest.raises(ValueError, match="File is not a recognized excel file"):
+            pd.read_excel(bad_stream)
+
     @tm.network
     def test_read_from_http_url(self, read_ext):
         url = (
@@ -1158,6 +1179,19 @@ def test_excel_read_binary(self, engine, read_ext):
         actual = pd.read_excel(data, engine=engine)
         tm.assert_frame_equal(expected, actual)
 
+    def test_excel_read_binary_via_read_excel(self, read_ext, engine):
+        # GH 38424
+        if read_ext == ".xlsb" and engine == "pyxlsb":
+            pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
+        with open("test1" + read_ext, "rb") as f:
+            result = pd.read_excel(f)
+        expected = pd.read_excel("test1" + read_ext, engine=engine)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.skipif(
+        xlrd_version is not None and xlrd_version >= "2",
+        reason="xlrd no longer supports xlsx",
+    )
     def test_excel_high_surrogate(self, engine):
         # GH 23809
         expected = DataFrame(["\udc88"], columns=["Column1"])
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 80ebeb4c03d89..197738330efe1 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1195,9 +1195,9 @@ def test_datetimes(self, path):
 
         write_frame = DataFrame({"A": datetimes})
         write_frame.to_excel(path, "Sheet1")
-        # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
-        engine = "odf" if path.endswith("ods") else "xlrd"
-        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
+        if path.endswith("xlsx") or path.endswith("xlsm"):
+            pytest.skip("Defaults to openpyxl and fails - GH #38644")
+        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
 
         tm.assert_series_equal(write_frame["A"], read_frame["A"])
 
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
index f2fbcbc2e2f04..2a1114a9570f0 100644
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.tests.io.excel import xlrd_version
 
 from pandas.io.excel import ExcelFile
 
@@ -17,6 +18,8 @@ def skip_ods_and_xlsb_files(read_ext):
         pytest.skip("Not valid for xlrd")
     if read_ext == ".xlsb":
         pytest.skip("Not valid for xlrd")
+    if read_ext in (".xlsx", ".xlsm") and xlrd_version >= "2":
+        pytest.skip("Not valid for xlrd >= 2.0")
 
 
 def test_read_xlrd_book(read_ext, frame):
@@ -66,7 +69,7 @@ def test_excel_file_warning_with_xlsx_file(datapath):
             pd.read_excel(path, "Sheet1", engine=None)
 
 
-def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
+def test_read_excel_warning_with_xlsx_file(datapath):
     # GH 29375
     path = datapath("io", "data", "excel", "test1.xlsx")
     has_openpyxl = (
@@ -76,12 +79,19 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
         is not None
     )
     if not has_openpyxl:
-        with tm.assert_produces_warning(
-            FutureWarning,
-            raise_on_extra_warnings=False,
-            match="The xlrd engine is no longer maintained",
-        ):
-            pd.read_excel(path, "Sheet1", engine=None)
+        if xlrd_version >= "2":
+            with pytest.raises(
+                ValueError,
+                match="Your version of xlrd is ",
+            ):
+                pd.read_excel(path, "Sheet1", engine=None)
+        else:
+            with tm.assert_produces_warning(
+                FutureWarning,
+                raise_on_extra_warnings=False,
+                match="The xlrd engine is no longer maintained",
+            ):
+                pd.read_excel(path, "Sheet1", engine=None)
     else:
         with tm.assert_produces_warning(None):
             pd.read_excel(path, "Sheet1", engine=None)