Skip to content

Commit

Permalink
ENH: loosen XLS signature (#41321)
Browse files Browse the repository at this point in the history
  • Loading branch information
geoffrey-eisenbarth committed May 21, 2021
1 parent 269a3e4 commit b3e3352
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 13 deletions.
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v1.3.0.rst
Expand Up @@ -197,7 +197,7 @@ Other enhancements
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
- Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`)
- :func:`pandas.read_excel` can now auto detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`)
- :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`)
- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`)
- :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
Expand Down Expand Up @@ -850,6 +850,8 @@ I/O
- Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`)
- Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
- Bug in :func:`read_excel` raising an error when pandas could not determine the file type, even when the user specified the ``engine`` argument (:issue:`41225`)
-

Period
^^^^^^
Expand Down
30 changes: 20 additions & 10 deletions pandas/io/excel/_base.py
Expand Up @@ -1014,16 +1014,21 @@ def close(self):
return content


XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
XLS_SIGNATURES = (
b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2
b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3
b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4
b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary
)
ZIP_SIGNATURE = b"PK\x03\x04"
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
content_or_path: FilePathOrBuffer,
storage_options: StorageOptions = None,
) -> str:
) -> str | None:
"""
Inspect the path or content of an excel file and get its format.
Expand All @@ -1037,8 +1042,8 @@ def inspect_excel_format(
Returns
-------
str
Format of file.
str or None
Format of file if it can be determined.
Raises
------
Expand All @@ -1063,10 +1068,10 @@ def inspect_excel_format(
peek = buf
stream.seek(0)

if peek.startswith(XLS_SIGNATURE):
if any(peek.startswith(sig) for sig in XLS_SIGNATURES):
return "xls"
elif not peek.startswith(ZIP_SIGNATURE):
raise ValueError("File is not a recognized excel file")
return None

# ZipFile typing is overly-strict
# https://github.com/python/typeshed/issues/4212
Expand Down Expand Up @@ -1174,8 +1179,12 @@ def __init__(
ext = inspect_excel_format(
content_or_path=path_or_buffer, storage_options=storage_options
)
if ext is None:
raise ValueError(
"Excel file format cannot be determined, you must specify "
"an engine manually."
)

# ext will always be valid, otherwise inspect_excel_format would raise
engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
if engine == "auto":
engine = get_default_engine(ext, mode="reader")
Expand All @@ -1190,12 +1199,13 @@ def __init__(
path_or_buffer, storage_options=storage_options
)

if ext != "xls" and xlrd_version >= Version("2"):
# Pass through if ext is None; otherwise check whether ext is valid for xlrd
if ext and ext != "xls" and xlrd_version >= Version("2"):
raise ValueError(
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
f"only the xls format is supported. Install openpyxl instead."
)
elif ext != "xls":
elif ext and ext != "xls":
caller = inspect.stack()[1]
if (
caller.filename.endswith(
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/io/excel/test_readers.py
Expand Up @@ -727,9 +727,20 @@ def test_missing_file_raises(self, read_ext):

def test_corrupt_bytes_raises(self, read_ext, engine):
bad_stream = b"foo"
if engine is None or engine == "xlrd":
if engine is None:
error = ValueError
msg = "File is not a recognized excel file"
msg = (
"Excel file format cannot be determined, you must "
"specify an engine manually."
)
elif engine == "xlrd":
from xlrd import XLRDError

error = XLRDError
msg = (
"Unsupported format, or corrupt file: Expected BOF "
"record; found b'foo'"
)
else:
error = BadZipFile
msg = "File is not a zip file"
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/io/excel/test_xlrd.py
@@ -1,3 +1,5 @@
import io

import pytest

from pandas.compat._optional import import_optional_dependency
Expand All @@ -8,6 +10,7 @@
from pandas.util.version import Version

from pandas.io.excel import ExcelFile
from pandas.io.excel._base import inspect_excel_format

xlrd = pytest.importorskip("xlrd")
xlwt = pytest.importorskip("xlwt")
Expand Down Expand Up @@ -78,3 +81,18 @@ def test_read_excel_warning_with_xlsx_file(datapath):
else:
with tm.assert_produces_warning(None):
pd.read_excel(path, "Sheet1", engine=None)


@pytest.mark.parametrize(
    "file_header",
    [
        b"\x09\x00\x04\x00\x07\x00\x10\x00",  # BIFF2
        b"\x09\x02\x06\x00\x00\x00\x10\x00",  # BIFF3
        b"\x09\x04\x06\x00\x00\x00\x10\x00",  # BIFF4
        b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1",  # Compound File Binary
    ],
)
def test_read_old_xls_files(file_header):
    """Check that a buffer holding only a legacy signature is detected as "xls"."""
    # GH 41226
    stream = io.BytesIO(file_header)
    detected_format = inspect_excel_format(stream)
    assert detected_format == "xls"

0 comments on commit b3e3352

Please sign in to comment.