diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d771da8c787ae9..148cff423f1a0f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -143,7 +143,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- :func:`read_excel` default engine "xlrd" is replaced by "openpyxl" because "xlrd" is deprecated (:issue:`28547`). +- :func:`read_excel` "xlrd" engine is deprecated for all file types that can be handled by "openpyxl" because "xlrd" is no longer maintained (:issue:`28547`). - - diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 432c936ede0f59..c08499b1bc2958 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -826,7 +826,8 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd, openpyxl or odf. See read_excel for more documentation + + Uses xlrd engine by default. See read_excel for more documentation Parameters ---------- @@ -837,7 +838,7 @@ class ExcelFile: engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files. + default ``xlrd`` for .xls* files, ``odf`` for .ods files. Engine compatibility : - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. @@ -860,19 +861,20 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): + ext = None + if not isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = os.path.splitext(str(path_or_buffer))[-1][1:] + if engine is None: - engine = "openpyxl" + engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): if _is_ods_stream(path_or_buffer): engine = "odf" else: - ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": + if ext == "ods": engine = "odf" - elif ext == ".xls": - engine = "xlrd" - elif engine == "xlrd": + elif engine == "xlrd" and ext in ("xlsx", "xlsm"): warnings.warn( 'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. ' 'Specify engine="openpyxl" to suppress this warning.', diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 858b41dae4020b..8bbdcbe12f1928 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -37,8 +37,8 @@ pytest.param( None, marks=[ - td.skip_if_no("openpyxl"), - pytest.mark.filterwarnings("ignore:.*html argument"), + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), ], ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), @@ -54,8 +54,6 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: engine = engine.values[0] if engine == "openpyxl" and read_ext == ".xls": return False - if engine is None and read_ext == ".xls": - return False if engine == "odf" and read_ext != ".ods": return False if read_ext == ".ods" and engine != "odf": @@ -564,7 +562,7 @@ def test_date_conversion_overflow(self, read_ext): columns=["DateColWithBigInt", "StringCol"], ) - if pd.read_excel.keywords["engine"] in ["openpyxl", None]: + if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") result = pd.read_excel("testdateoverflow" + read_ext) @@ -969,19 +967,6 @@ def test_no_header_with_list_index_col(self, read_ext): ) tm.assert_frame_equal(expected, result) - def test_excel_high_surrogate(self, engine, read_ext): - # GH 23809 - if read_ext != ".xlsx": - pytest.skip("Test is only applicable to .xlsx file") - if engine in ["openpyxl", None]: - pytest.skip("Test does not work for openpyxl") - - expected = pd.DataFrame(["\udc88"], columns=["Column1"]) - - # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") - tm.assert_frame_equal(expected, actual) - class TestExcelFileRead: @pytest.fixture(autouse=True) @@ -1137,6 +1122,14 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + def test_excel_high_surrogate(self, engine): + # GH 23809 + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) + @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): # GH 33476 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 1063834b8367c1..461ddbbbc77e33 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,16 +351,12 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - elif engine == "xlwt": + else: import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") - else: # openpyxl - msg = "Worksheet 0 does not exist." - with pytest.raises(KeyError, match=msg): - pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1216,15 +1212,8 @@ def test_bytes_io(self, engine): df.to_excel(writer) writer.save() - if engine == "xlwt": - read_engine = "xlrd" - elif engine == "xlsxwriter": - read_engine = "openpyxl" - else: - read_engine = engine - bio.seek(0) - reread_df = pd.read_excel(bio, index_col=0, engine=read_engine) + reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path):