REF: Read excel parse refactor (#58497)

pandas-dev · May 1, 2024 · f6932cb · f6932cb
1 parent 7320430
commit f6932cb
Showing 1 changed file with 178 additions and 126 deletions.
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -780,143 +780,195 @@ def parse(
                 output[asheetname] = DataFrame()
                 continue
 
-            is_list_header = False
-            is_len_one_list_header = False
-            if is_list_like(header):
-                assert isinstance(header, Sequence)
-                is_list_header = True
-                if len(header) == 1:
-                    is_len_one_list_header = True
-
-            if is_len_one_list_header:
-                header = cast(Sequence[int], header)[0]
-
-            # forward fill and pull out names for MultiIndex column
-            header_names = None
-            if header is not None and is_list_like(header):
-                assert isinstance(header, Sequence)
-
-                header_names = []
-                control_row = [True] * len(data[0])
-
-                for row in header:
-                    if is_integer(skiprows):
-                        assert isinstance(skiprows, int)
-                        row += skiprows
-
-                    if row > len(data) - 1:
-                        raise ValueError(
-                            f"header index {row} exceeds maximum index "
-                            f"{len(data) - 1} of data.",
-                        )
-
-                    data[row], control_row = fill_mi_header(data[row], control_row)
-
-                    if index_col is not None:
-                        header_name, _ = pop_header_name(data[row], index_col)
-                        header_names.append(header_name)
-
-            # If there is a MultiIndex header and an index then there is also
-            # a row containing just the index name(s)
-            has_index_names = False
-            if is_list_header and not is_len_one_list_header and index_col is not None:
-                index_col_list: Sequence[int]
-                if isinstance(index_col, int):
-                    index_col_list = [index_col]
-                else:
-                    assert isinstance(index_col, Sequence)
-                    index_col_list = index_col
-
-                # We have to handle mi without names. If any of the entries in the data
-                # columns are not empty, this is a regular row
-                assert isinstance(header, Sequence)
-                if len(header) < len(data):
-                    potential_index_names = data[len(header)]
-                    potential_data = [
-                        x
-                        for i, x in enumerate(potential_index_names)
-                        if not control_row[i] and i not in index_col_list
-                    ]
-                    has_index_names = all(x == "" or x is None for x in potential_data)
-
-            if is_list_like(index_col):
-                # Forward fill values for MultiIndex index.
-                if header is None:
-                    offset = 0
-                elif isinstance(header, int):
-                    offset = 1 + header
-                else:
-                    offset = 1 + max(header)
+            output = self._parse_sheet(
+                data=data,
+                output=output,
+                asheetname=asheetname,
+                header=header,
+                names=names,
+                index_col=index_col,
+                usecols=usecols,
+                dtype=dtype,
+                skiprows=skiprows,
+                nrows=nrows,
+                true_values=true_values,
+                false_values=false_values,
+                na_values=na_values,
+                parse_dates=parse_dates,
+                date_parser=date_parser,
+                date_format=date_format,
+                thousands=thousands,
+                decimal=decimal,
+                comment=comment,
+                skipfooter=skipfooter,
+                dtype_backend=dtype_backend,
+                **kwds,
+            )
 
-                # GH34673: if MultiIndex names present and not defined in the header,
-                # offset needs to be incremented so that forward filling starts
-                # from the first MI value instead of the name
-                if has_index_names:
-                    offset += 1
+        if last_sheetname is None:
+            raise ValueError("Sheet name is an empty list")
 
-                # Check if we have an empty dataset
-                # before trying to collect data.
-                if offset < len(data):
-                    assert isinstance(index_col, Sequence)
+        if ret_dict:
+            return output
+        else:
+            return output[last_sheetname]
 
-                    for col in index_col:
-                        last = data[offset][col]
+    def _parse_sheet(
+        self,
+        data: list,
+        output: dict,
+        asheetname: str | int | None = None,
+        header: int | Sequence[int] | None = 0,
+        names: SequenceNotStr[Hashable] | range | None = None,
+        index_col: int | Sequence[int] | None = None,
+        usecols=None,
+        dtype: DtypeArg | None = None,
+        skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
+        nrows: int | None = None,
+        true_values: Iterable[Hashable] | None = None,
+        false_values: Iterable[Hashable] | None = None,
+        na_values=None,
+        parse_dates: list | dict | bool = False,
+        date_parser: Callable | lib.NoDefault = lib.no_default,
+        date_format: dict[Hashable, str] | str | None = None,
+        thousands: str | None = None,
+        decimal: str = ".",
+        comment: str | None = None,
+        skipfooter: int = 0,
+        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+        **kwds,
+    ):
+        is_list_header = False
+        is_len_one_list_header = False
+        if is_list_like(header):
+            assert isinstance(header, Sequence)
+            is_list_header = True
+            if len(header) == 1:
+                is_len_one_list_header = True
+
+        if is_len_one_list_header:
+            header = cast(Sequence[int], header)[0]
+
+        # forward fill and pull out names for MultiIndex column
+        header_names = None
+        if header is not None and is_list_like(header):
+            assert isinstance(header, Sequence)
+
+            header_names = []
+            control_row = [True] * len(data[0])
+
+            for row in header:
+                if is_integer(skiprows):
+                    assert isinstance(skiprows, int)
+                    row += skiprows
+
+                if row > len(data) - 1:
+                    raise ValueError(
+                        f"header index {row} exceeds maximum index "
+                        f"{len(data) - 1} of data.",
+                    )
 
-                        for row in range(offset + 1, len(data)):
-                            if data[row][col] == "" or data[row][col] is None:
-                                data[row][col] = last
-                            else:
-                                last = data[row][col]
+                data[row], control_row = fill_mi_header(data[row], control_row)
 
-            # GH 12292 : error when read one empty column from excel file
-            try:
-                parser = TextParser(
-                    data,
-                    names=names,
-                    header=header,
-                    index_col=index_col,
-                    has_index_names=has_index_names,
-                    dtype=dtype,
-                    true_values=true_values,
-                    false_values=false_values,
-                    skiprows=skiprows,
-                    nrows=nrows,
-                    na_values=na_values,
-                    skip_blank_lines=False,  # GH 39808
-                    parse_dates=parse_dates,
-                    date_parser=date_parser,
-                    date_format=date_format,
-                    thousands=thousands,
-                    decimal=decimal,
-                    comment=comment,
-                    skipfooter=skipfooter,
-                    usecols=usecols,
-                    dtype_backend=dtype_backend,
-                    **kwds,
-                )
+                if index_col is not None:
+                    header_name, _ = pop_header_name(data[row], index_col)
+                    header_names.append(header_name)
 
-                output[asheetname] = parser.read(nrows=nrows)
+        # If there is a MultiIndex header and an index then there is also
+        # a row containing just the index name(s)
+        has_index_names = False
+        if is_list_header and not is_len_one_list_header and index_col is not None:
+            index_col_list: Sequence[int]
+            if isinstance(index_col, int):
+                index_col_list = [index_col]
+            else:
+                assert isinstance(index_col, Sequence)
+                index_col_list = index_col
+
+            # We have to handle mi without names. If any of the entries in the data
+            # columns are not empty, this is a regular row
+            assert isinstance(header, Sequence)
+            if len(header) < len(data):
+                potential_index_names = data[len(header)]
+                potential_data = [
+                    x
+                    for i, x in enumerate(potential_index_names)
+                    if not control_row[i] and i not in index_col_list
+                ]
+                has_index_names = all(x == "" or x is None for x in potential_data)
+
+        if is_list_like(index_col):
+            # Forward fill values for MultiIndex index.
+            if header is None:
+                offset = 0
+            elif isinstance(header, int):
+                offset = 1 + header
+            else:
+                offset = 1 + max(header)
+
+            # GH34673: if MultiIndex names present and not defined in the header,
+            # offset needs to be incremented so that forward filling starts
+            # from the first MI value instead of the name
+            if has_index_names:
+                offset += 1
+
+            # Check if we have an empty dataset
+            # before trying to collect data.
+            if offset < len(data):
+                assert isinstance(index_col, Sequence)
+
+                for col in index_col:
+                    last = data[offset][col]
+
+                    for row in range(offset + 1, len(data)):
+                        if data[row][col] == "" or data[row][col] is None:
+                            data[row][col] = last
+                        else:
+                            last = data[row][col]
+
+        # GH 12292 : error when read one empty column from excel file
+        try:
+            parser = TextParser(
+                data,
+                names=names,
+                header=header,
+                index_col=index_col,
+                has_index_names=has_index_names,
+                dtype=dtype,
+                true_values=true_values,
+                false_values=false_values,
+                skiprows=skiprows,
+                nrows=nrows,
+                na_values=na_values,
+                skip_blank_lines=False,  # GH 39808
+                parse_dates=parse_dates,
+                date_parser=date_parser,
+                date_format=date_format,
+                thousands=thousands,
+                decimal=decimal,
+                comment=comment,
+                skipfooter=skipfooter,
+                usecols=usecols,
+                dtype_backend=dtype_backend,
+                **kwds,
+            )
 
-                if header_names:
-                    output[asheetname].columns = output[asheetname].columns.set_names(
-                        header_names
-                    )
+            output[asheetname] = parser.read(nrows=nrows)
 
-            except EmptyDataError:
-                # No Data, return an empty DataFrame
-                output[asheetname] = DataFrame()
+            if header_names:
+                output[asheetname].columns = output[asheetname].columns.set_names(
+                    header_names
+                )
 
-            except Exception as err:
-                err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
-                raise err
+        except EmptyDataError:
+            # No Data, return an empty DataFrame
+            output[asheetname] = DataFrame()
 
-        if last_sheetname is None:
-            raise ValueError("Sheet name is an empty list")
+        except Exception as err:
+            err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
+            raise err
 
-        if ret_dict:
-            return output
-        else:
-            return output[last_sheetname]
+        return output
 
 
 @doc(storage_options=_shared_docs["storage_options"])