Skip to content

Commit

Permalink
TYP: Typ part of python_parser (#44406)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl committed Nov 28, 2021
1 parent 7f06a8a commit 3d99081
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 57 deletions.
7 changes: 6 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Expand Up @@ -110,7 +110,12 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
# error: Incompatible types in assignment (expression has type
# "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
# Timedelta, Any]]], Index]", variable has type "Index") [assignment]
frame.columns, frame = self._do_date_conversions( # type: ignore[assignment]
frame.columns, frame
)
if self.index_col is not None:
for i, item in enumerate(self.index_col):
if is_integer(item):
Expand Down
63 changes: 53 additions & 10 deletions pandas/io/parsers/base_parser.py
Expand Up @@ -10,10 +10,13 @@
Any,
Callable,
DefaultDict,
Hashable,
Iterable,
Mapping,
Sequence,
cast,
final,
overload,
)
import warnings

Expand Down Expand Up @@ -56,6 +59,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

from pandas import DataFrame
from pandas.core import algorithms
from pandas.core.arrays import Categorical
from pandas.core.indexes.api import (
Expand Down Expand Up @@ -241,7 +245,7 @@ def _open_handles(
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
"""
Check if parse_dates are in columns.
Expand Down Expand Up @@ -337,11 +341,24 @@ def _should_parse_dates(self, i: int) -> bool:

@final
def _extract_multi_indexer_columns(
self, header, index_names, passed_names: bool = False
self,
header,
index_names: list | None,
passed_names: bool = False,
):
"""
extract and return the names, index_names, col_names
header is a list-of-lists returned from the parsers
Extract and return the names, index_names, col_names if the column
names are a MultiIndex.
Parameters
----------
header: list of lists
The header rows
index_names: list, optional
The names of the future index
passed_names: bool, default False
A flag specifying if names were passed
"""
if len(header) < 2:
return header[0], index_names, None, passed_names
Expand Down Expand Up @@ -400,15 +417,15 @@ def extract(r):
return names, index_names, col_names, passed_names

@final
def _maybe_dedup_names(self, names):
def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# see gh-7160 and gh-9424: this helps to provide
# immediate alleviation of the duplicate names
# issue and appears to be satisfactory to users,
# but ultimately, not needing to butcher the names
# would be nice!
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts: DefaultDict[int | str | tuple, int] = defaultdict(int)
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
Expand All @@ -418,6 +435,8 @@ def _maybe_dedup_names(self, names):
counts[col] = cur_count + 1

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
Expand Down Expand Up @@ -572,7 +591,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
@final
def _convert_to_ndarrays(
self,
dct: dict,
dct: Mapping,
na_values,
na_fvalues,
verbose: bool = False,
Expand Down Expand Up @@ -664,7 +683,7 @@ def _convert_to_ndarrays(

@final
def _set_noconvert_dtype_columns(
self, col_indices: list[int], names: list[int | str | tuple]
self, col_indices: list[int], names: Sequence[Hashable]
) -> set[int]:
"""
Set the columns that should not undergo dtype conversions.
Expand Down Expand Up @@ -848,7 +867,27 @@ def _cast_types(self, values, cast_type, column):
) from err
return values

def _do_date_conversions(self, names, data):
@overload
def _do_date_conversions(
self,
names: Index,
data: DataFrame,
) -> tuple[Sequence[Hashable] | Index, DataFrame]:
...

@overload
def _do_date_conversions(
self,
names: Sequence[Hashable],
data: Mapping[Hashable, ArrayLike],
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
...

def _do_date_conversions(
self,
names: Sequence[Hashable] | Index,
data: Mapping[Hashable, ArrayLike] | DataFrame,
) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
# returns data, columns

if self.parse_dates is not None:
Expand All @@ -864,7 +903,11 @@ def _do_date_conversions(self, names, data):

return names, data

def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
def _check_data_length(
self,
columns: Sequence[Hashable],
data: Sequence[ArrayLike],
) -> None:
"""Checks if length of data is equal to length of column names.
One set of trailing commas is allowed. self.index_col not False
Expand Down
8 changes: 4 additions & 4 deletions pandas/io/parsers/c_parser_wrapper.py
Expand Up @@ -279,7 +279,7 @@ def read(self, nrows=None):
data_tups = sorted(data.items())
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, data = self._do_date_conversions(names, data)
names, date_data = self._do_date_conversions(names, data)

else:
# rename dict keys
Expand All @@ -302,13 +302,13 @@ def read(self, nrows=None):

data = {k: v for k, (i, v) in zip(names, data_tups)}

names, data = self._do_date_conversions(names, data)
index, names = self._make_index(data, alldata, names)
names, date_data = self._do_date_conversions(names, data)
index, names = self._make_index(date_data, alldata, names)

# maybe create a mi on the columns
names = self._maybe_make_multi_index_columns(names, self.col_names)

return index, names, data
return index, names, date_data

def _filter_usecols(self, names):
# hackish
Expand Down

0 comments on commit 3d99081

Please sign in to comment.