From d0bebe6c6e538b2f22e5ff895f80135260b6cf10 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 20:20:54 +0100 Subject: [PATCH 1/6] TYP: Type python parser --- pandas/io/parsers/base_parser.py | 23 ++++++------ pandas/io/parsers/c_parser_wrapper.py | 23 +++++++++--- pandas/io/parsers/python_parser.py | 53 ++++++++++++++++++++------- 3 files changed, 68 insertions(+), 31 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5d03529654b0d..21dca55354e8b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -376,9 +376,7 @@ def _extract_multi_indexer_columns( # clean the index_names index_names = header.pop(-1) - index_names, _, _ = self._clean_index_names( - index_names, self.index_col, self.unnamed_cols - ) + index_names, _, _ = self._clean_index_names(index_names, self.index_col) # extract the columns field_count = len(header[0]) @@ -455,21 +453,24 @@ def _maybe_make_multi_index_columns(self, columns, col_names=None): return columns @final - def _make_index(self, data, alldata, columns, indexnamerow=False): + def _make_index( + self, data, alldata, columns, indexnamerow=False + ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: + index: Index | None if not is_index_col(self.index_col) or not self.index_col: index = None elif not self._has_complex_date_col: - index = self._get_simple_index(alldata, columns) - index = self._agg_index(index) + simple_index = self._get_simple_index(alldata, columns) + index = self._agg_index(simple_index) elif self._has_complex_date_col: if not self._name_processed: (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col, self.unnamed_cols + list(columns), self.index_col ) self._name_processed = True - index = self._get_complex_date_index(data, columns) - index = self._agg_index(index, try_parse_dates=False) + date_index = self._get_complex_date_index(data, columns) + index = self._agg_index(date_index, try_parse_dates=False) # add names for the index if indexnamerow: @@ -1022,7 +1023,7 @@ def _validate_usecols_arg(self, usecols): return usecols, usecols_dtype return usecols, None - def _clean_index_names(self, columns, index_col, unnamed_cols): + def _clean_index_names(self, columns, index_col): if not is_index_col(index_col): return None, columns, index_col @@ -1054,7 +1055,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): # Only clean index names that were placeholders. 
for i, name in enumerate(index_names): - if isinstance(name, str) and name in unnamed_cols: + if isinstance(name, str) and name in self.unnamed_cols: index_names[i] = None return index_names, columns, index_col diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 05c963f2d2552..cb0accfa1163a 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,9 @@ from __future__ import annotations +from typing import ( + Hashable, + Sequence, +) import warnings import numpy as np @@ -20,6 +24,10 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas import ( + Index, + MultiIndex, +) from pandas.core.indexes.api import ensure_index_from_sequences from pandas.io.parsers.base_parser import ( @@ -172,7 +180,6 @@ def __init__( self.names, # type: ignore[has-type] # error: Cannot determine type of 'index_col' self.index_col, # type: ignore[has-type] - self.unnamed_cols, ) if self.index_names is None: @@ -215,6 +222,8 @@ def _set_noconvert_columns(self): self._reader.set_noconvert(col) def read(self, nrows=None): + index: Index | MultiIndex | Sequence[Hashable] | None + try: if self.low_memory: chunks = self._reader.read_low_memory(nrows) @@ -279,7 +288,7 @@ def read(self, nrows=None): data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) + column_names, date_data = self._do_date_conversions(names, data) else: # rename dict keys @@ -303,12 +312,14 @@ def read(self, nrows=None): data = {k: v for k, (i, v) in zip(names, data_tups)} names, date_data = self._do_date_conversions(names, data) - index, names = self._make_index(date_data, alldata, names) + index, column_names = self._make_index(date_data, alldata, names) # maybe create a mi on the columns - names = self._maybe_make_multi_index_columns(names, self.col_names) + column_names = self._maybe_make_multi_index_columns( + column_names, self.col_names + ) - return index, names, date_data + return index, column_names, date_data def _filter_usecols(self, names): # hackish @@ -325,7 +336,7 @@ def _get_index_names(self): if self._reader.leading_cols == 0 and self.index_col is not None: (idx_names, names, self.index_col) = self._clean_index_names( - names, self.index_col, self.unnamed_cols + names, self.index_col ) return names, idx_names diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 27d0944572024..d12725fce7f1a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -9,12 +9,14 @@ import re import sys from typing import ( + IO, DefaultDict, Hashable, Iterator, Mapping, Sequence, cast, + overload, ) import warnings @@ -36,6 +38,11 @@ from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like +from pandas import ( + Index, + MultiIndex, +) + from pandas.io.parsers.base_parser import ( ParserBase, parser_defaults, @@ -178,7 +185,7 @@ def __init__( ) self.num = re.compile(regex) - def _make_reader(self, f) -> None: + def _make_reader(self, f: IO[str]) -> None: sep = self.delimiter if sep is None or len(sep) == 1: @@ -203,8 +210,8 @@ class MyDialect(csv.Dialect): else: # attempt to sniff the delimiter from the first valid line, # i.e. 
no comment line and not in skiprows - line = f.readline() - lines = self._check_comments([[line]])[0] + line: str = f.readline() + lines: list[str] = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 line = f.readline() @@ -212,15 +219,15 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - line = lines[0] + first_line = lines[0] self.pos += 1 self.line_pos += 1 - sniffed = csv.Sniffer().sniff(line) + sniffed = csv.Sniffer().sniff(first_line) dia.delimiter = sniffed.delimiter # Note: encoding is irrelevant here - line_rdr = csv.reader(StringIO(line), dialect=dia) + line_rdr = csv.reader(StringIO(first_line), dialect=dia) self.buf.extend(list(line_rdr)) # Note: encoding is irrelevant here @@ -244,7 +251,11 @@ def _read(): # TextIOWrapper, mmap, None]") self.data = reader # type: ignore[assignment] - def read(self, rows: int | None = None): + def read( + self, rows: int | None = None + ) -> tuple[ + Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + ]: try: content = self._get_lines(rows) except StopIteration: @@ -284,9 +295,11 @@ def read(self, rows: int | None = None): conv_data = self._convert_data(data) columns, conv_data = self._do_date_conversions(columns, conv_data) - index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) + index, result_columns = self._make_index( + conv_data, alldata, columns, indexnamerow + ) - return index, columns, conv_data + return index, result_columns, conv_data def _exclude_implicit_index( self, @@ -596,7 +609,7 @@ def _handle_usecols( self._col_indices = sorted(col_indices) return columns - def _buffered_line(self): + def _buffered_line(self) -> list[Scalar]: """ Return a line from buffer, filling buffer if required. """ @@ -799,7 +812,17 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: self._alert_malformed(msg, row_num) return None + @overload def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + ... + + @overload + def _check_comments(self, lines: list[list[str]]) -> list[list[str]]: + ... 
+ + def _check_comments( + self, lines: list[list[Scalar]] | list[list[str]] + ) -> list[list[Scalar]] | list[list[str]]: if self.comment is None: return lines ret = [] @@ -886,7 +909,9 @@ def _clear_buffer(self) -> None: _implicit_index = False - def _get_index_name(self, columns: list[Hashable]): + def _get_index_name( + self, columns: list[Hashable] + ) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]: """ Try several cases to get lines: @@ -951,8 +976,8 @@ def _get_index_name(self, columns: list[Hashable]): else: # Case 2 - (index_name, columns_, self.index_col) = self._clean_index_names( - columns, self.index_col, self.unnamed_cols + (index_name, _, self.index_col) = self._clean_index_names( + columns, self.index_col ) return index_name, orig_names, columns @@ -1040,7 +1065,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: ] return zipped_content - def _get_lines(self, rows: int | None = None): + def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: lines = self.buf new_rows = None From 99b1285765e5ceb05bdf3cfa112adb2fbc9b6dfa Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 21:15:36 +0100 Subject: [PATCH 2/6] Fix bug --- pandas/io/parsers/python_parser.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d12725fce7f1a..689ec1198ab7c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,10 +13,10 @@ DefaultDict, Hashable, Iterator, + List, Mapping, Sequence, cast, - overload, ) import warnings @@ -211,15 +211,16 @@ class MyDialect(csv.Dialect): # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows line: str = f.readline() - lines: list[str] = self._check_comments([[line]])[0] + lines = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 line = f.readline() lines = self._check_comments([[line]])[0] + lines_str = cast(List[str], lines) # since `line` was a string, lines will be a list containing # only a single string - first_line = lines[0] + first_line = lines_str[0] self.pos += 1 self.line_pos += 1 @@ -812,17 +813,7 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: self._alert_malformed(msg, row_num) return None - @overload def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: - ... - - @overload - def _check_comments(self, lines: list[list[str]]) -> list[list[str]]: - ... 
- - def _check_comments( - self, lines: list[list[Scalar]] | list[list[str]] - ) -> list[list[Scalar]] | list[list[str]]: if self.comment is None: return lines ret = [] From c1da4a9f12ae04acdcf0251bd0d3c436520cc845 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 21:32:50 +0100 Subject: [PATCH 3/6] Fix assignment issue --- pandas/io/parsers/c_parser_wrapper.py | 1 + pandas/io/parsers/python_parser.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index cb0accfa1163a..d5c56aa6a0692 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -223,6 +223,7 @@ def _set_noconvert_columns(self): def read(self, nrows=None): index: Index | MultiIndex | Sequence[Hashable] | None + column_names: Sequence[Hashable] | MultiIndex try: if self.low_memory: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 689ec1198ab7c..dd6bfc483be2f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -220,7 +220,7 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - first_line = lines_str[0] + first_line: str = lines_str[0] self.pos += 1 self.line_pos += 1 From bfb096f23bcb7bf64732e3d50bbd885124c7a0ea Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 17:01:21 +0100 Subject: [PATCH 4/6] Adress conflicts --- pandas/io/parsers/c_parser_wrapper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e4b5436d1c039..869b82a77da7c 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -232,7 +232,7 @@ def read( Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike], ]: - index: Index | MultiIndex | Sequence[Hashable] | None + index: Index | MultiIndex | None column_names: Sequence[Hashable] | MultiIndex try: if self.low_memory: @@ -300,6 +300,11 @@ def read( column_names, date_data = self._do_date_conversions(names, data) + # maybe create a mi on the columns + column_names = self._maybe_make_multi_index_columns( + column_names, self.col_names + ) + else: # rename dict keys data_tups = sorted(data.items()) @@ -324,11 +329,6 @@ def read( names, date_data = self._do_date_conversions(names, data) index, column_names = self._make_index(date_data, alldata, names) - # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) - return index, column_names, date_data def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: From b019ff74e38c9e0405973e2beee766120876a85b Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 17:13:36 +0100 Subject: [PATCH 5/6] Remove unnecessary changes --- pandas/io/parsers/python_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a8f0e834e846d..2e508c55b98f7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -221,15 +221,15 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - first_line: str = lines_str[0] + line = lines_str[0] self.pos += 1 self.line_pos += 1 - sniffed = csv.Sniffer().sniff(first_line) + sniffed = csv.Sniffer().sniff(line) 
dia.delimiter = sniffed.delimiter # Note: encoding is irrelevant here - line_rdr = csv.reader(StringIO(first_line), dialect=dia) + line_rdr = csv.reader(StringIO(line), dialect=dia) self.buf.extend(list(line_rdr)) # Note: encoding is irrelevant here From ee2f77aae9e20aee7aa4dafadc46edeefdfc38fa Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 28 Jan 2022 11:42:05 +0100 Subject: [PATCH 6/6] Adjust --- pandas/io/parsers/python_parser.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d8c7873255ba3..68be818f4f3d4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -173,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds): ) self.num = re.compile(regex) - def _make_reader(self, f: IO[str]) -> None: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: sep = self.delimiter if sep is None or len(sep) == 1: @@ -198,7 +198,7 @@ class MyDialect(csv.Dialect): else: # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows - line: str = f.readline() + line = f.readline() lines = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 @@ -1146,7 +1146,7 @@ class FixedWidthReader(abc.Iterator): def __init__( self, - f: IO[str], + f: IO[str] | ReadCsvBuffer[str], colspecs: list[tuple[int, int]] | Literal["infer"], delimiter: str | None, comment: str | None, @@ -1243,14 +1243,16 @@ def detect_colspecs( return edge_pairs def __next__(self) -> list[str]: + # Argument 1 to "next" has incompatible type "Union[IO[str], + # ReadCsvBuffer[str]]"; expected "SupportsNext[str]" if self.buffer is not None: try: line = next(self.buffer) except StopIteration: self.buffer = None - line = next(self.f) + line = next(self.f) # type: ignore[arg-type] else: - line = next(self.f) + line = next(self.f) # type: ignore[arg-type] # Note: 'colspecs' is a sequence of half-open intervals. return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] @@ -1267,7 +1269,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _make_reader(self, f: IO[str]) -> None: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: self.data = FixedWidthReader( f, self.colspecs,
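
Illustrative sketch (not part of the patches above): a minimal, standalone example of the cast-based narrowing that PATCH 2/6 switches to after dropping the @overload signatures added in PATCH 1/6. Every name here (the Scalar alias, check_comments, the sample row) is a simplified stand-in for illustration only, not pandas code.

    from typing import List, Union, cast

    Scalar = Union[str, float, bool]  # simplified stand-in for pandas' Scalar alias

    def check_comments(lines: List[List[Scalar]]) -> List[List[Scalar]]:
        # stand-in for PythonParser._check_comments; the real method strips
        # comment suffixes from each row, here it just returns the rows
        return lines

    # The sniffing code path only ever passes rows of strings in, so the result
    # can be narrowed back to List[str] for the csv.Sniffer call without
    # changing the method's broader Scalar-based signature.
    rows = check_comments([["a,b,c"]])
    first_row = cast(List[str], rows[0])
    print(first_row[0])  # -> "a,b,c"

The design trade-off shown in the patch series is the same as in this sketch: the overload pair gives callers precise str-in/str-out typing but duplicates the signature, while the single cast at the one call site that needs the narrower type keeps the method's public signature unchanged.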