From d0bebe6c6e538b2f22e5ff895f80135260b6cf10 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 20:20:54 +0100 Subject: [PATCH 1/6] TYP: Type python parser --- pandas/io/parsers/base_parser.py | 23 ++++++------ pandas/io/parsers/c_parser_wrapper.py | 23 +++++++++--- pandas/io/parsers/python_parser.py | 53 ++++++++++++++++++++------- 3 files changed, 68 insertions(+), 31 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5d03529654b0d..21dca55354e8b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -376,9 +376,7 @@ def _extract_multi_indexer_columns( # clean the index_names index_names = header.pop(-1) - index_names, _, _ = self._clean_index_names( - index_names, self.index_col, self.unnamed_cols - ) + index_names, _, _ = self._clean_index_names(index_names, self.index_col) # extract the columns field_count = len(header[0]) @@ -455,21 +453,24 @@ def _maybe_make_multi_index_columns(self, columns, col_names=None): return columns @final - def _make_index(self, data, alldata, columns, indexnamerow=False): + def _make_index( + self, data, alldata, columns, indexnamerow=False + ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: + index: Index | None if not is_index_col(self.index_col) or not self.index_col: index = None elif not self._has_complex_date_col: - index = self._get_simple_index(alldata, columns) - index = self._agg_index(index) + simple_index = self._get_simple_index(alldata, columns) + index = self._agg_index(simple_index) elif self._has_complex_date_col: if not self._name_processed: (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col, self.unnamed_cols + list(columns), self.index_col ) self._name_processed = True - index = self._get_complex_date_index(data, columns) - index = self._agg_index(index, try_parse_dates=False) + date_index = self._get_complex_date_index(data, columns) + index = self._agg_index(date_index, try_parse_dates=False) # add names for the index if indexnamerow: @@ -1022,7 +1023,7 @@ def _validate_usecols_arg(self, usecols): return usecols, usecols_dtype return usecols, None - def _clean_index_names(self, columns, index_col, unnamed_cols): + def _clean_index_names(self, columns, index_col): if not is_index_col(index_col): return None, columns, index_col @@ -1054,7 +1055,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): # Only clean index names that were placeholders. 
for i, name in enumerate(index_names): - if isinstance(name, str) and name in unnamed_cols: + if isinstance(name, str) and name in self.unnamed_cols: index_names[i] = None return index_names, columns, index_col diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 05c963f2d2552..cb0accfa1163a 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,5 +1,9 @@ from __future__ import annotations +from typing import ( + Hashable, + Sequence, +) import warnings import numpy as np @@ -20,6 +24,10 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas import ( + Index, + MultiIndex, +) from pandas.core.indexes.api import ensure_index_from_sequences from pandas.io.parsers.base_parser import ( @@ -172,7 +180,6 @@ def __init__( self.names, # type: ignore[has-type] # error: Cannot determine type of 'index_col' self.index_col, # type: ignore[has-type] - self.unnamed_cols, ) if self.index_names is None: @@ -215,6 +222,8 @@ def _set_noconvert_columns(self): self._reader.set_noconvert(col) def read(self, nrows=None): + index: Index | MultiIndex | Sequence[Hashable] | None + try: if self.low_memory: chunks = self._reader.read_low_memory(nrows) @@ -279,7 +288,7 @@ def read(self, nrows=None): data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) + column_names, date_data = self._do_date_conversions(names, data) else: # rename dict keys @@ -303,12 +312,14 @@ def read(self, nrows=None): data = {k: v for k, (i, v) in zip(names, data_tups)} names, date_data = self._do_date_conversions(names, data) - index, names = self._make_index(date_data, alldata, names) + index, column_names = self._make_index(date_data, alldata, names) # maybe create a mi on the columns - names = self._maybe_make_multi_index_columns(names, self.col_names) + column_names = self._maybe_make_multi_index_columns( + column_names, self.col_names + ) - return index, names, date_data + return index, column_names, date_data def _filter_usecols(self, names): # hackish @@ -325,7 +336,7 @@ def _get_index_names(self): if self._reader.leading_cols == 0 and self.index_col is not None: (idx_names, names, self.index_col) = self._clean_index_names( - names, self.index_col, self.unnamed_cols + names, self.index_col ) return names, idx_names diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 27d0944572024..d12725fce7f1a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -9,12 +9,14 @@ import re import sys from typing import ( + IO, DefaultDict, Hashable, Iterator, Mapping, Sequence, cast, + overload, ) import warnings @@ -36,6 +38,11 @@ from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like +from pandas import ( + Index, + MultiIndex, +) + from pandas.io.parsers.base_parser import ( ParserBase, parser_defaults, @@ -178,7 +185,7 @@ def __init__( ) self.num = re.compile(regex) - def _make_reader(self, f) -> None: + def _make_reader(self, f: IO[str]) -> None: sep = self.delimiter if sep is None or len(sep) == 1: @@ -203,8 +210,8 @@ class MyDialect(csv.Dialect): else: # attempt to sniff the delimiter from the first valid line, # i.e. 
no comment line and not in skiprows - line = f.readline() - lines = self._check_comments([[line]])[0] + line: str = f.readline() + lines: list[str] = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 line = f.readline() @@ -212,15 +219,15 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - line = lines[0] + first_line = lines[0] self.pos += 1 self.line_pos += 1 - sniffed = csv.Sniffer().sniff(line) + sniffed = csv.Sniffer().sniff(first_line) dia.delimiter = sniffed.delimiter # Note: encoding is irrelevant here - line_rdr = csv.reader(StringIO(line), dialect=dia) + line_rdr = csv.reader(StringIO(first_line), dialect=dia) self.buf.extend(list(line_rdr)) # Note: encoding is irrelevant here @@ -244,7 +251,11 @@ def _read(): # TextIOWrapper, mmap, None]") self.data = reader # type: ignore[assignment] - def read(self, rows: int | None = None): + def read( + self, rows: int | None = None + ) -> tuple[ + Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + ]: try: content = self._get_lines(rows) except StopIteration: @@ -284,9 +295,11 @@ def read(self, rows: int | None = None): conv_data = self._convert_data(data) columns, conv_data = self._do_date_conversions(columns, conv_data) - index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) + index, result_columns = self._make_index( + conv_data, alldata, columns, indexnamerow + ) - return index, columns, conv_data + return index, result_columns, conv_data def _exclude_implicit_index( self, @@ -596,7 +609,7 @@ def _handle_usecols( self._col_indices = sorted(col_indices) return columns - def _buffered_line(self): + def _buffered_line(self) -> list[Scalar]: """ Return a line from buffer, filling buffer if required. """ @@ -799,7 +812,17 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: self._alert_malformed(msg, row_num) return None + @overload def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + ... + + @overload + def _check_comments(self, lines: list[list[str]]) -> list[list[str]]: + ... 
+ + def _check_comments( + self, lines: list[list[Scalar]] | list[list[str]] + ) -> list[list[Scalar]] | list[list[str]]: if self.comment is None: return lines ret = [] @@ -886,7 +909,9 @@ def _clear_buffer(self) -> None: _implicit_index = False - def _get_index_name(self, columns: list[Hashable]): + def _get_index_name( + self, columns: list[Hashable] + ) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]: """ Try several cases to get lines: @@ -951,8 +976,8 @@ def _get_index_name(self, columns: list[Hashable]): else: # Case 2 - (index_name, columns_, self.index_col) = self._clean_index_names( - columns, self.index_col, self.unnamed_cols + (index_name, _, self.index_col) = self._clean_index_names( + columns, self.index_col ) return index_name, orig_names, columns @@ -1040,7 +1065,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: ] return zipped_content - def _get_lines(self, rows: int | None = None): + def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: lines = self.buf new_rows = None From 99b1285765e5ceb05bdf3cfa112adb2fbc9b6dfa Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 21:15:36 +0100 Subject: [PATCH 2/6] Fix bug --- pandas/io/parsers/python_parser.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d12725fce7f1a..689ec1198ab7c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,10 +13,10 @@ DefaultDict, Hashable, Iterator, + List, Mapping, Sequence, cast, - overload, ) import warnings @@ -211,15 +211,16 @@ class MyDialect(csv.Dialect): # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows line: str = f.readline() - lines: list[str] = self._check_comments([[line]])[0] + lines = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 line = f.readline() lines = self._check_comments([[line]])[0] + lines_str = cast(List[str], lines) # since `line` was a string, lines will be a list containing # only a single string - first_line = lines[0] + first_line = lines_str[0] self.pos += 1 self.line_pos += 1 @@ -812,17 +813,7 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: self._alert_malformed(msg, row_num) return None - @overload def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: - ... - - @overload - def _check_comments(self, lines: list[list[str]]) -> list[list[str]]: - ... 
- - def _check_comments( - self, lines: list[list[Scalar]] | list[list[str]] - ) -> list[list[Scalar]] | list[list[str]]: if self.comment is None: return lines ret = [] From c1da4a9f12ae04acdcf0251bd0d3c436520cc845 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 1 Dec 2021 21:32:50 +0100 Subject: [PATCH 3/6] Fix assignment issue --- pandas/io/parsers/c_parser_wrapper.py | 1 + pandas/io/parsers/python_parser.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index cb0accfa1163a..d5c56aa6a0692 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -223,6 +223,7 @@ def _set_noconvert_columns(self): def read(self, nrows=None): index: Index | MultiIndex | Sequence[Hashable] | None + column_names: Sequence[Hashable] | MultiIndex try: if self.low_memory: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 689ec1198ab7c..dd6bfc483be2f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -220,7 +220,7 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - first_line = lines_str[0] + first_line: str = lines_str[0] self.pos += 1 self.line_pos += 1 From bfb096f23bcb7bf64732e3d50bbd885124c7a0ea Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 17:01:21 +0100 Subject: [PATCH 4/6] Adress conflicts --- pandas/io/parsers/c_parser_wrapper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e4b5436d1c039..869b82a77da7c 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -232,7 +232,7 @@ def read( Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike], ]: - index: Index | MultiIndex | Sequence[Hashable] | None + index: Index | MultiIndex | None column_names: Sequence[Hashable] | MultiIndex try: if self.low_memory: @@ -300,6 +300,11 @@ def read( column_names, date_data = self._do_date_conversions(names, data) + # maybe create a mi on the columns + column_names = self._maybe_make_multi_index_columns( + column_names, self.col_names + ) + else: # rename dict keys data_tups = sorted(data.items()) @@ -324,11 +329,6 @@ def read( names, date_data = self._do_date_conversions(names, data) index, column_names = self._make_index(date_data, alldata, names) - # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) - return index, column_names, date_data def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: From b019ff74e38c9e0405973e2beee766120876a85b Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 22 Dec 2021 17:13:36 +0100 Subject: [PATCH 5/6] Remove unnecessary changes --- pandas/io/parsers/python_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a8f0e834e846d..2e508c55b98f7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -221,15 +221,15 @@ class MyDialect(csv.Dialect): # since `line` was a string, lines will be a list containing # only a single string - first_line: str = lines_str[0] + line = lines_str[0] self.pos += 1 self.line_pos += 1 - sniffed = csv.Sniffer().sniff(first_line) + sniffed = csv.Sniffer().sniff(line) 
dia.delimiter = sniffed.delimiter # Note: encoding is irrelevant here - line_rdr = csv.reader(StringIO(first_line), dialect=dia) + line_rdr = csv.reader(StringIO(line), dialect=dia) self.buf.extend(list(line_rdr)) # Note: encoding is irrelevant here From ee2f77aae9e20aee7aa4dafadc46edeefdfc38fa Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 28 Jan 2022 11:42:05 +0100 Subject: [PATCH 6/6] Adjust --- pandas/io/parsers/python_parser.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d8c7873255ba3..68be818f4f3d4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -173,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds): ) self.num = re.compile(regex) - def _make_reader(self, f: IO[str]) -> None: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: sep = self.delimiter if sep is None or len(sep) == 1: @@ -198,7 +198,7 @@ class MyDialect(csv.Dialect): else: # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows - line: str = f.readline() + line = f.readline() lines = self._check_comments([[line]])[0] while self.skipfunc(self.pos) or not lines: self.pos += 1 @@ -1146,7 +1146,7 @@ class FixedWidthReader(abc.Iterator): def __init__( self, - f: IO[str], + f: IO[str] | ReadCsvBuffer[str], colspecs: list[tuple[int, int]] | Literal["infer"], delimiter: str | None, comment: str | None, @@ -1243,14 +1243,16 @@ def detect_colspecs( return edge_pairs def __next__(self) -> list[str]: + # Argument 1 to "next" has incompatible type "Union[IO[str], + # ReadCsvBuffer[str]]"; expected "SupportsNext[str]" if self.buffer is not None: try: line = next(self.buffer) except StopIteration: self.buffer = None - line = next(self.f) + line = next(self.f) # type: ignore[arg-type] else: - line = next(self.f) + line = next(self.f) # type: ignore[arg-type] # Note: 'colspecs' is a sequence of half-open intervals. return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] @@ -1267,7 +1269,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _make_reader(self, f: IO[str]) -> None: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: self.data = FixedWidthReader( f, self.colspecs,
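
Illustrative sketch (not part of the patches above): a minimal, standalone example of the cast-based narrowing that PATCH 2/6 switches to after dropping the @overload signatures added in PATCH 1/6. Every name here (the Scalar alias, check_comments, the sample row) is a simplified stand-in for illustration only, not pandas code.

    from typing import List, Union, cast

    Scalar = Union[str, float, bool]  # simplified stand-in for pandas' Scalar alias

    def check_comments(lines: List[List[Scalar]]) -> List[List[Scalar]]:
        # stand-in for PythonParser._check_comments; the real method strips
        # comment suffixes from each row, here it just returns the rows
        return lines

    # The sniffing code path only ever passes rows of strings in, so the result
    # can be narrowed back to List[str] for the csv.Sniffer call without
    # changing the method's broader Scalar-based signature.
    rows = check_comments([["a,b,c"]])
    first_row = cast(List[str], rows[0])
    print(first_row[0])  # -> "a,b,c"

The design trade-off shown in the patch series is the same as in this sketch: the overload pair gives callers precise str-in/str-out typing but duplicates the signature, while the single cast at the one call site that needs the narrower type keeps the method's public signature unchanged.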