Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TYP: Typ part of python_parser #44406

Merged
merged 18 commits into from Nov 28, 2021
7 changes: 6 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Expand Up @@ -110,7 +110,12 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
# error: Incompatible types in assignment (expression has type
# "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
# Timedelta, Any]]], Index]", variable has type "Index") [assignment]
frame.columns, frame = self._do_date_conversions( # type: ignore[assignment]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to typing: Can _do_date_conversions return a new DataFrame object? If yes, this line should maybe be re-written as

columns, frame = self._do_date_conversions(...)
frame.columns = columns

otherwise, python will set columns on the old frame object and then replace the frame object.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but it will return the input without modification. This happens only if self.parse_dates is None. In this case the input is returned, so should be fine

frame.columns, frame
)
if self.index_col is not None:
for i, item in enumerate(self.index_col):
if is_integer(item):
Expand Down
63 changes: 53 additions & 10 deletions pandas/io/parsers/base_parser.py
Expand Up @@ -9,10 +9,13 @@
Any,
Callable,
DefaultDict,
Hashable,
Iterable,
Mapping,
Sequence,
cast,
final,
overload,
)
import warnings

Expand Down Expand Up @@ -55,6 +58,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

from pandas import DataFrame
from pandas.core import algorithms
from pandas.core.arrays import Categorical
from pandas.core.indexes.api import (
Expand Down Expand Up @@ -237,7 +241,7 @@ def _open_handles(
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: list[str]) -> None:
def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> None:
"""
Check if parse_dates are in columns.

Expand Down Expand Up @@ -321,11 +325,24 @@ def _should_parse_dates(self, i: int) -> bool:

@final
def _extract_multi_indexer_columns(
self, header, index_names, passed_names: bool = False
self,
header,
index_names: list | None,
passed_names: bool = False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a function contains as much logic as this one, I imagine it takes a bit of effort to establish the types and in the process work out what a function is actually doing for our internal functions. In these cases, it maybe worthwhile improving the docstrings while that knowledge is fresh.

So making the first line a one-liner, adding descriptions (or where they are passed from) for the function parameters and also descriptions of the return types is most welcome.

For the internal functions, we don't need the types for the function parameters and return values in the docstrings as we are adding the type annotations.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounds good. Will add descriptions for the more complicated functions in the future

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, will enhance when fully typing the function

):
"""
extract and return the names, index_names, col_names
header is a list-of-lists returned from the parsers
Extract and return the names, index_names, col_names if the column
names are a MultiIndex.

        Parameters
        ----------
        header : list of lists
            The header rows
        index_names : list, optional
            The names of the future index
        passed_names : bool, default False
            A flag specifying if names were passed

"""
if len(header) < 2:
return header[0], index_names, None, passed_names
Expand Down Expand Up @@ -384,15 +401,15 @@ def extract(r):
return names, index_names, col_names, passed_names

@final
def _maybe_dedup_names(self, names):
def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# see gh-7160 and gh-9424: this helps to provide
# immediate alleviation of the duplicate names
# issue and appears to be satisfactory to users,
# but ultimately, not needing to butcher the names
# would be nice!
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts: DefaultDict[int | str | tuple, int] = defaultdict(int)
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
Expand All @@ -402,6 +419,8 @@ def _maybe_dedup_names(self, names):
counts[col] = cur_count + 1

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
jreback marked this conversation as resolved.
Show resolved Hide resolved
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
Expand Down Expand Up @@ -533,7 +552,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
@final
def _convert_to_ndarrays(
self,
dct: dict,
dct: Mapping,
na_values,
na_fvalues,
verbose: bool = False,
Expand Down Expand Up @@ -617,7 +636,7 @@ def _convert_to_ndarrays(

@final
def _set_noconvert_dtype_columns(
self, col_indices: list[int], names: list[int | str | tuple]
self, col_indices: list[int], names: Sequence[Hashable]
) -> set[int]:
"""
Set the columns that should not undergo dtype conversions.
Expand Down Expand Up @@ -801,7 +820,27 @@ def _cast_types(self, values, cast_type, column):
) from err
return values

def _do_date_conversions(self, names, data):
@overload
def _do_date_conversions(
self,
names: Index,
data: DataFrame,
) -> tuple[Sequence[Hashable] | Index, DataFrame]:
...

@overload
def _do_date_conversions(
self,
names: Sequence[Hashable],
data: Mapping[Hashable, ArrayLike],
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
...

def _do_date_conversions(
self,
names: Sequence[Hashable] | Index,
data: Mapping[Hashable, ArrayLike] | DataFrame,
) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
# returns data, columns

if self.parse_dates is not None:
Expand All @@ -817,7 +856,11 @@ def _do_date_conversions(self, names, data):

return names, data

def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
def _check_data_length(
self,
columns: Sequence[Hashable],
data: Sequence[ArrayLike],
) -> None:
"""Checks if length of data is equal to length of column names.

One set of trailing commas is allowed. self.index_col not False
Expand Down
8 changes: 4 additions & 4 deletions pandas/io/parsers/c_parser_wrapper.py
Expand Up @@ -279,7 +279,7 @@ def read(self, nrows=None):
data_tups = sorted(data.items())
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, data = self._do_date_conversions(names, data)
names, date_data = self._do_date_conversions(names, data)

else:
# rename dict keys
Expand All @@ -302,13 +302,13 @@ def read(self, nrows=None):

data = {k: v for k, (i, v) in zip(names, data_tups)}

names, data = self._do_date_conversions(names, data)
index, names = self._make_index(data, alldata, names)
names, date_data = self._do_date_conversions(names, data)
index, names = self._make_index(date_data, alldata, names)

# maybe create a mi on the columns
names = self._maybe_make_multi_index_columns(names, self.col_names)

return index, names, data
return index, names, date_data

def _filter_usecols(self, names):
# hackish
Expand Down