New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TYP: Typ part of python_parser #44406
Changes from all commits
9b351a1
a1a465e
7f5b52b
9db189b
467421b
a719b92
b39888c
16c5ccc
6421012
20857c8
ddb1f23
97425f1
2870987
2e85ac5
8bd226b
91177a1
45c646a
41aca2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,10 +10,13 @@ | |
Any, | ||
Callable, | ||
DefaultDict, | ||
Hashable, | ||
Iterable, | ||
Mapping, | ||
Sequence, | ||
cast, | ||
final, | ||
overload, | ||
) | ||
import warnings | ||
|
||
|
@@ -56,6 +59,7 @@ | |
from pandas.core.dtypes.dtypes import CategoricalDtype | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas import DataFrame | ||
from pandas.core import algorithms | ||
from pandas.core.arrays import Categorical | ||
from pandas.core.indexes.api import ( | ||
|
@@ -241,7 +245,7 @@ def _open_handles( | |
errors=kwds.get("encoding_errors", "strict"), | ||
) | ||
|
||
def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable: | ||
def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: | ||
""" | ||
Check if parse_dates are in columns. | ||
|
||
|
@@ -337,11 +341,24 @@ def _should_parse_dates(self, i: int) -> bool: | |
|
||
@final | ||
def _extract_multi_indexer_columns( | ||
self, header, index_names, passed_names: bool = False | ||
self, | ||
header, | ||
index_names: list | None, | ||
passed_names: bool = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When a function contains as much logic as this one, I imagine it takes a bit of effort to establish the types and in the process work out what a function is actually doing for our internal functions. In these cases, it maybe worthwhile improving the docstrings while that knowledge is fresh. So making the first line a one-liner, adding descriptions (or where they are passed from) for the function parameters and also descriptions of the return types is most welcome. For the internal functions, we don't need the types for the function parameters and return values in the docstrings as we are adding the type annotations. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This sounds good. Will add descriptions for the more complicated functions in the future There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, will enhance when fully typing the function |
||
): | ||
""" | ||
extract and return the names, index_names, col_names | ||
header is a list-of-lists returned from the parsers | ||
Extract and return the names, index_names, col_names if the column | ||
names are a MultiIndex. | ||
|
||
Parameters | ||
---------- | ||
header: list of lists | ||
The header rows | ||
index_names: list, optional | ||
The names of the future index | ||
passed_names: bool, default False | ||
A flag specifying if names where passed | ||
|
||
""" | ||
if len(header) < 2: | ||
return header[0], index_names, None, passed_names | ||
|
@@ -400,15 +417,15 @@ def extract(r): | |
return names, index_names, col_names, passed_names | ||
|
||
@final | ||
def _maybe_dedup_names(self, names): | ||
def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: | ||
# see gh-7160 and gh-9424: this helps to provide | ||
# immediate alleviation of the duplicate names | ||
# issue and appears to be satisfactory to users, | ||
# but ultimately, not needing to butcher the names | ||
# would be nice! | ||
if self.mangle_dupe_cols: | ||
names = list(names) # so we can index | ||
counts: DefaultDict[int | str | tuple, int] = defaultdict(int) | ||
counts: DefaultDict[Hashable, int] = defaultdict(int) | ||
is_potential_mi = _is_potential_multi_index(names, self.index_col) | ||
|
||
for i, col in enumerate(names): | ||
|
@@ -418,6 +435,8 @@ def _maybe_dedup_names(self, names): | |
counts[col] = cur_count + 1 | ||
|
||
if is_potential_mi: | ||
# for mypy | ||
assert isinstance(col, tuple) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
col = col[:-1] + (f"{col[-1]}.{cur_count}",) | ||
else: | ||
col = f"{col}.{cur_count}" | ||
|
@@ -572,7 +591,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: | |
@final | ||
def _convert_to_ndarrays( | ||
self, | ||
dct: dict, | ||
dct: Mapping, | ||
na_values, | ||
na_fvalues, | ||
verbose: bool = False, | ||
|
@@ -664,7 +683,7 @@ def _convert_to_ndarrays( | |
|
||
@final | ||
def _set_noconvert_dtype_columns( | ||
self, col_indices: list[int], names: list[int | str | tuple] | ||
self, col_indices: list[int], names: Sequence[Hashable] | ||
) -> set[int]: | ||
""" | ||
Set the columns that should not undergo dtype conversions. | ||
|
@@ -848,7 +867,27 @@ def _cast_types(self, values, cast_type, column): | |
) from err | ||
return values | ||
|
||
def _do_date_conversions(self, names, data): | ||
@overload | ||
def _do_date_conversions( | ||
self, | ||
names: Index, | ||
data: DataFrame, | ||
) -> tuple[Sequence[Hashable] | Index, DataFrame]: | ||
... | ||
|
||
@overload | ||
def _do_date_conversions( | ||
self, | ||
names: Sequence[Hashable], | ||
data: Mapping[Hashable, ArrayLike], | ||
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: | ||
... | ||
|
||
def _do_date_conversions( | ||
self, | ||
names: Sequence[Hashable] | Index, | ||
data: Mapping[Hashable, ArrayLike] | DataFrame, | ||
) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: | ||
# returns data, columns | ||
|
||
if self.parse_dates is not None: | ||
|
@@ -864,7 +903,11 @@ def _do_date_conversions(self, names, data): | |
|
||
return names, data | ||
|
||
def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: | ||
def _check_data_length( | ||
self, | ||
columns: Sequence[Hashable], | ||
data: Sequence[ArrayLike], | ||
) -> None: | ||
"""Checks if length of data is equal to length of column names. | ||
|
||
One set of trailing commas is allowed. self.index_col not False | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not related to typing: Can
_do_date_conversions
return a newDataFrame
object? If yes, this line should maybe be re-written asotherwise, python will set
columns
on the oldframe
object and then replace theframe
object.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but it will return the input without modification. This happens only if self.parse_dates is None. In this case the input is returned, so should be fine