Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecated usecols with out of bounds indices in read_csv #41130

Merged
merged 10 commits into from
May 13, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ Deprecations
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` is deprecated and will be removed in a future version (:issue:`37643`)
- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`)
- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`)
- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`)

.. ---------------------------------------------------------------------------

Expand Down
11 changes: 11 additions & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,17 @@ cdef class TextReader:
f"{self.table_width - self.leading_cols} "
f"and found {num_cols}")

if (self.usecols is not None and not callable(self.usecols) and
all(isinstance(u, int) for u in self.usecols)):
missing_usecols = [col for col in self.usecols if col >= num_cols]
if missing_usecols:
warnings.warn(
"Defining usecols with out of bounds indices is deprecated "
"and will raise a ParserError in a future version.",
FutureWarning,
stacklevel=6,
)

results = {}
nused = 0
for i in range(self.table_width):
Expand Down
27 changes: 21 additions & 6 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Tuple,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -477,7 +478,7 @@ def _infer_columns(self):
if self.usecols is not None:
# Set _use_cols. We don't store columns because they are
# overwritten.
self._handle_usecols(columns, names)
self._handle_usecols(columns, names, num_original_columns)
else:
num_original_columns = len(names)
if self._col_indices is not None and len(names) != len(
Expand All @@ -487,7 +488,9 @@ def _infer_columns(self):
else:
columns = [names]
else:
columns = self._handle_usecols(columns, columns[0])
columns = self._handle_usecols(
columns, columns[0], num_original_columns
)
else:
try:
line = self._buffered_line()
Expand All @@ -506,10 +509,12 @@ def _infer_columns(self):
columns = [[f"{self.prefix}{i}" for i in range(ncols)]]
else:
columns = [list(range(ncols))]
columns = self._handle_usecols(columns, columns[0])
columns = self._handle_usecols(
columns, columns[0], num_original_columns
)
else:
if self.usecols is None or len(names) >= num_original_columns:
columns = self._handle_usecols([names], names)
columns = self._handle_usecols([names], names, num_original_columns)
num_original_columns = len(names)
else:
if not callable(self.usecols) and len(names) != len(self.usecols):
Expand All @@ -518,13 +523,13 @@ def _infer_columns(self):
"header fields in the file"
)
# Ignore output but set used columns.
self._handle_usecols([names], names)
self._handle_usecols([names], names, ncols)
columns = [names]
num_original_columns = ncols

return columns, num_original_columns, unnamed_cols

def _handle_usecols(self, columns, usecols_key):
def _handle_usecols(self, columns, usecols_key, num_original_columns):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a brief docstring entry for this new parameter that explains how it differs from ``columns`` (and why we couldn't just use ``len(columns)`` in the logic).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add type annotations to the arguments here?

"""
Sets self._col_indices

Expand All @@ -549,6 +554,16 @@ def _handle_usecols(self, columns, usecols_key):
else:
col_indices.append(col)
else:
missing_usecols = [
col for col in self.usecols if col >= num_original_columns
]
if missing_usecols:
warnings.warn(
"Defining usecols with out of bounds indices is deprecated "
"and will raise a ParserError in a future version.",
FutureWarning,
stacklevel=8,
)
col_indices = self.usecols

columns = [
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,8 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
a,b
1,2
"""
result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
expected = DataFrame({"a": [1], "b": [None]})
if names is None and parser.engine == "python":
expected = DataFrame({"a": [1]})
Expand Down