Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: squeeze argument in read_csv/read_table/read_excel #43427

Merged
merged 11 commits into from
Sep 10, 2021
5 changes: 5 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,10 @@ Returning Series
Using the ``squeeze`` keyword, the parser will return output with a single column
as a ``Series``:

.. deprecated:: 1.4.0
Users should append ``.squeeze("columns")`` to the DataFrame returned by
``read_csv`` instead.

.. ipython:: python
:suppress:

Expand All @@ -1217,6 +1221,7 @@ as a ``Series``:
fh.write(data)

.. ipython:: python
:okwarning:

print(open("tmp.csv").read())

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ Other Deprecations
- Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
- Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`)
- Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
- Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`)

.. ---------------------------------------------------------------------------

Expand Down
10 changes: 7 additions & 3 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@
Returns a subset of the columns according to behavior above.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.

.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze
the data.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `object` to preserve data as stored in Excel and not interpret dtype.
Expand Down Expand Up @@ -337,7 +341,7 @@ def read_excel(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
engine=None,
converters=None,
Expand Down Expand Up @@ -481,7 +485,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
true_values=None,
false_values=None,
Expand Down Expand Up @@ -1243,7 +1247,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
converters=None,
true_values=None,
false_values=None,
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
"chunksize": None,
"verbose": False,
"encoding": None,
"squeeze": False,
"squeeze": None,
"compression": None,
"mangle_dupe_cols": True,
"infer_datetime_format": False,
Expand Down
1 change: 0 additions & 1 deletion pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class CParserWrapper(ParserBase):
def __init__(self, src: FilePathOrBuffer, **kwds):
self.kwds = kwds
kwds = kwds.copy()

ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)
Expand Down
26 changes: 20 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Appender,
deprecate_nonkeyword_arguments,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -131,6 +132,10 @@
parsing time and lower memory usage.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.

.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
the data.
prefix : str, optional
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
Expand Down Expand Up @@ -439,7 +444,11 @@
"low_memory",
}

_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None}
_deprecated_defaults: dict[str, Any] = {
"error_bad_lines": None,
"warn_bad_lines": None,
"squeeze": None,
}


def validate_integer(name, val, min_val=0):
Expand Down Expand Up @@ -552,7 +561,7 @@ def read_csv(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -650,7 +659,7 @@ def read_table(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -867,11 +876,12 @@ def __init__(self, f, engine=None, **kwds):

self.chunksize = options.pop("chunksize", None)
self.nrows = options.pop("nrows", None)
self.squeeze = options.pop("squeeze", False)

self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)

self.squeeze = self.options.pop("squeeze", False)

if "has_index_names" in kwds:
self.options["has_index_names"] = kwds["has_index_names"]

Expand Down Expand Up @@ -1050,7 +1060,7 @@ def _clean_options(self, options, engine):
f"The {arg} argument has been deprecated and will be "
"removed in a future version.\n\n"
)
warnings.warn(msg, FutureWarning, stacklevel=7)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
else:
result[arg] = parser_default

Expand Down Expand Up @@ -1100,6 +1110,10 @@ def _clean_options(self, options, engine):
result["na_values"] = na_values
result["na_fvalues"] = na_fvalues
result["skiprows"] = skiprows
# Default for squeeze is None since we need to check
# if the user sets it. We then set it to False to preserve
# the previous behavior.
result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]

return result, engine

Expand Down Expand Up @@ -1149,7 +1163,7 @@ def read(self, nrows=None):
self._currow += new_rows

if self.squeeze and len(df.columns) == 1:
return df[df.columns[0]].copy()
return df.squeeze("columns").copy()
return df

def get_chunk(self, size=None):
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1039,8 +1039,7 @@ def test_to_csv_compression(self, df, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_frame_equal(df, result)

# explicitly make sure file is compressed
Expand Down
27 changes: 17 additions & 10 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,18 +1194,25 @@ def test_read_excel_squeeze(self, read_ext):
# GH 12157
f = "test_squeeze" + read_ext

actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)
with tm.assert_produces_warning(
FutureWarning,
match="The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
):
actual = pd.read_excel(
f, sheet_name="two_columns", index_col=0, squeeze=True
)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)

def test_deprecated_kwargs(self, read_ext):
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
Expand Down
36 changes: 25 additions & 11 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def test_1000_sep(all_parsers):
tm.assert_frame_equal(result, expected)


def test_squeeze(all_parsers):
@pytest.mark.parametrize("squeeze", [True, False])
def test_squeeze(all_parsers, squeeze):
data = """\
a,1
b,2
Expand All @@ -138,13 +139,25 @@ def test_squeeze(all_parsers):
index = Index(["a", "b", "c"], name=0)
expected = Series([1, 2, 3], name=1, index=index)

result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True)
tm.assert_series_equal(result, expected)
result = parser.read_csv_check_warnings(
FutureWarning,
"The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
StringIO(data),
index_col=0,
header=None,
squeeze=squeeze,
)
if not squeeze:
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)

# see gh-8217
#
# Series should not be a view.
assert not result._is_view
# see gh-8217
#
# Series should not be a view.
assert not result._is_view


@xfail_pyarrow
Expand Down Expand Up @@ -847,12 +860,13 @@ def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines):
# GH 15122
parser = all_parsers
kwds = {f"{on_bad_lines}_bad_lines": False}
with tm.assert_produces_warning(
parser.read_csv_check_warnings(
FutureWarning,
match=f"The {on_bad_lines}_bad_lines argument has been deprecated "
f"The {on_bad_lines}_bad_lines argument has been deprecated "
"and will be removed in a future version.\n\n",
):
parser.read_csv(csv1, **kwds)
csv1,
**kwds,
)


def test_malformed_second_line(all_parsers):
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/io/parser/common/test_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from pandas import (
DataFrame,
Series,
concat,
)
import pandas._testing as tm
Expand Down Expand Up @@ -94,7 +93,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs):

def test_iteration_open_handle(all_parsers):
parser = all_parsers
kwargs = {"squeeze": True, "header": None}
kwargs = {"header": None}

with tm.ensure_clean() as path:
with open(path, "w") as f:
Expand All @@ -106,5 +105,5 @@ def test_iteration_open_handle(all_parsers):
break

result = parser.read_csv(f, **kwargs)
expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
tm.assert_series_equal(result, expected)
expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
tm.assert_frame_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/io/parser/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
read_csv,
read_table,
)
import pandas._testing as tm


class BaseParser:
Expand All @@ -27,6 +28,16 @@ def read_csv(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_csv(*args, **kwargs)

def read_csv_check_warnings(
self, warn_type: type[Warning], warn_msg: str, *args, **kwargs
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(warn_type, match=warn_msg):
return read_csv(*args, **kwargs)

def read_table(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_table(*args, **kwargs)
Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,14 @@ def test_series_compression_defaults_to_infer(
extension = icom._compression_to_extension[compression_only]
with tm.ensure_clean("compressed" + extension) as path:
getattr(input, write_method)(path, **write_kwargs)
output = read_method(path, compression=compression_only, **read_kwargs)
if "squeeze" in read_kwargs:
kwargs = read_kwargs.copy()
del kwargs["squeeze"]
output = read_method(path, compression=compression_only, **kwargs).squeeze(
"columns"
)
else:
output = read_method(path, compression=compression_only, **read_kwargs)
tm.assert_series_equal(output, input, check_names=False)


Expand Down
15 changes: 7 additions & 8 deletions pandas/tests/series/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@

class TestSeriesToCSV:
def read_csv(self, path, **kwargs):
params = {"squeeze": True, "index_col": 0, "header": None, "parse_dates": True}
params = {"index_col": 0, "header": None, "parse_dates": True}
params.update(**kwargs)

header = params.get("header")
out = pd.read_csv(path, **params)
out = pd.read_csv(path, **params).squeeze("columns")

if header is None:
out.name = out.index.name = None
Expand Down Expand Up @@ -138,8 +138,7 @@ def test_to_csv_compression(self, s, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_series_equal(s, result)

# test the round trip using file handle - to_csv -> read_csv
Expand All @@ -153,8 +152,7 @@ def test_to_csv_compression(self, s, encoding, compression):
compression=compression,
encoding=encoding,
index_col=0,
squeeze=True,
)
).squeeze("columns")
tm.assert_series_equal(s, result)

# explicitly ensure file was compressed
Expand All @@ -164,7 +162,8 @@ def test_to_csv_compression(self, s, encoding, compression):

with tm.decompress_file(filename, compression) as fh:
tm.assert_series_equal(
s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding)
s,
pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"),
)

def test_to_csv_interval_index(self):
Expand All @@ -173,7 +172,7 @@ def test_to_csv_interval_index(self):

with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
s.to_csv(path, header=False)
result = self.read_csv(path, index_col=0, squeeze=True)
result = self.read_csv(path, index_col=0)

# can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
expected = s.copy()
Expand Down