Deprecate prefix argument in read_csv and read_table (#44713)

pandas-dev · Dec 3, 2021 · 8eb0b1b · 8eb0b1b
1 parent 1895062
commit 8eb0b1b
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 26 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -161,6 +161,18 @@ squeeze : boolean, default ``False``
      the data.
 prefix : str, default ``None``
   Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
+
+  .. deprecated:: 1.4.0
+     Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
+
+  .. ipython:: python
+
+     data = "col1,col2,col3\na,b,1"
+
+     df = pd.read_csv(StringIO(data))
+     df.columns = [f"pre_{col}" for col in df.columns]
+     df
+
 mangle_dupe_cols : boolean, default ``True``
   Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'.
   Passing in ``False`` will cause data to be overwritten if there are duplicate

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -517,6 +517,7 @@ Other Deprecations
 - Deprecated casting behavior when setting timezone-aware value(s) into a timezone-aware :class:`Series` or :class:`DataFrame` column when the timezones do not match. Previously this cast to object dtype. In a future version, the values being inserted will be converted to the series or column's existing timezone (:issue:`37605`)
 - Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`)
 - Deprecated the 'errors' keyword argument in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, and meth:`DataFrame.mask`; in a future version the argument will be removed (:issue:`44294`)
+- Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
 - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`)
 - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`)
 - Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`):

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -143,6 +143,9 @@
         the data.
 prefix : str, optional
     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
+
+    .. deprecated:: 1.4.0
+       Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
 mangle_dupe_cols : bool, default True
     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
     'X'...'X'. Passing in False will cause data to be overwritten if there
@@ -461,6 +464,9 @@ class _DeprecationConfig(NamedTuple):
     "squeeze": _DeprecationConfig(
         None, 'Append .squeeze("columns") to the call to squeeze.'
     ),
+    "prefix": _DeprecationConfig(
+        None, "Use a list comprehension on the column names in the future."
+    ),
 }
 
 

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -823,7 +823,8 @@ def test_names_and_prefix_not_None_raises(all_parsers, func):
     parser = all_parsers
     msg = "Specified named and prefix; you can only specify one."
     with pytest.raises(ValueError, match=msg):
-        getattr(parser, func)(f, names=["a", "b"], prefix="x")
+        with tm.assert_produces_warning(FutureWarning):
+            getattr(parser, func)(f, names=["a", "b"], prefix="x")
 
 
 @pytest.mark.parametrize("func", ["read_csv", "read_table"])
@@ -833,7 +834,15 @@ def test_names_and_prefix_explicit_None(all_parsers, names, prefix, func):
     f = StringIO("a,b\n1,2")
     expected = DataFrame({"x0": ["a", "1"], "x1": ["b", "2"]})
     parser = all_parsers
-    result = getattr(parser, func)(f, names=names, sep=",", prefix=prefix, header=None)
+    if prefix is not None:
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = getattr(parser, func)(
+                f, names=names, sep=",", prefix=prefix, header=None
+            )
+    else:
+        result = getattr(parser, func)(
+            f, names=names, sep=",", prefix=prefix, header=None
+        )
     tm.assert_frame_equal(result, expected)
 
 

diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
@@ -128,7 +128,8 @@ def test_read_csv_raises_on_header_prefix(all_parsers):
     s = StringIO("0,1\n2,3")
 
     with pytest.raises(ValueError, match=msg):
-        parser.read_csv(s, header=0, prefix="_X")
+        with tm.assert_produces_warning(FutureWarning):
+            parser.read_csv(s, header=0, prefix="_X")
 
 
 def test_unexpected_keyword_parameter_exception(all_parsers):

diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -82,7 +82,8 @@ def test_no_header_prefix(all_parsers):
 6,7,8,9,10
 11,12,13,14,15
 """
-    result = parser.read_csv(StringIO(data), prefix="Field", header=None)
+    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+        result = parser.read_csv(StringIO(data), prefix="Field", header=None)
     expected = DataFrame(
         [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
         columns=["Field0", "Field1", "Field2", "Field3", "Field4"],
@@ -457,7 +458,11 @@ def test_no_header(all_parsers, kwargs, names):
     expected = DataFrame(
         [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
     )
-    result = parser.read_csv(StringIO(data), header=None, **kwargs)
+    if "prefix" in kwargs.keys():
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = parser.read_csv(StringIO(data), header=None, **kwargs)
+    else:
+        result = parser.read_csv(StringIO(data), header=None, **kwargs)
     tm.assert_frame_equal(result, expected)
 
 

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -171,14 +171,21 @@ def date_parser(*date_cols):
         """
         return parsing.try_parse_dates(parsing.concat_date_cols(date_cols))
 
-    result = parser.read_csv(
+    kwds = {
+        "header": None,
+        "date_parser": date_parser,
+        "prefix": "X",
+        "parse_dates": {"actual": [1, 2], "nominal": [1, 3]},
+        "keep_date_col": keep_date_col,
+    }
+    result = parser.read_csv_check_warnings(
+        FutureWarning,
+        "The prefix argument has been deprecated "
+        "and will be removed in a future version. .*\n\n",
         StringIO(data),
-        header=None,
-        date_parser=date_parser,
-        prefix="X",
-        parse_dates={"actual": [1, 2], "nominal": [1, 3]},
-        keep_date_col=keep_date_col,
+        **kwds,
     )
+
     expected = DataFrame(
         [
             [
@@ -309,13 +316,20 @@ def test_multiple_date_col(all_parsers, keep_date_col):
 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
 """
     parser = all_parsers
-    result = parser.read_csv(
+    kwds = {
+        "header": None,
+        "prefix": "X",
+        "parse_dates": [[1, 2], [1, 3]],
+        "keep_date_col": keep_date_col,
+    }
+    result = parser.read_csv_check_warnings(
+        FutureWarning,
+        "The prefix argument has been deprecated "
+        "and will be removed in a future version. .*\n\n",
         StringIO(data),
-        header=None,
-        prefix="X",
-        parse_dates=[[1, 2], [1, 3]],
-        keep_date_col=keep_date_col,
+        **kwds,
     )
+
     expected = DataFrame(
         [
             [
@@ -427,8 +441,13 @@ def test_date_col_as_index_col(all_parsers):
 KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
 """
     parser = all_parsers
-    result = parser.read_csv(
-        StringIO(data), header=None, prefix="X", parse_dates=[1], index_col=1
+    kwds = {"header": None, "prefix": "X", "parse_dates": [1], "index_col": 1}
+    result = parser.read_csv_check_warnings(
+        FutureWarning,
+        "The prefix argument has been deprecated "
+        "and will be removed in a future version. .*\n\n",
+        StringIO(data),
+        **kwds,
     )
 
     index = Index(
@@ -477,14 +496,19 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning):
     parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
     parser = all_parsers
 
-    with tm.assert_produces_warning(warning, check_stacklevel=False):
-        result = parser.read_csv(
-            StringIO(data),
-            header=None,
-            date_parser=date_parser,
-            parse_dates=parse_dates,
-            prefix="X",
-        )
+    kwds = {
+        "header": None,
+        "prefix": "X",
+        "parse_dates": parse_dates,
+        "date_parser": date_parser,
+    }
+    result = parser.read_csv_check_warnings(
+        FutureWarning,
+        "The prefix argument has been deprecated "
+        "and will be removed in a future version. .*\n\n",
+        StringIO(data),
+        **kwds,
+    )
 
     expected = DataFrame(
         [