API: Warn about dups in names for read_csv

xref gh-17095.
pandas-dev · Sep 24, 2017 · 9fcffa7 · 9fcffa7
1 parent 4004367
commit 9fcffa7
Show file tree

Hide file tree

Showing 6 changed files with 74 additions and 33 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'``
   rather than the first line of the file.
 names : array-like, default ``None``
   List of column names to use. If file contains no header row, then you should
-  explicitly pass ``header=None``. Duplicates in this list are not allowed unless
-  ``mangle_dupe_cols=True``, which is the default.
+  explicitly pass ``header=None``. Duplicates in this list will cause
+  a ``UserWarning`` to be issued.
 index_col :  int or sequence or ``False``, default ``None``
   Column to use as the row labels of the DataFrame. If a sequence is given, a
   MultiIndex is used. If you have a malformed file with delimiters at the end of

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -422,6 +422,7 @@ Other API Changes
 - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
 - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
   raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
+- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
 - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
 - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -84,8 +84,8 @@
     rather than the first line of the file.
 names : array-like, default None
     List of column names to use. If file contains no header row, then you
-    should explicitly pass header=None. Duplicates in this list are not
-    allowed unless mangle_dupe_cols=True, which is the default.
+    should explicitly pass header=None. Duplicates in this list will cause
+    a ``UserWarning`` to be issued.
 index_col : int or sequence or False, default None
     Column to use as the row labels of the DataFrame. If a sequence is given, a
     MultiIndex is used. If you have a malformed file with delimiters at the end
@@ -385,6 +385,32 @@ def _validate_integer(name, val, min_val=0):
     return val
 
 
+def _validate_names(names):
+    """
+    Check if the `names` parameter contains duplicates.
+
+    If duplicates are found, we issue a ``UserWarning`` before returning.
+    The names themselves are never modified or de-duplicated here.
+
+    Parameters
+    ----------
+    names : array-like or None
+        An array containing a list of the names used for the output DataFrame.
+        ``None`` skips the check. Elements must be hashable, because
+        duplicates are detected by building a ``set`` from them.
+
+    Returns
+    -------
+    names : array-like or None
+        The original `names` parameter.
+    """
+
+    if names is not None:
+        # A set keeps a single copy of each name, so a size mismatch
+        # means at least one name was repeated.
+        if len(names) != len(set(names)):
+            msg = ("Duplicate names specified. This "
+                   "will raise an error in the future.")
+            # stacklevel=3 attributes the warning two frames above this
+            # function, i.e. to the caller of whatever called us.
+            # NOTE(review): confirm that frame is the intended one.
+            warnings.warn(msg, UserWarning, stacklevel=3)
+
+    return names
+
+
 def _read(filepath_or_buffer, kwds):
     """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
@@ -407,6 +433,9 @@ def _read(filepath_or_buffer, kwds):
     chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
     nrows = _validate_integer('nrows', kwds.get('nrows', None))
 
+    # Check for duplicates in names.
+    _validate_names(kwds.get("names", None))
+
     # Create the parser.
     parser = TextFileReader(filepath_or_buffer, **kwds)
 

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
         assert df2['Number2'].dtype == float
         assert df2['Number3'].dtype == float
 
-    def test_read_duplicate_names(self):
-        # See gh-7160
-        data = "a,b,a\n0,1,2\n3,4,5"
-        df = self.read_csv(StringIO(data))
-        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
-                             columns=['a', 'b', 'a.1'])
-        tm.assert_frame_equal(df, expected)
-
-        data = "0,1,2\n3,4,5"
-        df = self.read_csv(StringIO(data), names=["a", "b", "a"])
-        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
-                             columns=['a', 'b', 'a.1'])
-        tm.assert_frame_equal(df, expected)
-
     def test_inf_parsing(self):
         data = """\
 ,A

diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
         result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
-        data = ''
-        result = self.read_csv(StringIO(data), names=['one', 'one'],
-                               dtype={0: 'u1', 1: 'f'})
-        tm.assert_frame_equal(result, expected, check_index_type=False)
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            data = ''
+            result = self.read_csv(StringIO(data), names=['one', 'one'],
+                                   dtype={0: 'u1', 1: 'f'})
+            tm.assert_frame_equal(result, expected, check_index_type=False)
 
     def test_raise_on_passed_int_dtype_with_nas(self):
         # see gh-2631

diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py
@@ -7,6 +7,9 @@
 """
 
 from pandas.compat import StringIO
+from pandas import DataFrame
+
+import pandas.util.testing as tm
 
 
 class DupeColumnTests(object):
@@ -25,6 +28,21 @@ def test_basic(self):
                                        mangle_dupe_cols=True)
             assert list(df.columns) == expected
 
+    def test_basic_names(self):
+        # See gh-7160
+        #
+        # Duplicate labels must be mangled ("a", "b", "a" -> "a", "b", "a.1")
+        # whether they come from the header row or from `names`.
+        data = "a,b,a\n0,1,2\n3,4,5"
+        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+                             columns=["a", "b", "a.1"])
+
+        # Case 1: the duplicates are in the file's own header row.
+        df = self.read_csv(StringIO(data))
+        tm.assert_frame_equal(df, expected)
+
+        # Case 2: the duplicates are passed via `names`. The resulting
+        # frame is the same, but a UserWarning must also be emitted.
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            data = "0,1,2\n3,4,5"
+            df = self.read_csv(StringIO(data),
+                               names=["a", "b", "a"])
+            tm.assert_frame_equal(df, expected)
+
     def test_thorough_mangle_columns(self):
         # see gh-17060
         data = "a,a,a.1\n1,2,3"
@@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
         # see gh-17095
         data = "a,b,b\n1,2,3"
         names = ["a.1", "a.1", "a.1.1"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
 
         data = "a,b,c,d,e,f\n1,2,3,4,5,6"
         names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
-                                    "a.1.1.1.1", "a.1.1.1.1.1"]
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
+                                        "a.1.1.1.1", "a.1.1.1.1.1"]
 
         data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
         names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
-                                    "a.2", "a.2.1", "a.3.1"]
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
+                                        "a.2", "a.2.1", "a.3.1"]