diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 0025f8d098d81..db710e73a1286 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -278,6 +278,7 @@ I/O ^^^ - Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`) +- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`) - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`). - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ea0bb104338b6..41b0cdd6dd250 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1318,14 +1318,18 @@ def _maybe_dedup_names(self, names): # would be nice! if self.mangle_dupe_cols: names = list(names) # so we can index - counts = {} + counts = defaultdict(int) for i, col in enumerate(names): - cur_count = counts.get(col, 0) + cur_count = counts[col] - if cur_count > 0: - names[i] = '%s.%d' % (col, cur_count) + while cur_count > 0: + counts[col] = cur_count + 1 + col = '%s.%d' % (col, cur_count) + cur_count = counts[col] + + names[i] = col counts[col] = cur_count + 1 return names @@ -2330,15 +2334,15 @@ def _infer_columns(self): this_columns.append(c) if not have_mi_columns and self.mangle_dupe_cols: - counts = {} + counts = defaultdict(int) for i, col in enumerate(this_columns): - cur_count = counts.get(col, 0) + cur_count = counts[col] while cur_count > 0: counts[col] = cur_count + 1 col = "%s.%d" % (col, cur_count) - cur_count = counts.get(col, 0) + cur_count = counts[col] this_columns[i] = col counts[col] = cur_count + 1 diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py index 70ecfe51c0f09..e2efb1377f8b0 100644 --- a/pandas/tests/io/parser/mangle_dupes.py +++ b/pandas/tests/io/parser/mangle_dupes.py @@ -25,7 +25,7 @@ def test_basic(self): mangle_dupe_cols=True) assert list(df.columns) == expected - def test_thorough_mangle(self): + def test_thorough_mangle_columns(self): # see gh-17060 data = "a,a,a.1\n1,2,3" df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) @@ -40,3 +40,25 @@ def test_thorough_mangle(self): df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"] + + def test_thorough_mangle_names(self): + # see gh-17095 + data = "a,b,b\n1,2,3" + names = ["a.1", "a.1", "a.1.1"] + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] + + data = "a,b,c,d,e,f\n1,2,3,4,5,6" + names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"] + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"] + + data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7" + names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"] + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"]