Skip to content

Commit

Permalink
BUG: Thoroughly dedup column names in read_csv
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Jul 27, 2017
1 parent c6e5bf6 commit 0aa97a7
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ I/O
^^^

- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`)
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
Expand Down
18 changes: 11 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,14 +1318,18 @@ def _maybe_dedup_names(self, names):
# would be nice!
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts = {}
counts = defaultdict(int)

for i, col in enumerate(names):
cur_count = counts.get(col, 0)
cur_count = counts[col]

if cur_count > 0:
names[i] = '%s.%d' % (col, cur_count)
while cur_count > 0:
counts[col] = cur_count + 1

col = '%s.%d' % (col, cur_count)
cur_count = counts[col]

names[i] = col
counts[col] = cur_count + 1

return names
Expand Down Expand Up @@ -2330,15 +2334,15 @@ def _infer_columns(self):
this_columns.append(c)

if not have_mi_columns and self.mangle_dupe_cols:
counts = {}
counts = defaultdict(int)

for i, col in enumerate(this_columns):
cur_count = counts.get(col, 0)
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1
col = "%s.%d" % (col, cur_count)
cur_count = counts.get(col, 0)
cur_count = counts[col]

this_columns[i] = col
counts[col] = cur_count + 1
Expand Down
24 changes: 23 additions & 1 deletion pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_basic(self):
mangle_dupe_cols=True)
assert list(df.columns) == expected

def test_thorough_mangle(self):
def test_thorough_mangle_columns(self):
# see gh-17060
data = "a,a,a.1\n1,2,3"
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
Expand All @@ -40,3 +40,25 @@ def test_thorough_mangle(self):
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]

def test_thorough_mangle_names(self):
    # see gh-17095
    #
    # Names passed explicitly via ``names=`` must come back fully
    # de-duplicated when ``mangle_dupe_cols=True``, including when a
    # mangled candidate (e.g. "a.1") collides with a name the caller
    # already supplied.
    #
    # Each scenario is (csv text, names handed to read_csv,
    # column labels expected on the resulting frame).
    scenarios = [
        (
            "a,b,b\n1,2,3",
            ["a.1", "a.1", "a.1.1"],
            ["a.1", "a.1.1", "a.1.1.1"],
        ),
        (
            "a,b,c,d,e,f\n1,2,3,4,5,6",
            ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            ["a", "a.1", "a.1.1", "a.1.1.1",
             "a.1.1.1.1", "a.1.1.1.1.1"],
        ),
        (
            "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
            ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
            ["a", "a.1", "a.3", "a.1.1",
             "a.2", "a.2.1", "a.3.1"],
        ),
    ]

    for csv_text, given_names, expected_columns in scenarios:
        frame = self.read_csv(StringIO(csv_text), sep=",",
                              names=given_names,
                              mangle_dupe_cols=True)
        assert list(frame.columns) == expected_columns

0 comments on commit 0aa97a7

Please sign in to comment.