Skip to content

Commit

Permalink
BUG: 'Unnamed' != unnamed column in CSV
Browse files Browse the repository at this point in the history
A false criterion was causing errors when
specified headers appeared to capture
a seemingly unnamed row, just because
its names had the string "Unnamed" in them.
  • Loading branch information
gfyoung committed Nov 14, 2018
1 parent e413c49 commit 52cc5c9
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Expand Up @@ -1315,6 +1315,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
Expand Down
13 changes: 7 additions & 6 deletions pandas/io/parsers.py
Expand Up @@ -1387,22 +1387,23 @@ def extract(r):
columns = lzip(*[extract(r) for r in header])
names = ic + columns

def tostr(x):
def to_str(x):
return str(x) if not isinstance(x, compat.string_types) else x

# if we find 'Unnamed' all of a single level, then our header was too
# long
# If we find unnamed columns all in a single
# level, then our header was too long.
for n in range(len(columns[0])):
if all('Unnamed' in tostr(c[n]) for c in columns):
if all(to_str(c[n]) in self.unnamed_cols for c in columns):
raise ParserError(
"Passed header=[%s] are too many rows for this "
"multi_index of columns"
% ','.join(str(x) for x in self.header)
)

# clean the column names (if we have an index_col)
# Clean the column names (if we have an index_col).
if len(ic):
col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None
col_names = [r[0] if (len(r[0]) and
r[0] not in self.unnamed_cols) else None
for r in header]
else:
col_names = [None] * len(header)
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/io/parser/header.py
Expand Up @@ -11,6 +11,7 @@
import pytest

from pandas.compat import StringIO, lrange, u
from pandas.errors import ParserError

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
Expand Down Expand Up @@ -360,3 +361,47 @@ def test_mangles_multi_index(self):
('A', 'one.1.1'), ('B', 'two'),
('B', 'two.1')]))
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize("columns", [None,
(["", "Unnamed"]),
(["Unnamed", ""]),
(["Unnamed", "NotUnnamed"])])
def test_multi_index_unnamed(self, index_col, columns):
# see gh-23687
#
# When specifying a multi-index header, make sure that
# we don't error just because one of the rows in our header
# has ALL column names containing the string "Unnamed". The
# correct condition to check is whether the row contains
# ALL columns that did not have names (and instead were given
# placeholder ones).
header = [0, 1]

# Build the CSV text: the first header row holds `columns` (two empty
# names when `columns` is None) and the second header row is "0", "1".
# When an index column is requested, every row gains one leading field.
if index_col is None:
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
else:
data = (",".join([""] + (columns or ["", ""])) +
"\n,0,1\n0,2,3\n1,4,5\n")

# With columns=None both names in the first header row are empty,
# which is the one case expected to raise.
if columns is None:
msg = (r"Passed header=\[0,1\] are too "
r"many rows for this multi_index of columns")
with pytest.raises(ParserError, match=msg):
self.read_csv(StringIO(data), header=header,
index_col=index_col)
else:
result = self.read_csv(StringIO(data), header=header,
index_col=index_col)
# Placeholder name expected for a first-level column with no name.
template = "Unnamed: {i}_level_0"
exp_columns = []

for i, col in enumerate(columns):
if not col: # Unnamed.
# The position is shifted by one when the leading index
# field precedes the data columns.
col = template.format(i=i if index_col is None else i + 1)

exp_columns.append(col)

# Pair each expected first-level name with the second header row.
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
tm.assert_frame_equal(result, expected)

0 comments on commit 52cc5c9

Please sign in to comment.