Skip to content

Commit

Permalink
BUG: Fix parse empty df
Browse files Browse the repository at this point in the history
closes #14515

This commit fixes a bug where `read_csv` failed when given a file with
a multiindex header and empty content. Because pandas reads index
names as a separate line following the header lines, the reader looks
for the line with index names in it. If the content of the dataframe
is empty, the reader will choke. This bug surfaced after
#6618 stopped writing an
extra line after multiindex columns, which led to a situation where
pandas could write CSVs that it couldn't then read. This commit
changes that behavior by explicitly checking if the index name row
exists, and processing it correctly if it doesn't.

Author: Ben Kandel <ben.kandel@gmail.com>

Closes #14596 from bkandel/fix-parse-empty-df and squashes the following commits:

32e3b0a [Ben Kandel] lint
e6b1237 [Ben Kandel] lint
fedfff8 [Ben Kandel] fix multiindex column parsing
518982d [Ben Kandel] move to 0.19.2
fc23e5c [Ben Kandel] fix errant this_columns
3d9bbdd [Ben Kandel] whatsnew
68eadf3 [Ben Kandel] Modify test.
17e44dd [Ben Kandel] fix python parser too
72adaf2 [Ben Kandel] remove unnecessary test
bfe0423 [Ben Kandel] typo
2f64d57 [Ben Kandel] pep8
b8200e4 [Ben Kandel] BUG: read_csv with empty df
  • Loading branch information
Ben Kandel authored and jreback committed Nov 22, 2016
1 parent f609640 commit f862b52
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 7 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.2.txt
Expand Up @@ -29,7 +29,7 @@ Bug Fixes

- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)

- Bug in ``pd.read_csv`` where reading a file fails if the number of header rows is equal to the number of lines in the file (:issue:`14515`)



Expand Down
16 changes: 13 additions & 3 deletions pandas/io/parsers.py
Expand Up @@ -1509,10 +1509,11 @@ def read(self, nrows=None):
if self._first_chunk:
self._first_chunk = False
names = self._maybe_dedup_names(self.orig_names)

index, columns, col_dict = _get_empty_meta(
names, self.index_col, self.index_names,
dtype=self.kwds.get('dtype'))
columns = self._maybe_make_multi_index_columns(
columns, self.col_names)

if self.usecols is not None:
columns = self._filter_usecols(columns)
Expand Down Expand Up @@ -1979,8 +1980,11 @@ def read(self, rows=None):
if not len(content): # pragma: no cover
# DataFrame with the right metadata, even though it's length 0
names = self._maybe_dedup_names(self.orig_names)
return _get_empty_meta(names, self.index_col,
self.index_names)
index, columns, col_dict = _get_empty_meta(
names, self.index_col, self.index_names)
columns = self._maybe_make_multi_index_columns(
columns, self.col_names)
return index, columns, col_dict

# handle new style for names in index
count_empty_content_vals = count_empty_vals(content[0])
Expand Down Expand Up @@ -2083,6 +2087,12 @@ def _infer_columns(self):
# We have an empty file, so check
# if columns are provided. That will
# serve as the 'line' for parsing
if have_mi_columns and hr > 0:
if clear_buffer:
self._clear_buffer()
columns.append([None] * len(columns[-1]))
return columns, num_original_columns

if not self.names:
raise EmptyDataError(
"No columns to parse from file")
Expand Down
22 changes: 22 additions & 0 deletions pandas/io/tests/parser/common.py
Expand Up @@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self):
expected = self.read_csv(StringIO(data), index_col=[1, 0])
tm.assert_frame_equal(df, expected, check_names=False)

def test_multi_index_blank_df(self):
# GH 14515: read_csv on input that has header rows but no data rows.
# Case 1: a single header row, requested with a list-valued ``header``.
data = """a,b
"""
df = self.read_csv(StringIO(data), header=[0])
# The result should be an empty frame that still carries both column names.
expected = DataFrame(columns=['a', 'b'])
tm.assert_frame_equal(df, expected)
# Round trip: the CSV written for the empty frame must parse back to it.
round_trip = self.read_csv(StringIO(
expected.to_csv(index=False)), header=[0])
tm.assert_frame_equal(round_trip, expected)

# Case 2: two header rows and nothing else, read as two column levels.
data_multiline = """a,b
c,d
"""
df2 = self.read_csv(StringIO(data_multiline), header=[0, 1])
# Each column label pairs the entry from row one with the one below it.
cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
expected2 = DataFrame(columns=cols)
tm.assert_frame_equal(df2, expected2)
# Same round-trip check for the empty frame with MultiIndex columns.
round_trip = self.read_csv(StringIO(
expected2.to_csv(index=False)), header=[0, 1])
tm.assert_frame_equal(round_trip, expected2)

def test_no_unnamed_index(self):
data = """ id c0 c1 c2
0 1 0 a b
Expand Down
6 changes: 4 additions & 2 deletions pandas/parser.pyx
Expand Up @@ -717,7 +717,9 @@ cdef class TextReader:
start = self.parser.line_start[0]

# e.g., if header=3 and file only has 2 lines
elif self.parser.lines < hr + 1:
elif (self.parser.lines < hr + 1
and not isinstance(self.orig_header, list)) or (
self.parser.lines < hr):
msg = self.orig_header
if isinstance(msg, list):
msg = "[%s], len of %d," % (
Expand Down Expand Up @@ -940,7 +942,7 @@ cdef class TextReader:
raise_parser_error('Error tokenizing data', self.parser)
footer = self.skipfooter

if self.parser_start == self.parser.lines:
if self.parser_start >= self.parser.lines:
raise StopIteration
self._end_clock('Tokenization')

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_to_csv.py
Expand Up @@ -587,7 +587,7 @@ def _make_frame(names=None):
df = _make_frame(True)
df.to_csv(path, tupleize_cols=False)

for i in [5, 6, 7]:
for i in [6, 7]:
msg = 'len of {i}, but only 5 lines in file'.format(i=i)
with assertRaisesRegexp(ParserError, msg):
read_csv(path, tupleize_cols=False,
Expand Down

0 comments on commit f862b52

Please sign in to comment.