BUG: Fix parse empty df

closes #14515

This commit fixes a bug where `read_csv` failed when given a file with a multiindex header and empty content. Because pandas reads index names as a separate line following the header lines, the reader looks for the line with index names in it. If the content of the dataframe is empty, that line is missing and the reader chokes. This bug surfaced after #6618 stopped writing an extra line after multiindex columns, which led to a situation where pandas could write CSVs that it couldn't then read. This commit changes that behavior by explicitly checking whether the index name row exists, and processing the file correctly if it doesn't.

Author: Ben Kandel <ben.kandel@gmail.com>

Closes #14596 from bkandel/fix-parse-empty-df and squashes the following commits:

32e3b0a [Ben Kandel] lint
e6b1237 [Ben Kandel] lint
fedfff8 [Ben Kandel] fix multiindex column parsing
518982d [Ben Kandel] move to 0.19.2
fc23e5c [Ben Kandel] fix errant this_columns
3d9bbdd [Ben Kandel] whatsnew
68eadf3 [Ben Kandel] Modify test.
17e44dd [Ben Kandel] fix python parser too
72adaf2 [Ben Kandel] remove unnecessary test
bfe0423 [Ben Kandel] typo
2f64d57 [Ben Kandel] pep8
b8200e4 [Ben Kandel] BUG: read_csv with empty df
pandas-dev · Nov 22, 2016 · f862b52 · f862b52
1 parent f609640
commit f862b52
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 7 deletions.
diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
@@ -29,7 +29,7 @@ Bug Fixes
 
 - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
 - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
-
+- Bug in ``pd.read_csv`` where reading a file failed if the number of header rows was equal to the number of lines in the file (:issue:`14515`)
 
 
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1509,10 +1509,11 @@ def read(self, nrows=None):
             if self._first_chunk:
                 self._first_chunk = False
                 names = self._maybe_dedup_names(self.orig_names)
-
                 index, columns, col_dict = _get_empty_meta(
                     names, self.index_col, self.index_names,
                     dtype=self.kwds.get('dtype'))
+                columns = self._maybe_make_multi_index_columns(
+                    columns, self.col_names)
 
                 if self.usecols is not None:
                     columns = self._filter_usecols(columns)
@@ -1979,8 +1980,11 @@ def read(self, rows=None):
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
             names = self._maybe_dedup_names(self.orig_names)
-            return _get_empty_meta(names, self.index_col,
-                                   self.index_names)
+            index, columns, col_dict = _get_empty_meta(
+                names, self.index_col, self.index_names)
+            columns = self._maybe_make_multi_index_columns(
+                columns, self.col_names)
+            return index, columns, col_dict
 
         # handle new style for names in index
         count_empty_content_vals = count_empty_vals(content[0])
@@ -2083,6 +2087,12 @@ def _infer_columns(self):
                     # We have an empty file, so check
                     # if columns are provided. That will
                     # serve as the 'line' for parsing
+                    if have_mi_columns and hr > 0:
+                        if clear_buffer:
+                            self._clear_buffer()
+                        columns.append([None] * len(columns[-1]))
+                        return columns, num_original_columns
+
                     if not self.names:
                         raise EmptyDataError(
                             "No columns to parse from file")

diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self):
         expected = self.read_csv(StringIO(data), index_col=[1, 0])
         tm.assert_frame_equal(df, expected, check_names=False)
 
+    def test_multi_index_blank_df(self):
+        # GH 14515
+        data = """a,b
+"""
+        df = self.read_csv(StringIO(data), header=[0])
+        expected = DataFrame(columns=['a', 'b'])
+        tm.assert_frame_equal(df, expected)
+        round_trip = self.read_csv(StringIO(
+            expected.to_csv(index=False)), header=[0])
+        tm.assert_frame_equal(round_trip, expected)
+
+        data_multiline = """a,b
+c,d
+"""
+        df2 = self.read_csv(StringIO(data_multiline), header=[0, 1])
+        cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
+        expected2 = DataFrame(columns=cols)
+        tm.assert_frame_equal(df2, expected2)
+        round_trip = self.read_csv(StringIO(
+            expected2.to_csv(index=False)), header=[0, 1])
+        tm.assert_frame_equal(round_trip, expected2)
+
     def test_no_unnamed_index(self):
         data = """ id c0 c1 c2
 0 1 0 a b

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -717,7 +717,9 @@ cdef class TextReader:
                     start = self.parser.line_start[0]
 
                 # e.g., if header=3 and file only has 2 lines
-                elif self.parser.lines < hr + 1:
+                elif (self.parser.lines < hr + 1
+                      and not isinstance(self.orig_header, list)) or (
+                          self.parser.lines < hr):
                     msg = self.orig_header
                     if isinstance(msg, list):
                         msg = "[%s], len of %d," % (
@@ -940,7 +942,7 @@ cdef class TextReader:
                 raise_parser_error('Error tokenizing data', self.parser)
             footer = self.skipfooter
 
-        if self.parser_start == self.parser.lines:
+        if self.parser_start >= self.parser.lines:
             raise StopIteration
         self._end_clock('Tokenization')
 

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -587,7 +587,7 @@ def _make_frame(names=None):
             df = _make_frame(True)
             df.to_csv(path, tupleize_cols=False)
 
-            for i in [5, 6, 7]:
+            for i in [6, 7]:
                 msg = 'len of {i}, but only 5 lines in file'.format(i=i)
                 with assertRaisesRegexp(ParserError, msg):
                     read_csv(path, tupleize_cols=False,