Skip to content

Commit

Permalink
BUG: Respect 'usecols' parameter even when CSV rows are uneven
Browse files Browse the repository at this point in the history
Closes #12203 by overriding the row alignment checks for both engines
when the `usecols` parameter is passed into `read_csv`.

Author: gfyoung <gfyoung17@gmail.com>

Closes #12551 from gfyoung/usecol_long_lines and squashes the following commits:

d3824dc [gfyoung] BUG: Respect 'usecols' parameter even when CSV rows are uneven
  • Loading branch information
gfyoung authored and jreback committed Mar 20, 2016
1 parent 9fe2dd2 commit e55875e
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 3 deletions.
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ API changes



- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)








Expand Down Expand Up @@ -95,6 +101,7 @@ Performance Improvements

Bug Fixes
~~~~~~~~~
- ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file have differing numbers of fields (:issue:`12203`)

- Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that the "Minute" freq was deprecated in v0.17.0, and using ``freq="T"`` instead is recommended (:issue:`11854`)
- Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1984,7 +1984,9 @@ def _rows_to_cols(self, content):
raise ValueError('skip footer cannot be negative')

# Loop through rows to verify lengths are correct.
if col_len != zip_len and self.index_col is not False:
if (col_len != zip_len and
self.index_col is not False and
self.usecols is None):
i = 0
for (i, l) in enumerate(content):
if len(l) != col_len:
Expand Down
31 changes: 31 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2664,6 +2664,37 @@ def test_empty_header_read(count):
for count in range(1, 101):
test_empty_header_read(count)

def test_uneven_lines_with_usecols(self):
    """Check that ``usecols`` is honored when CSV rows have uneven lengths.

    Without ``usecols`` a row with more fields than the header must still
    raise a ``ValueError``; with ``usecols`` (given by position, by name,
    or as a mix of both) the extra fields are ignored and only the
    requested columns are returned.
    """
    # See gh-12203
    # The second data row deliberately has two more fields than the header.
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10
"""

    # make sure that an error is still thrown
    # when the 'usecols' parameter is not provided
    # (raw string: ``\d`` is a regex class, not a string escape)
    msg = r"Expected \d+ fields in line \d+, saw \d+"
    with tm.assertRaisesRegexp(ValueError, msg):
        self.read_csv(StringIO(data))

    expected = DataFrame({
        'a': [0, 3, 8],
        'b': [1, 4, 9]
    })

    # Positional, mixed and label-based selections must all agree.
    for usecols in ([0, 1], ['a', 1], ['a', 'b']):
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)


class TestPythonParser(ParserTests, tm.TestCase):

Expand Down
6 changes: 5 additions & 1 deletion pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
int allow_embedded_newline
int strict # raise exception on bad CSV */

int usecols

int expected_fields
int error_bad_lines
int warn_bad_lines
Expand Down Expand Up @@ -350,6 +352,8 @@ cdef class TextReader:
self.compression = compression
self.memory_map = memory_map

self.parser.usecols = (usecols is not None)

self._setup_parser_source(source)
parser_set_default_options(self.parser)

Expand Down Expand Up @@ -1208,7 +1212,7 @@ cdef class TextReader:
else:
return None

class CParserError(Exception):
class CParserError(ValueError):
pass


Expand Down
3 changes: 2 additions & 1 deletion pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,8 @@ static int end_line(parser_t *self) {
/* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */

if (!(self->lines <= self->header_end + 1)
&& (self->expected_fields < 0 && fields > ex_fields)) {
&& (self->expected_fields < 0 && fields > ex_fields)
&& !(self->usecols)) {
// increment file line count
self->file_lines++;

Expand Down
2 changes: 2 additions & 0 deletions pandas/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ typedef struct parser_t {
int allow_embedded_newline;
int strict; /* raise exception on bad CSV */

int usecols; // Boolean: 1: usecols provided, 0: none provided

int expected_fields;
int error_bad_lines;
int warn_bad_lines;
Expand Down

0 comments on commit e55875e

Please sign in to comment.