Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix inconsistent C engine quoting behaviour #13411

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,10 @@ lineterminator : str (length 1), default ``None``
quotechar : str (length 1)
The character used to denote the start and end of a quoted item. Quoted items
can include the delimiter and it will be ignored.
quoting : int or ``csv.QUOTE_*`` instance, default ``None``
quoting : int or ``csv.QUOTE_*`` instance, default ``0``
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't this still defaulted to None?, meaning don't consider quoting at all? (and NOT QUOTE_NONE?) or are these the same

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. The signature says quoting=0
  2. Passing in None raises an error (it always has been)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, you are essentially fixing the documentation, ok then

Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or
``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL``
behavior.
``QUOTE_NONE`` (3).
doublequote : boolean, default ``True``
When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
indicate whether or not to interpret two consecutive ``quotechar`` elements
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,8 @@ Bug Fixes
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`)
- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)



Expand Down
3 changes: 1 addition & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,9 @@
quotechar : str (length 1), optional
The character used to denote the start and end of a quoted item. Quoted
items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default None
quoting : int or csv.QUOTE_* instance, default 0
Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe advertise that these are in pandas.io.common as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, might as well. Will add.

QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
Default (None) results in QUOTE_MINIMAL behavior.
doublequote : boolean, default ``True``
When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
whether or not to interpret two consecutive quotechar elements INSIDE a
Expand Down
140 changes: 140 additions & 0 deletions pandas/io/tests/parser/quoting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-

"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv
import pandas.util.testing as tm

from pandas import DataFrame
from pandas.compat import StringIO


class QuotingTests(object):

def test_bad_quote_char(self):
data = '1,2,3'

# Python 2.x: "...must be an 1-character..."
# Python 3.x: "...must be a 1-character..."
msg = '"quotechar" must be a(n)? 1-character string'
tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quotechar='foo')

msg = 'quotechar must be set if quoting enabled'
tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quotechar=None,
quoting=csv.QUOTE_MINIMAL)

msg = '"quotechar" must be string, not int'
tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quotechar=2)

def test_bad_quoting(self):
data = '1,2,3'

msg = '"quoting" must be an integer'
tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quoting='foo')

# quoting must in the range [0, 3]
msg = 'bad "quoting" value'
tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quoting=5)

def test_quote_char_basic(self):
data = 'a,b,c\n1,2,"cat"'
expected = DataFrame([[1, 2, 'cat']],
columns=['a', 'b', 'c'])
result = self.read_csv(StringIO(data), quotechar='"')
tm.assert_frame_equal(result, expected)

def test_quote_char_various(self):
data = 'a,b,c\n1,2,"cat"'
expected = DataFrame([[1, 2, 'cat']],
columns=['a', 'b', 'c'])
quote_chars = ['~', '*', '%', '$', '@', 'P']

for quote_char in quote_chars:
new_data = data.replace('"', quote_char)
result = self.read_csv(StringIO(new_data), quotechar=quote_char)
tm.assert_frame_equal(result, expected)

def test_null_quote_char(self):
data = 'a,b,c\n1,2,3'

# sanity checks
msg = 'quotechar must be set if quoting enabled'

tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quotechar=None,
quoting=csv.QUOTE_MINIMAL)

tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
StringIO(data), quotechar='',
quoting=csv.QUOTE_MINIMAL)

# no errors should be raised if quoting is None
expected = DataFrame([[1, 2, 3]],
columns=['a', 'b', 'c'])

result = self.read_csv(StringIO(data), quotechar=None,
quoting=csv.QUOTE_NONE)
tm.assert_frame_equal(result, expected)

result = self.read_csv(StringIO(data), quotechar='',
quoting=csv.QUOTE_NONE)
tm.assert_frame_equal(result, expected)

def test_quoting_various(self):
data = '1,2,"foo"'
cols = ['a', 'b', 'c']

# QUOTE_MINIMAL and QUOTE_ALL apply only to
# the CSV writer, so they should have no
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should these be an error then? (or you are saying they don't do anything so might as well accept them)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely not an error. Just leave it alone is best.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But maybe we shouldn't mention them explicitly as options in documentation then?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well they are technically valid to Python's csv.reader class, so at this point I would leave them so that users know that they are options for completeness.

# special effect for the CSV reader
expected = DataFrame([[1, 2, 'foo']], columns=cols)

# test default (afterwards, arguments are all explicit)
result = self.read_csv(StringIO(data), names=cols)
tm.assert_frame_equal(result, expected)

result = self.read_csv(StringIO(data), quotechar='"',
quoting=csv.QUOTE_MINIMAL, names=cols)
tm.assert_frame_equal(result, expected)

result = self.read_csv(StringIO(data), quotechar='"',
quoting=csv.QUOTE_ALL, names=cols)
tm.assert_frame_equal(result, expected)

# QUOTE_NONE tells the reader to do no special handling
# of quote characters and leave them alone
expected = DataFrame([[1, 2, '"foo"']], columns=cols)
result = self.read_csv(StringIO(data), quotechar='"',
quoting=csv.QUOTE_NONE, names=cols)
tm.assert_frame_equal(result, expected)

# QUOTE_NONNUMERIC tells the reader to cast
# all non-quoted fields to float
expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols)
result = self.read_csv(StringIO(data), quotechar='"',
quoting=csv.QUOTE_NONNUMERIC,
names=cols)
tm.assert_frame_equal(result, expected)

def test_double_quote(self):
data = 'a,b\n3,"4 "" 5"'

expected = DataFrame([[3, '4 " 5']],
columns=['a', 'b'])
result = self.read_csv(StringIO(data), quotechar='"',
doublequote=True)
tm.assert_frame_equal(result, expected)

expected = DataFrame([[3, '4 " 5"']],
columns=['a', 'b'])
result = self.read_csv(StringIO(data), quotechar='"',
doublequote=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand this one (I mean: reading the docs, I would expect '4 "" 5') Can you explain the logic of this result?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not entirely sure from the Python engine perspective but from the tokenizer.c perspective, the two quotes that you see are because they are interpreted as in-field quotations so they are processed like normal characters.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But why the quote after the 5? (so why '4 " 5"' and not '4 " 5') Because isn't this quote the ending quote of the string (since quotechar='"')?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it is not. The quote is considered in field. Follow along with the code and you'll see.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see indeed the logic in the code. So after a closing quote, the rest of the field is regarded as a normal field, and cannot be a quoted field anymore, regardless of occurring quotes.
But you could also set the state to START_FIELD instead of IN_FIELD

Anyway, this is a rather pathological case, so not that important. I just found it a bit strange (and as you are adding the behaviour explicitly as a test, wanted to check it)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, okay. Thanks for clarifying!

tm.assert_frame_equal(result, expected)
3 changes: 2 additions & 1 deletion pandas/io/tests/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .common import ParserTests
from .header import HeaderTests
from .comment import CommentTests
from .quoting import QuotingTests
from .usecols import UsecolsTests
from .skiprows import SkipRowsTests
from .index_col import IndexColTests
Expand All @@ -28,7 +29,7 @@ class BaseParser(CommentTests, CompressionTests,
IndexColTests, MultithreadTests,
NAvaluesTests, ParseDatesTests,
ParserTests, SkipRowsTests,
UsecolsTests):
UsecolsTests, QuotingTests):
def read_csv(self, *args, **kwargs):
raise NotImplementedError

Expand Down
42 changes: 33 additions & 9 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp
cimport libc.stdio as stdio
import warnings

from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
from cpython cimport (PyObject, PyBytes_FromString,
PyBytes_AsString, PyBytes_Check,
PyUnicode_Check, PyUnicode_AsUTF8String)
Expand Down Expand Up @@ -283,6 +284,7 @@ cdef class TextReader:
object compression
object mangle_dupe_cols
object tupleize_cols
list dtype_cast_order
set noconvert, usecols

def __cinit__(self, source,
Expand Down Expand Up @@ -393,8 +395,13 @@ cdef class TextReader:
raise ValueError('Only length-1 escapes supported')
self.parser.escapechar = ord(escapechar)

self.parser.quotechar = ord(quotechar)
self.parser.quoting = quoting
self._set_quoting(quotechar, quoting)

# TODO: endianness just a placeholder?
if quoting == QUOTE_NONNUMERIC:
self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why don't you use somthing like float, int, bool, object? (e.g. human readable)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I c you got this from below. nvm then.

Copy link
Member Author

@gfyoung gfyoung Jun 9, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just copied and pasted what was used before. I can change it though to that.

Just saw your second comment - weird GitHub mistiming in which I missed your comment but it somehow was made before I made mine 😕

else:
self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']

if comment is not None:
if len(comment) > 1:
Expand Down Expand Up @@ -548,6 +555,29 @@ cdef class TextReader:
def set_error_bad_lines(self, int status):
self.parser.error_bad_lines = status

def _set_quoting(self, quote_char, quoting):
if not isinstance(quoting, int):
raise TypeError('"quoting" must be an integer')

if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
raise TypeError('bad "quoting" value')

if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
dtype = type(quote_char).__name__
raise TypeError('"quotechar" must be string, '
'not {dtype}'.format(dtype=dtype))

if quote_char is None or quote_char == '':
if quoting != QUOTE_NONE:
raise TypeError("quotechar must be set if quoting enabled")
self.parser.quoting = quoting
self.parser.quotechar = -1
elif len(quote_char) > 1: # 0-len case handled earlier
raise TypeError('"quotechar" must be a 1-character string')
else:
self.parser.quoting = quoting
self.parser.quotechar = ord(quote_char)

cdef _make_skiprow_set(self):
if isinstance(self.skiprows, (int, np.integer)):
parser_set_skipfirstnrows(self.parser, self.skiprows)
Expand Down Expand Up @@ -1066,7 +1096,7 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
for dt in dtype_cast_order:
for dt in self.dtype_cast_order:
try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_flist)
Expand Down Expand Up @@ -1847,12 +1877,6 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
return table


# if at first you don't succeed...

# TODO: endianness just a placeholder?
cdef list dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']


cdef raise_parser_error(object base, parser_t *parser):
message = '%s. C error: ' % base
if parser.error_msg != NULL:
Expand Down