From 883df65b095709e2146b1f36c9697b92eb913731 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 17 Jun 2016 12:28:09 -0400 Subject: [PATCH] BUG: Fix inconsistent C engine quoting behaviour 1) Add significant testing to quoting in read_csv 2) Fix bug in C engine in which a NULL `quotechar` would raise even though `quoting=csv.QUOTE_NONE`. 3) Fix bug in C engine in which `quoting=csv.QUOTE_NONNUMERIC` wouldn't cause non-quoted fields to be cast to `float`. Author: gfyoung Closes #13411 from gfyoung/quoting-read-csv-tests and squashes the following commits: 0e791a5 [gfyoung] BUG: Fix inconsistent C engine quoting behaviour --- doc/source/io.rst | 5 +- doc/source/merging.rst | 4 +- doc/source/whatsnew/v0.18.2.txt | 24 +++-- pandas/io/parsers.py | 3 +- pandas/io/tests/parser/quoting.py | 140 +++++++++++++++++++++++++ pandas/io/tests/parser/test_parsers.py | 3 +- pandas/parser.pyx | 42 ++++++-- pandas/tools/merge.py | 5 +- 8 files changed, 196 insertions(+), 30 deletions(-) create mode 100644 pandas/io/tests/parser/quoting.py diff --git a/doc/source/io.rst b/doc/source/io.rst index 61625104f5c1d..b011072d8c3fb 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -287,11 +287,10 @@ lineterminator : str (length 1), default ``None`` quotechar : str (length 1) The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. -quoting : int or ``csv.QUOTE_*`` instance, default ``None`` +quoting : int or ``csv.QUOTE_*`` instance, default ``0`` Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or - ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL`` - behavior. + ``QUOTE_NONE`` (3). doublequote : boolean, default ``True`` When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate whether or not to interpret two consecutive ``quotechar`` elements diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c629c4e5ea7f7..b69d0d8ba3015 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1113,7 +1113,7 @@ Timeseries friendly merging Merging Ordered Data ~~~~~~~~~~~~~~~~~~~~ -A :func:`pd.merge_ordered` function allows combining time series and other +A :func:`merge_ordered` function allows combining time series and other ordered data. In particular it has an optional ``fill_method`` keyword to fill/interpolate missing data: @@ -1135,7 +1135,7 @@ Merging AsOf .. versionadded:: 0.18.2 -A :func:`pd.merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. +A :func:`merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally, in addition to the nearest match on the ``on`` key. diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index db2bccf6ac349..c0251f7170534 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -9,7 +9,7 @@ We recommend that all users upgrade to this version. 
Highlights include:
- - ``pd.merge_asof()`` for asof-style time-series joining, see :ref:`here <whatsnew_0182.enhancements.asof_merge>`
+- :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0182.enhancements.asof_merge>`
.. contents:: What's new in v0.18.2
    :local:
@@ -22,8 +22,8 @@ New features
.. _whatsnew_0182.enhancements.asof_merge:
-``pd.merge_asof()`` for asof-style time-series joining
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:func:`merge_asof` for asof-style time-series joining
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A long-time requested feature has been added through the :func:`merge_asof` function, to
support asof style joining of time-series. (:issue:`1870`). Full documentation is
@@ -113,10 +113,10 @@ passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` me
.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support:
-``pd.read_csv`` has improved support for duplicate column names
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:func:`read_csv` has improved support for duplicate column names
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-:ref:`Duplicate column names <io.dupe_names>` are now supported in ``pd.read_csv()`` whether
+:ref:`Duplicate column names <io.dupe_names>` are now supported in :func:`read_csv` whether
they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
.. ipython :: python
@@ -187,7 +187,7 @@ Other enhancements
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
-- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)
+- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here <text.extractall>` (:issue:`10008`, :issue:`13156`)
- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)
.. ipython:: python
@@ -406,6 +406,8 @@ New Behavior:
    s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
    df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
+Furthermore:
+
+- Passing duplicated ``percentiles`` will now raise a ``ValueError``.
- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) @@ -462,7 +464,7 @@ Bug Fixes - Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) -- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) @@ -473,9 +475,9 @@ Bug Fixes - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) @@ -493,6 +495,8 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 475eb73812666..9baff67845dac 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -202,10 +202,9 @@ quotechar : str (length 1), optional The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. -quoting : int or csv.QUOTE_* instance, default None +quoting : int or csv.QUOTE_* instance, default 0 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). - Default (None) results in QUOTE_MINIMAL behavior. 
doublequote : boolean, default ``True`` When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate whether or not to interpret two consecutive quotechar elements INSIDE a diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py new file mode 100644 index 0000000000000..d0f1493be0621 --- /dev/null +++ b/pandas/io/tests/parser/quoting.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +""" +Tests that quoting specifications are properly handled +during parsing for all of the parsers defined in parsers.py +""" + +import csv +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas.compat import StringIO + + +class QuotingTests(object): + + def test_bad_quote_char(self): + data = '1,2,3' + + # Python 2.x: "...must be an 1-character..." + # Python 3.x: "...must be a 1-character..." + msg = '"quotechar" must be a(n)? 1-character string' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar='foo') + + msg = 'quotechar must be set if quoting enabled' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=None, + quoting=csv.QUOTE_MINIMAL) + + msg = '"quotechar" must be string, not int' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=2) + + def test_bad_quoting(self): + data = '1,2,3' + + msg = '"quoting" must be an integer' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quoting='foo') + + # quoting must in the range [0, 3] + msg = 'bad "quoting" value' + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quoting=5) + + def test_quote_char_basic(self): + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, 'cat']], + columns=['a', 'b', 'c']) + result = self.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + + def test_quote_char_various(self): + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, 'cat']], + columns=['a', 'b', 'c']) + quote_chars = ['~', '*', '%', '$', '@', 'P'] + + for quote_char in quote_chars: + new_data = data.replace('"', quote_char) + result = self.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + + def test_null_quote_char(self): + data = 'a,b,c\n1,2,3' + + # sanity checks + msg = 'quotechar must be set if quoting enabled' + + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar=None, + quoting=csv.QUOTE_MINIMAL) + + tm.assertRaisesRegexp(TypeError, msg, self.read_csv, + StringIO(data), quotechar='', + quoting=csv.QUOTE_MINIMAL) + + # no errors should be raised if quoting is None + expected = DataFrame([[1, 2, 3]], + columns=['a', 'b', 'c']) + + result = self.read_csv(StringIO(data), quotechar=None, + quoting=csv.QUOTE_NONE) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), quotechar='', + quoting=csv.QUOTE_NONE) + tm.assert_frame_equal(result, expected) + + def test_quoting_various(self): + data = '1,2,"foo"' + cols = ['a', 'b', 'c'] + + # QUOTE_MINIMAL and QUOTE_ALL apply only to + # the CSV writer, so they should have no + # special effect for the CSV reader + expected = DataFrame([[1, 2, 'foo']], columns=cols) + + # test default (afterwards, arguments are all explicit) + result = self.read_csv(StringIO(data), names=cols) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_MINIMAL, names=cols) + tm.assert_frame_equal(result, expected) + + result = 
self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_ALL, names=cols) + tm.assert_frame_equal(result, expected) + + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone + expected = DataFrame([[1, 2, '"foo"']], columns=cols) + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_NONE, names=cols) + tm.assert_frame_equal(result, expected) + + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols) + result = self.read_csv(StringIO(data), quotechar='"', + quoting=csv.QUOTE_NONNUMERIC, + names=cols) + tm.assert_frame_equal(result, expected) + + def test_double_quote(self): + data = 'a,b\n3,"4 "" 5"' + + expected = DataFrame([[3, '4 " 5']], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), quotechar='"', + doublequote=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[3, '4 " 5"']], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), quotechar='"', + doublequote=False) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index fda7b28769647..21f903342a611 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -11,6 +11,7 @@ from .common import ParserTests from .header import HeaderTests from .comment import CommentTests +from .quoting import QuotingTests from .usecols import UsecolsTests from .skiprows import SkipRowsTests from .index_col import IndexColTests @@ -28,7 +29,7 @@ class BaseParser(CommentTests, CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests): + UsecolsTests, QuotingTests): def read_csv(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 063b2158d999a..3928bc8472113 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -7,6 +7,7 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp cimport libc.stdio as stdio import warnings +from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, PyUnicode_Check, PyUnicode_AsUTF8String) @@ -283,6 +284,7 @@ cdef class TextReader: object compression object mangle_dupe_cols object tupleize_cols + list dtype_cast_order set noconvert, usecols def __cinit__(self, source, @@ -393,8 +395,13 @@ cdef class TextReader: raise ValueError('Only length-1 escapes supported') self.parser.escapechar = ord(escapechar) - self.parser.quotechar = ord(quotechar) - self.parser.quoting = quoting + self._set_quoting(quotechar, quoting) + + # TODO: endianness just a placeholder? 
+        if quoting == QUOTE_NONNUMERIC:
+            self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8']
+        else:
+            self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
+
        if comment is not None:
            if len(comment) > 1:
@@ -548,6 +555,29 @@ cdef class TextReader:
    def set_error_bad_lines(self, int status):
        self.parser.error_bad_lines = status
+    def _set_quoting(self, quote_char, quoting):
+        if not isinstance(quoting, int):
+            raise TypeError('"quoting" must be an integer')
+
+        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
+            raise TypeError('bad "quoting" value')
+
+        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
+            dtype = type(quote_char).__name__
+            raise TypeError('"quotechar" must be string, '
+                            'not {dtype}'.format(dtype=dtype))
+
+        if quote_char is None or quote_char == '':
+            if quoting != QUOTE_NONE:
+                raise TypeError("quotechar must be set if quoting enabled")
+            self.parser.quoting = quoting
+            self.parser.quotechar = -1
+        elif len(quote_char) > 1:  # 0-len case handled earlier
+            raise TypeError('"quotechar" must be a 1-character string')
+        else:
+            self.parser.quoting = quoting
+            self.parser.quotechar = ord(quote_char)
+
    cdef _make_skiprow_set(self):
        if isinstance(self.skiprows, (int, np.integer)):
            parser_set_skipfirstnrows(self.parser, self.skiprows)
@@ -1066,7 +1096,7 @@ cdef class TextReader:
            return self._string_convert(i, start, end, na_filter, na_hashset)
        else:
            col_res = None
-            for dt in dtype_cast_order:
+            for dt in self.dtype_cast_order:
                try:
                    col_res, na_count = self._convert_with_dtype(
                        dt, i, start, end, na_filter, 0, na_hashset, na_flist)
@@ -1847,12 +1877,6 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    return table
-# if at first you don't succeed...
-
-# TODO: endianness just a placeholder?
-cdef list dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
>>> (pd.merge_ordered(left, right, on='a')
...    .ffill()
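
For reference, a minimal usage sketch (not part of the patch) of the two C-engine fixes described in the commit message, adapted from the new tests in pandas/io/tests/parser/quoting.py; the toy data, column names, and use of io.StringIO here are illustrative only:

    import csv
    from io import StringIO

    import pandas as pd

    # quoting=csv.QUOTE_NONNUMERIC: the C engine now casts the unquoted
    # fields (1 and 2) to float, as the Python engine already did.
    nonnumeric = pd.read_csv(StringIO('1,2,"foo"'), engine='c',
                             quotechar='"', quoting=csv.QUOTE_NONNUMERIC,
                             names=['a', 'b', 'c'])
    print(nonnumeric.dtypes)  # a and b become float64, c stays object

    # quotechar=None no longer raises in the C engine as long as
    # quoting=csv.QUOTE_NONE; quotes are then left in the data untouched.
    unquoted = pd.read_csv(StringIO('a,b,c\n1,2,"cat"'), engine='c',
                           quotechar=None, quoting=csv.QUOTE_NONE)
    print(unquoted)  # the last column holds the literal string '"cat"'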