Skip to content

Commit

Permalink
ENH: support decimal option in PythonParser #12933
Browse files Browse the repository at this point in the history
closes #12933

Author: Camilo Cota <ccota@riplife.es>

Closes #13189 from camilocot/12933 and squashes the following commits:

465272e [Camilo Cota] Benchmark decimal option in read_csv for c engine
9f42d0c [Camilo Cota] double backticks around decimal and engine='python'
dc8ca62 [Camilo Cota] fix test_empty_decimal_marker comment
49613fe [Camilo Cota] Assert read_csv error message in test_empty_decimal_marker
d821052 [Camilo Cota] fix test_empty_decimal_marker comment
f71509d [Camilo Cota] Include descriptive what's new line
803356e [Camilo Cota] set nonnum regex in init method
1472d80 [Camilo Cota] Include the issue number in what's new
b560fda [Camilo Cota] Fix what's new
dc7acd1 [Camilo Cota] ENH: support decimal option in PythonParser #12933
  • Loading branch information
Camilo Cota authored and jreback committed May 22, 2016
1 parent b88eb35 commit 19ebee5
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 60 deletions.
60 changes: 56 additions & 4 deletions asv_bench/benchmarks/parser_vb.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,42 @@ class read_csv_default_converter(object):
goal_time = 0.2

def setup(self):
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
self.data = (self.data * 200)

def time_read_csv_default_converter(self):
read_csv(StringIO(self.data), sep=',', header=None, float_precision=None)


class read_csv_default_converter_with_decimal(object):
    """Benchmark ``read_csv`` on ';'-separated text that uses ',' decimals."""

    goal_time = 0.2

    def setup(self):
        """Store the five-row sample, repeated 200 times, in ``self.data``."""
        # Every row except the last is followed by an empty line: the
        # original triple-quoted literal paired a "\n" escape with a
        # physical line break after each of the first four rows.
        sample = (
            '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n\n'
            '0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n\n'
            '0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n\n'
            '0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n\n'
            '0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n'
        )
        self.data = sample * 200

    def time_read_csv_default_converter_with_decimal(self):
        """Time parsing ``self.data`` with ``sep=';'`` and ``decimal=','``."""
        buf = StringIO(self.data)
        read_csv(buf, sep=';', header=None, float_precision=None,
                 decimal=',')


class read_csv_precise_converter(object):
goal_time = 0.2

def setup(self):
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
self.data = (self.data * 200)

def time_read_csv_precise_converter(self):
Expand All @@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object):
goal_time = 0.2

def setup(self):
self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n '
self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
self.data = (self.data * 200)

def time_read_csv_roundtrip_converter(self):
Expand Down Expand Up @@ -109,4 +137,28 @@ def setup(self):
self.data = (self.data * 200)

def time_read_table_multiple_date_baseline(self):
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])
read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1])


class read_csv_default_converter_python_engine(object):
    """Benchmark ``read_csv`` with ``engine='python'`` on ','-separated floats."""

    goal_time = 0.2

    def setup(self):
        """Store the five-row sample, repeated 200 times, in ``self.data``."""
        # Rows after the first start with a single space, and the sample
        # ends with a lone space after the final newline; both are kept
        # exactly as in the original one-line literal.
        sample = (
            '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n'
            ' 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n'
            ' 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n'
            ' 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n'
            ' 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n'
            ' '
        )
        self.data = sample * 200

    def time_read_csv_default_converter(self):
        """Time parsing ``self.data`` with the Python engine."""
        buf = StringIO(self.data)
        read_csv(buf, sep=',', header=None, float_precision=None,
                 engine='python')


class read_csv_default_converter_with_decimal_python_engine(object):
    """Benchmark ``read_csv`` with ``engine='python'`` and ``decimal=','``."""

    goal_time = 0.2

    def setup(self):
        """Store the five-row sample, repeated 200 times, in ``self.data``."""
        # Rows after the first start with a single space, and the sample
        # ends with a lone space after the final newline; both are kept
        # exactly as in the original one-line literal.
        sample = (
            '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n'
            ' 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n'
            ' 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n'
            ' 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n'
            ' 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n'
            ' '
        )
        self.data = sample * 200

    def time_read_csv_default_converter_with_decimal(self):
        """Time parsing ``self.data`` with the Python engine and ',' decimals."""
        buf = StringIO(self.data)
        read_csv(buf, sep=';', header=None, float_precision=None,
                 decimal=',', engine='python')
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ Other enhancements

pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)

- ``pd.read_csv()`` with ``engine='python'`` now supports the ``decimal`` option (:issue:`12933`)

.. _whatsnew_0182.api:

API changes
Expand Down
37 changes: 30 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds):
'keep_default_na': True,
'thousands': None,
'comment': None,
'decimal': b'.',

# 'engine': 'c',
'parse_dates': False,
Expand Down Expand Up @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'decimal': b'.',
'float_precision': None
}

Expand All @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds):
'error_bad_lines',
'warn_bad_lines',
'dtype',
'decimal',
'float_precision',
])

Expand Down Expand Up @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds):
self.converters = kwds['converters']

self.thousands = kwds['thousands']
self.decimal = kwds['decimal']
self.comment = kwds['comment']
self._comment_lines = []

Expand Down Expand Up @@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds):
else:
self._no_thousands_columns = None

if len(self.decimal) != 1:
raise ValueError('Only length-1 decimal markers supported')

if self.thousands is None:
self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
else:
self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
self.decimal))

def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
# operators.
Expand Down Expand Up @@ -2050,22 +2059,35 @@ def _check_empty(self, lines):
def _check_thousands(self, lines):
if self.thousands is None:
return lines
nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands)

return self._search_replace_num_columns(lines=lines,
search=self.thousands,
replace='')

def _search_replace_num_columns(self, lines, search, replace):
ret = []
for l in lines:
rl = []
for i, x in enumerate(l):
if (not isinstance(x, compat.string_types) or
self.thousands not in x or
search not in x or
(self._no_thousands_columns and
i in self._no_thousands_columns) or
nonnum.search(x.strip())):
self.nonnum.search(x.strip())):
rl.append(x)
else:
rl.append(x.replace(self.thousands, ''))
rl.append(x.replace(search, replace))
ret.append(rl)
return ret

def _check_decimal(self, lines):
    """Rewrite a custom decimal marker to '.' in the parsed lines.

    ``lines`` is returned untouched when ``self.decimal`` equals the
    default stored in ``_parser_defaults['decimal']``.  Otherwise the
    substitution is delegated to ``_search_replace_num_columns``, which
    is asked to replace ``self.decimal`` with '.'.
    """
    if self.decimal != _parser_defaults['decimal']:
        return self._search_replace_num_columns(lines=lines,
                                                search=self.decimal,
                                                replace='.')

    # Default marker: there is nothing to rewrite.
    return lines

def _clear_buffer(self):
self.buf = []

Expand Down Expand Up @@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None):
lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)
lines = self._check_thousands(lines)
return self._check_decimal(lines)


def _make_date_converter(date_parser=None, dayfirst=False,
Expand Down
45 changes: 0 additions & 45 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self):
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_euro_decimal_format(self):
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
self.assertEqual(df2['Number1'].dtype, float)
self.assertEqual(df2['Number2'].dtype, float)
self.assertEqual(df2['Number3'].dtype, float)

def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'

Expand Down Expand Up @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self):
data = "\n\n\n"
self.assertRaises(ValueError, self.read_csv, StringIO(data))

def test_1000_sep_with_decimal(self):
data = """A|B|C
1|2,334.01|5
10|13|10.
"""
expected = DataFrame({
'A': [1, 10],
'B': [2334.01, 13],
'C': [5, 10.]
})

tm.assert_equal(expected.A.dtype, 'int64')
tm.assert_equal(expected.B.dtype, 'float')
tm.assert_equal(expected.C.dtype, 'float')

df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
tm.assert_frame_equal(df, expected)

df = self.read_table(StringIO(data), sep='|',
thousands=',', decimal='.')
tm.assert_frame_equal(df, expected)

data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
df = self.read_csv(StringIO(data_with_odd_sep),
sep='|', thousands='.', decimal=',')
tm.assert_frame_equal(df, expected)

df = self.read_table(StringIO(data_with_odd_sep),
sep='|', thousands='.', decimal=',')
tm.assert_frame_equal(df, expected)

def test_grow_boundary_at_cap(self):
# See gh-12494
#
Expand Down
53 changes: 49 additions & 4 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def test_empty_decimal_marker(self):
1|2,334|5
10|13|10.
"""
# C parser: supports only length-1 decimals
# Python parser: 'decimal' not supported yet
self.assertRaises(ValueError, self.read_csv,
StringIO(data), decimal='')
# Parsers support only length-1 decimals
msg = 'Only length-1 decimal markers supported'
with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(StringIO(data), decimal='')

def test_read_csv(self):
if not compat.PY3:
Expand Down Expand Up @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self):
result = self.read_table(f, squeeze=True, header=None)
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
tm.assert_series_equal(result, expected)

def test_1000_sep_with_decimal(self):
    # Both separator conventions below describe the same numbers, so
    # every reader/convention combination must yield the same frame.
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    # Sanity-check the dtypes of the reference frame itself.
    for column, dtype_name in (('A', 'int64'),
                               ('B', 'float'),
                               ('C', 'float')):
        tm.assert_equal(getattr(expected, column).dtype, dtype_name)

    data = """A|B|C
1|2,334.01|5
10|13|10.
"""
    data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""

    # (text, thousands separator, decimal marker)
    cases = (
        (data, ',', '.'),
        (data_with_odd_sep, '.', ','),
    )
    for text, thousands, decimal in cases:
        for reader in (self.read_csv, self.read_table):
            df = reader(StringIO(text), sep='|',
                        thousands=thousands, decimal=decimal)
            tm.assert_frame_equal(df, expected)

def test_euro_decimal_format(self):
    # ';'-separated data that uses ',' as the decimal marker: each of
    # the NumberN columns must be read with float dtype.
    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

    frame = self.read_csv(StringIO(data), sep=';', decimal=',')
    for column in ('Number1', 'Number2', 'Number3'):
        self.assertEqual(frame[column].dtype, float)

0 comments on commit 19ebee5

Please sign in to comment.