
ENH: added new cache_dates parameter for read_csv func (#25990)

anmyachev authored and jreback committed May 7, 2019
1 parent 2f182aa commit f65742cd47d6b364a2cbf150869b282c5aa7daf0
Showing with 50 additions and 5 deletions.
  1. +20 −0 asv_bench/benchmarks/io/csv.py
  2. +6 −0 doc/source/user_guide/io.rst
  3. +1 −0 doc/source/whatsnew/v0.25.0.rst
  4. +23 −5 pandas/io/parsers.py
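
The new keyword defaults to True, so existing read_csv calls are unchanged and callers opt out explicitly. A minimal usage sketch (the file and column names here are made up for illustration):

    import pandas as pd

    # cache_dates=True (the default) caches unique, converted date strings;
    # pass False to disable the cache, e.g. when nearly all dates are distinct.
    df = pd.read_csv('events.csv',               # hypothetical file
                     parse_dates=['timestamp'],  # hypothetical column
                     cache_dates=False)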
asv_bench/benchmarks/io/csv.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv
+from pandas.io.parsers import _parser_defaults
 from io import StringIO

 from ..pandas_vb_common import BaseIO
@@ -232,6 +233,25 @@ def time_baseline(self):
                  names=list(string.digits[:9]))


+class ReadCSVCachedParseDates(StringIORewind):
+    params = ([True, False],)
+    param_names = ['do_cache']
+
+    def setup(self, do_cache):
+        data = ('\n'.join('10/{}'.format(year)
+                for year in range(2000, 2100)) + '\n') * 10
+        self.StringIO_input = StringIO(data)
+
+    def time_read_csv_cached(self, do_cache):
+        # kwds is built dynamically so the benchmark does not break on
+        # earlier pandas versions, where this API change is absent
+        kwds = {}
+        if 'cache_dates' in _parser_defaults:
+            kwds['cache_dates'] = do_cache
+        read_csv(self.data(self.StringIO_input), header=None,
+                 parse_dates=[0], **kwds)
+
+
 class ReadCSVMemoryGrowth(BaseIO):

     chunksize = 20
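
The guard on _parser_defaults above keeps the benchmark runnable against pandas versions that predate this change: cache_dates is forwarded only when it appears among the parser defaults. Once merged, the benchmark should be runnable with something like asv run -b ReadCSVCachedParseDates from the asv_bench directory, though the exact invocation depends on the local asv setup.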
doc/source/user_guide/io.rst
@@ -271,6 +271,12 @@ date_parser : function, default ``None``
     (corresponding to the columns defined by parse_dates) as arguments.
 dayfirst : boolean, default ``False``
     DD/MM format dates, international and European format.
+cache_dates : boolean, default ``True``
+    If True, use a cache of unique, converted dates to apply the datetime
+    conversion. May produce significant speed-up when parsing duplicate
+    date strings, especially ones with timezone offsets.
+
+    .. versionadded:: 0.25.0

 Iteration
 +++++++++
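
A rough timing sketch (not part of the commit; requires pandas >= 0.25.0, and numbers vary by machine) that exercises the documented case of heavily duplicated date strings with timezone offsets:

    import time
    from io import StringIO

    import pandas as pd

    data = 'date\n' + '2019-05-07 00:00:00+01:00\n' * 100000

    for cache in (True, False):
        start = time.perf_counter()
        pd.read_csv(StringIO(data), parse_dates=['date'], cache_dates=cache)
        print('cache_dates={}: {:.3f}s'.format(cache, time.perf_counter() - start))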
doc/source/whatsnew/v0.25.0.rst
@@ -375,6 +375,7 @@ I/O
 - Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
 - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`)
 - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
+- Added ``cache_dates=True`` parameter to :func:`read_csv`, which allows caching unique dates as they are parsed (:issue:`25990`)

 Plotting
 ^^^^^^^^
pandas/io/parsers.py
@@ -235,6 +235,12 @@
     arguments.
 dayfirst : bool, default False
     DD/MM format dates, international and European format.
+cache_dates : bool, default True
+    If True, use a cache of unique, converted dates to apply the datetime
+    conversion. May produce significant speed-up when parsing duplicate
+    date strings, especially ones with timezone offsets.
+
+    .. versionadded:: 0.25.0
 iterator : bool, default False
     Return TextFileReader object for iteration or getting chunks with
     ``get_chunk()``.
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     'false_values': None,
     'converters': None,
     'dtype': None,
+    'cache_dates': True,

     'thousands': None,
     'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
              keep_date_col=False,
              date_parser=None,
              dayfirst=False,
+             cache_dates=True,

              # Iteration
              iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
                 keep_date_col=keep_date_col,
                 dayfirst=dayfirst,
                 date_parser=date_parser,
+                cache_dates=cache_dates,

                 nrows=nrows,
                 iterator=iterator,
@@ -1379,11 +1388,13 @@ def __init__(self, kwds):
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
+        self.cache_dates = kwds.pop('cache_dates', True)

         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
             dayfirst=self.dayfirst,
-            infer_datetime_format=self.infer_datetime_format
+            infer_datetime_format=self.infer_datetime_format,
+            cache_dates=self.cache_dates
         )

         # validate header options for mi
@@ -3173,7 +3184,7 @@ def _get_lines(self, rows=None):


 def _make_date_converter(date_parser=None, dayfirst=False,
-                         infer_datetime_format=False):
+                         infer_datetime_format=False, cache_dates=True):
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
@@ -3184,16 +3195,22 @@ def converter(*date_cols):
                     utc=None,
                     dayfirst=dayfirst,
                     errors='ignore',
-                    infer_datetime_format=infer_datetime_format
+                    infer_datetime_format=infer_datetime_format,
+                    cache=cache_dates
                 ).to_numpy()

             except ValueError:
                 return tools.to_datetime(
-                    parsing.try_parse_dates(strs, dayfirst=dayfirst))
+                    parsing.try_parse_dates(strs, dayfirst=dayfirst),
+                    cache=cache_dates
+                )
         else:
             try:
                 result = tools.to_datetime(
-                    date_parser(*date_cols), errors='ignore')
+                    date_parser(*date_cols),
+                    errors='ignore',
+                    cache=cache_dates
+                )
                 if isinstance(result, datetime.datetime):
                     raise Exception('scalar parser')
                 return result
@@ -3203,6 +3220,7 @@ def converter(*date_cols):
                 parsing.try_parse_dates(_concat_date_cols(date_cols),
                                         parser=date_parser,
                                         dayfirst=dayfirst),
+                cache=cache_dates,
                 errors='ignore')
         except Exception:
             return generic_parser(date_parser, *date_cols)
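
For intuition, the cache= flag that the converter now forwards to tools.to_datetime amounts to converting each distinct string once and broadcasting the results back over the full column. An illustrative sketch of that strategy (a simplification, not the pandas internals verbatim):

    import numpy as np
    import pandas as pd

    def cached_parse(values):
        # Convert each unique date string exactly once...
        unique = np.unique(values)
        mapping = pd.Series(pd.to_datetime(unique), index=unique)
        # ...then look up the parsed value for every row.
        return pd.DatetimeIndex(mapping[values])

    strs = np.array(['10/2000', '10/2001', '10/2000'] * 1000)
    print(cached_parse(strs)[:3])

With only two distinct strings repeated across 3000 rows, the expensive string parsing runs twice instead of 3000 times, which is where the speed-up for duplicate-heavy columns comes from.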
