Skip to content

Commit

Permalink
support lineterminator in read_csv, don't pass to pd
Browse files Browse the repository at this point in the history
  • Loading branch information
mrocklin committed Apr 26, 2016
1 parent 83c7e29 commit e7e3035
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
7 changes: 3 additions & 4 deletions dask/dataframe/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ def read_csv_from_bytes(block_lists, header, head, kwargs, collection=True,

def read_csv(filename, blocksize=2**25, chunkbytes=None,
collection=True, lineterminator='\n', compression=None,
**kwargs):
kwargs.update({'lineterminator': lineterminator})
enforce_dtypes=True, sample=10000, **kwargs):
if chunkbytes is not None:
warn("Deprecation warning: chunksize csv keyword renamed to blocksize")
blocksize=chunkbytes
Expand All @@ -111,13 +110,13 @@ def read_csv(filename, blocksize=2**25, chunkbytes=None,
b_lineterminator = lineterminator.encode()
sample, values = read_bytes(filename, delimiter=b_lineterminator,
blocksize=blocksize,
sample=10000, compression=compression)
sample=sample, compression=compression)
if not isinstance(values[0], (tuple, list)):
values = [values]
header = sample.split(b_lineterminator)[0] + b_lineterminator
head = pd.read_csv(BytesIO(sample), **kwargs)

df = read_csv_from_bytes(values, header, head, kwargs,
collection=collection)
collection=collection, enforce_dtypes=enforce_dtypes)

return df
10 changes: 9 additions & 1 deletion dask/dataframe/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from dask.compatibility import gzip_compress
from dask.dataframe.csv import read_csv_from_bytes, bytes_read_csv, read_csv
from dask.dataframe.utils import eq
from dask.utils import filetexts
from dask.utils import filetexts, filetext


compute = partial(compute, get=get_sync)
Expand Down Expand Up @@ -168,3 +168,11 @@ def test_warn_non_seekable_files(capsys):

with pytest.raises(NotImplementedError):
df = read_csv('2014-01-*.csv', compression='foo')


def test_windows_line_terminator():
text = 'a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7'
with filetext(text) as fn:
df = read_csv(fn, blocksize=5, lineterminator='\r\n')
assert df.b.sum().compute() == 2 + 3 + 4 + 5 + 6 + 7
assert df.a.sum().compute() == 1 + 2 + 3 + 4 + 5 + 6

0 comments on commit e7e3035

Please sign in to comment.