Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix generator support in fromdicts - use file cache instead of iterto… #625

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changes
=======

Version 1.7.11
--------------

* Fix generator support in fromdicts to use file cache
By :user:`arturponinski`, :issue:`625`.

Version 1.7.10
--------------

Expand Down
72 changes: 68 additions & 4 deletions petl/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@

# standard library dependencies
import io
import itertools
import json
import inspect
from json.encoder import JSONEncoder
from os import unlink
from tempfile import NamedTemporaryFile

from petl.compat import PY2
from petl.compat import pickle
from petl.io.sources import read_source_from_arg, write_source_from_arg
# internal dependencies
from petl.util.base import data, Table, dicts as _dicts, iterpeek
from petl.util.base import data, Table, dicts as _dicts, iterpeek, iterchunk


def fromjson(source, *args, **kwargs):
Expand Down Expand Up @@ -140,6 +142,24 @@ def fromdicts(dicts, header=None, sample=1000, missing=None):
| 'c' | 2 |
+-----+-----+

Argument `dicts` can also be a generator; the output of the generator
is iterated and cached in a temporary file to support further
transforms and multiple passes over the table:

>>> import petl as etl
>>> dicts = ({"foo": chr(ord("a")+i), "bar":i+1} for i in range(3))
>>> table1 = etl.fromdicts(dicts, header=['foo', 'bar'])
>>> table1
+-----+-----+
| foo | bar |
+=====+=====+
| 'a' | 1 |
+-----+-----+
| 'b' | 2 |
+-----+-----+
| 'c' | 3 |
+-----+-----+

If `header` is not specified, `sample` items from `dicts` will be
inspected to discover dictionary keys. Note that the order in which
dictionary keys are discovered may not be stable,
Expand All @@ -156,6 +176,16 @@ def fromdicts(dicts, header=None, sample=1000, missing=None):
:func:`petl.transform.headers.sortheader` on the resulting table to
guarantee stability.

.. versionchanged:: 1.7.5

Full support of generators passed as `dicts` has been added, leveraging
`itertools.tee`.

.. versionchanged:: 1.7.11

Generator support has been modified to use temporary file cache
instead of `itertools.tee` due to high memory usage.

"""
view = DictsGeneratorView if inspect.isgenerator(dicts) else DictsView
return view(dicts, header=header, sample=sample, missing=missing)
Expand All @@ -175,9 +205,43 @@ def __iter__(self):

class DictsGeneratorView(DictsView):
    """
    Table view over a generator of dicts that supports repeated iteration.

    A generator can only be consumed once, so the first iteration that gets
    past the header drains it, pickling every row into a temporary file.
    That iteration and every later one then read the rows back from the
    file through `iterchunk`, which reopens the file on each call.
    """

    def __init__(self, dicts, header=None, sample=1000, missing=None):
        super(DictsGeneratorView, self).__init__(dicts, header, sample, missing)
        # Temporary file object holding the pickled rows; None until the
        # generator has started to be drained into it.
        self._filecache = None

    def __iter__(self):
        """Yield the header followed by every row, filling the cache if needed."""
        if not self._header:
            self._determine_header()
        yield self._header

        if not self._filecache:
            # delete=False: the file has to outlive this handle so that
            # iterchunk can reopen it by name; __del__ removes it later.
            self._filecache = NamedTemporaryFile(delete=False, mode='wb')
            try:
                for o in self.dicts:
                    row = tuple(o.get(f, self.missing) for f in self._header)
                    pickle.dump(row, self._filecache, protocol=-1)
            finally:
                # Always release the handle, even when the source generator
                # raises part-way through; close() also flushes.
                self._filecache.close()

        for row in iterchunk(self._filecache.name):
            yield row

    def _determine_header(self):
        """
        Build the header from the keys of up to `sample` leading items.

        The peeked items are chained back in front of the remaining ones and
        stored on `self.dicts`, so they are not lost for the row pass.
        Returns the re-chained iterator.
        """
        it = iter(self.dicts)
        header = []
        peek, it = iterpeek(it, self.sample)
        self.dicts = it
        if isinstance(peek, dict):
            # a single peeked item: wrap it so the loop below sees one entry
            peek = [peek]
        for o in peek:
            if hasattr(o, 'keys'):
                # keep first-seen order, skipping keys already collected
                header += [k for k in o.keys() if k not in header]
        self._header = tuple(header)
        return it

    def __del__(self):
        """Best-effort removal of the temporary cache file."""
        # getattr: __del__ also runs when __init__ failed before the
        # attribute was assigned.
        cache = getattr(self, '_filecache', None)
        if cache is None:
            return
        try:
            unlink(cache.name)
        except OSError:
            # File already removed or not removable; a finalizer must not
            # raise, so the failure is deliberately ignored.
            pass


def iterjlines(f, header, missing):
Expand Down
45 changes: 40 additions & 5 deletions petl/test/io/test_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


from collections import OrderedDict
from tempfile import NamedTemporaryFile
import json

Expand Down Expand Up @@ -121,7 +121,6 @@ def test_fromdicts_onepass():


def test_fromdicts_ordered():
from collections import OrderedDict
data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
OrderedDict([('foo', 'b')]),
OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])]
Expand Down Expand Up @@ -181,7 +180,6 @@ def test_fromdicts_header_does_not_raise():


def test_fromdicts_header_list():
from collections import OrderedDict
data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
OrderedDict([('foo', 'b'), ('bar', 2)]),
OrderedDict([('foo', 'c'), ('bar', 2)])]
Expand All @@ -195,10 +193,21 @@ def test_fromdicts_header_list():
ieq(expect, actual)
ieq(expect, actual)

def test_fromdicts_generator_twice():
    """A table built from a generator can be compared against twice."""
    def generator():
        for foo, bar in (('a', 1), ('b', 2), ('c', 2)):
            yield OrderedDict([('foo', foo), ('bar', bar)])

    actual = fromdicts(generator())
    expect = (
        ('foo', 'bar'),
        ('a', 1),
        ('b', 2),
        ('c', 2),
    )
    # two passes over the same table: the generator itself is spent after
    # the first one
    for _ in range(2):
        ieq(expect, actual)

def test_fromdicts_generator_header():
def generator():
yield OrderedDict([('foo', 'a'), ('bar', 1)])
yield OrderedDict([('foo', 'b'), ('bar', 2)])
Expand All @@ -213,3 +222,29 @@ def generator():
('c', 2))
ieq(expect, actual)
ieq(expect, actual)


def test_fromdicts_generator_random_access():
    """Two interleaved iterators over a generator-backed table agree."""
    def generator():
        for idx in range(5):
            yield OrderedDict([('n', idx), ('foo', 100*idx), ('bar', 200*idx)])

    tbl = fromdicts(generator(), sample=3)
    assert tbl.header() == ('n', 'foo', 'bar')
    # first pass
    pass1 = iter(tbl)
    pass1_row1 = next(pass1)
    pass1_row2 = next(pass1)
    # second pass, opened while the first one is still in progress
    pass2 = iter(tbl)
    pass2_row1 = next(pass2)
    pass2_row2 = next(pass2)
    assert pass1_row1 == pass2_row1
    assert pass1_row2 == pass2_row2
    # advance the two passes in the opposite order
    pass2_row3 = next(pass2)
    pass1_row3 = next(pass1)
    assert pass2_row3 == pass1_row3
    ieq(tbl, tbl)
    assert tbl.header() == ('n', 'foo', 'bar')
    assert len(tbl) == 6

Check warning

Code scanning / Bandit (reported by Codacy)

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.

Use of assert detected. The enclosed code will be removed when compiling to optimised byte code.
13 changes: 1 addition & 12 deletions petl/transform/sorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import petl.config as config
from petl.comparison import comparable_itemgetter
from petl.util.base import Table, asindices
from petl.util.base import iterchunk as _iterchunk


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -115,18 +116,6 @@ def sort(table, key=None, reverse=False, buffersize=None, tempdir=None,
Table.sort = sort


def _iterchunk(fn):
    """Yield each pickled object stored in the file named `fn`, in order."""
    # Every call opens its own handle, which keeps iterators over the same
    # file cache independent of one another.
    debug('iterchunk, opening %s' % fn)
    with open(fn, 'rb') as chunkfile:
        while True:
            try:
                item = pickle.load(chunkfile)
            except EOFError:
                # end of file reached: nothing more to load
                break
            yield item
    debug('end of iterchunk, closed %s' % fn)


class _Keyed(namedtuple('Keyed', ['key', 'obj'])):
# Override default behavior of namedtuple comparisons, only keys need to be compared for heapmerge
def __eq__(self, other):
Expand Down
18 changes: 17 additions & 1 deletion petl/util/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import absolute_import, print_function, division


import logging
import pickle

Check warning

Code scanning / Bandit (reported by Codacy)

Consider possible security implications associated with pickle module.

Consider possible security implications associated with pickle module.
import re
from itertools import islice, chain, cycle, product,\
permutations, combinations, takewhile, dropwhile, \
Expand All @@ -16,6 +17,9 @@
from petl.comparison import comparable_itemgetter


logger = logging.getLogger(__name__)
debug = logger.debug

class IterContainer(object):

def __contains__(self, item):
Expand Down Expand Up @@ -742,6 +746,18 @@ def iterpeek(it, n=1):
return peek, chain(peek, it)


def iterchunk(fn):
    """
    Yield each pickled object stored in the file named `fn`, in order.

    The file is opened afresh on every call and closed when the end of the
    file is reached, so several iterators over the same file do not share a
    read position.

    NOTE(review): the contents are read with `pickle.load`, so `fn` must
    only ever point at a file written by trusted code.
    """
    # reopen so iterators from file cache are independent
    debug('iterchunk, opening %s', fn)
    with open(fn, 'rb') as f:
        try:
            while True:
                yield pickle.load(f)
        except EOFError:
            # pickle.load signals the end of the stream with EOFError
            pass
    debug('end of iterchunk, closed %s', fn)

Check notice

Code scanning

Formatting a regular string which could be a f-string (consider-using-f-string)

Formatting a regular string which could be a f-string (consider-using-f-string)

Check notice

Code scanning

Use lazy % formatting in logging functions (logging-not-lazy)

Use lazy % formatting in logging functions (logging-not-lazy)


def empty():
"""
Return an empty table. Can be useful when building up a table from a set
Expand Down