Skip to content

Commit

Permalink
Merge 7a326b1 into 0be2735
Browse files Browse the repository at this point in the history
  • Loading branch information
arturponinski committed Jul 13, 2022
2 parents 0be2735 + 7a326b1 commit f0ec1c3
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 22 deletions.
44 changes: 40 additions & 4 deletions petl/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@

# standard library dependencies
import io
import itertools
import json
import inspect
from json.encoder import JSONEncoder
from os import unlink
from tempfile import NamedTemporaryFile

from petl.compat import PY2
from petl.compat import pickle
from petl.io.sources import read_source_from_arg, write_source_from_arg
# internal dependencies
from petl.util.base import data, Table, dicts as _dicts, iterpeek
from petl.util.base import data, Table, dicts as _dicts, iterpeek, iterchunk


def fromjson(source, *args, **kwargs):
Expand Down Expand Up @@ -175,9 +177,43 @@ def __iter__(self):

class DictsGeneratorView(DictsView):
    """Table view over a single-pass iterable of dicts (e.g. a generator).

    The rows are pickled to a temporary file the first time the view is
    iterated; every iterator, including that first one, then reads the rows
    back from the file. This lets the view be iterated repeatedly, and by
    several iterators at once, even though the source can only be consumed
    once. The temporary file is removed when the view is garbage collected.
    """

    def __init__(self, dicts, header=None, sample=1000, missing=None):
        super(DictsGeneratorView, self).__init__(dicts, header, sample, missing)
        # closed NamedTemporaryFile holding the pickled rows; stays None
        # until the first iteration has written it completely
        self._filecache = None

    def __iter__(self):
        """Yield the header, then every data row read back from the cache."""
        if not self._header:
            self._determine_header()
        yield self._header

        if self._filecache is None:
            self._filecache = self._write_filecache()

        # iterchunk reopens the file by name, so each iterator gets its own
        # independent read position
        for row in iterchunk(self._filecache.name):
            yield row

    def _write_filecache(self):
        """Pickle all remaining source rows to a temp file and return it.

        The returned file object is already closed; only its ``name`` is
        meant to be used. If the source raises part-way through, the partial
        file is deleted and the exception propagates, so a truncated cache is
        never recorded on the view.
        """
        header = self._header
        missing = self.missing
        cache = NamedTemporaryFile(delete=False, mode='wb')
        complete = False
        try:
            # the with-block flushes and closes the file on every exit path
            with cache:
                for o in self.dicts:
                    row = tuple(o[f] if f in o else missing for f in header)
                    pickle.dump(row, cache, protocol=-1)
            complete = True
        finally:
            if not complete:
                self._remove_quietly(cache.name)
        return cache

    def _determine_header(self):
        """Build the header from the keys of the first `sample` dicts.

        Field names are kept in first-seen order. Returns the iterator that
        now stands in for ``self.dicts``.
        """
        it = iter(self.dicts)
        peek, it = iterpeek(it, self.sample)
        # peeking consumed items from the single-pass source; keep the
        # iterator handed back by iterpeek so iteration continues from it
        self.dicts = it
        # a lone dict is wrapped so the loop below sees a sequence of dicts
        # (presumably what iterpeek returns for sample == 1 -- confirm)
        if isinstance(peek, dict):
            peek = [peek]
        header = list()
        for o in peek:
            if hasattr(o, 'keys'):
                header += [k for k in o.keys() if k not in header]
        self._header = tuple(header)
        return it

    @staticmethod
    def _remove_quietly(path):
        """Delete `path`, ignoring the case where it cannot be removed."""
        try:
            unlink(path)
        except OSError:
            # best-effort cleanup: the file may already be gone
            pass

    def __del__(self):
        # getattr: __del__ also runs when __init__ failed before the
        # attribute was assigned
        cache = getattr(self, '_filecache', None)
        if cache is not None:
            self._remove_quietly(cache.name)


def iterjlines(f, header, missing):
Expand Down
45 changes: 40 additions & 5 deletions petl/test/io/test_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


from collections import OrderedDict
from tempfile import NamedTemporaryFile
import json

Expand Down Expand Up @@ -121,7 +121,6 @@ def test_fromdicts_onepass():


def test_fromdicts_ordered():
from collections import OrderedDict
data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
OrderedDict([('foo', 'b')]),
OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])]
Expand Down Expand Up @@ -181,7 +180,6 @@ def test_fromdicts_header_does_not_raise():


def test_fromdicts_header_list():
from collections import OrderedDict
data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
OrderedDict([('foo', 'b'), ('bar', 2)]),
OrderedDict([('foo', 'c'), ('bar', 2)])]
Expand All @@ -195,10 +193,21 @@ def test_fromdicts_header_list():
ieq(expect, actual)
ieq(expect, actual)

def test_fromdicts_generator_twice():
    """A table built from a generator of dicts can be iterated twice."""
    def generator():
        yield OrderedDict([('foo', 'a'), ('bar', 1)])
        yield OrderedDict([('foo', 'b'), ('bar', 2)])
        yield OrderedDict([('foo', 'c'), ('bar', 2)])

    actual = fromdicts(generator())
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2),
              ('c', 2))
    # the second comparison re-iterates the table after the generator has
    # already been exhausted by the first one
    ieq(expect, actual)
    ieq(expect, actual)

def test_fromdicts_generator_header():
def generator():
yield OrderedDict([('foo', 'a'), ('bar', 1)])
yield OrderedDict([('foo', 'b'), ('bar', 2)])
Expand All @@ -213,3 +222,29 @@ def generator():
('c', 2))
ieq(expect, actual)
ieq(expect, actual)


def test_fromdicts_generator_random_access():
    """Interleaved iterators over a generator-backed table must agree."""
    def numbered_dicts():
        for i in range(5):
            yield OrderedDict([('n', i), ('foo', 100*i), ('bar', 200*i)])

    table = fromdicts(numbered_dicts(), sample=3)
    assert table.header() == ('n', 'foo', 'bar')

    # take two items from a first iterator ...
    first = iter(table)
    first_items = [next(first), next(first)]
    # ... then two from a second iterator started afterwards
    second = iter(table)
    second_items = [next(second), next(second)]
    assert first_items[0] == second_items[0]
    assert first_items[1] == second_items[1]

    # advance the iterators again, this time the second one first
    third_from_second = next(second)
    third_from_first = next(first)
    assert third_from_second == third_from_first

    # the table is still fully re-iterable while both iterators are open
    ieq(table, table)
    assert table.header() == ('n', 'foo', 'bar')
    assert len(table) == 6
13 changes: 1 addition & 12 deletions petl/transform/sorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import petl.config as config
from petl.comparison import comparable_itemgetter
from petl.util.base import Table, asindices
from petl.util.base import iterchunk as _iterchunk


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -115,18 +116,6 @@ def sort(table, key=None, reverse=False, buffersize=None, tempdir=None,
Table.sort = sort


def _iterchunk(fn):
    """Yield, in order, each object pickled into the file named `fn`."""
    # the file is opened anew on every call, which keeps iterators reading
    # from the same file cache independent of one another
    debug('iterchunk, opening %s' % fn)
    with open(fn, 'rb') as chunkfile:
        load = pickle.load
        try:
            while True:
                yield load(chunkfile)
        except EOFError:
            # raised by pickle.load once the file is exhausted
            pass
    debug('end of iterchunk, closed %s' % fn)


class _Keyed(namedtuple('Keyed', ['key', 'obj'])):
# Override default behavior of namedtuple comparisons, only keys need to be compared for heapmerge
def __eq__(self, other):
Expand Down
18 changes: 17 additions & 1 deletion petl/util/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import absolute_import, print_function, division


import logging
import pickle
import re
from itertools import islice, chain, cycle, product,\
permutations, combinations, takewhile, dropwhile, \
Expand All @@ -16,6 +17,9 @@
from petl.comparison import comparable_itemgetter


# Module-level logger named after this module; `debug` is a short alias for
# its debug() method, used by helpers in this module such as iterchunk.
logger = logging.getLogger(__name__)
debug = logger.debug

class IterContainer(object):

def __contains__(self, item):
Expand Down Expand Up @@ -742,6 +746,18 @@ def iterpeek(it, n=1):
return peek, chain(peek, it)


def iterchunk(fn):
    """Yield, in order, each object pickled into the file named `fn`.

    The file is opened anew on every call so that several iterators reading
    the same file cache are independent of one another. Iteration stops when
    ``pickle.load`` reports end-of-file.

    NOTE: unpickling can execute arbitrary code, so `fn` must only name a
    file written by this process (such as a temporary cache), never data
    obtained from an untrusted source.
    """
    # `fn` is passed as a logging argument rather than pre-formatted with
    # `%`: the message is only built when debug logging is enabled, and a
    # tuple-valued `fn` cannot break the interpolation
    debug('iterchunk, opening %s', fn)
    with open(fn, 'rb') as f:
        try:
            while True:
                yield pickle.load(f)
        except EOFError:
            # raised by pickle.load once the file is exhausted
            pass
    debug('end of iterchunk, closed %s', fn)


def empty():
"""
Return an empty table. Can be useful when building up a table from a set
Expand Down

0 comments on commit f0ec1c3

Please sign in to comment.