-
Notifications
You must be signed in to change notification settings - Fork 191
/
bcolz.py
202 lines (163 loc) · 5.94 KB
/
bcolz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import itertools
from petl.compat import string_types, text_type
from petl.util.base import Table, iterpeek
from petl.io.numpy import construct_dtype
def frombcolz(source, expression=None, outcols=None, limit=None, skip=0):
    """Expose a bcolz ctable as a petl table, e.g.::

        >>> import petl as etl
        >>>
        >>> def example_from_bcolz():
        ...     import bcolz
        ...     cols = [
        ...         ['apples', 'oranges', 'pears'],
        ...         [1, 3, 7],
        ...         [2.5, 4.4, .1]
        ...     ]
        ...     names = ('foo', 'bar', 'baz')
        ...     ctbl = bcolz.ctable(cols, names=names)
        ...     return etl.frombcolz(ctbl)
        >>>
        >>> example_from_bcolz()  # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'apples'  |   1 | 2.5 |
        +-----------+-----+-----+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    When `expression` is given, bcolz evaluates it and only the rows that
    match are returned, e.g.::

        >>> tbl2 = etl.frombcolz(ctbl, expression='bar > 1')  # doctest: +SKIP
        >>> tbl2  # doctest: +SKIP
        +-----------+-----+-----+
        | foo       | bar | baz |
        +===========+=====+=====+
        | 'oranges' |   3 | 4.4 |
        +-----------+-----+-----+
        | 'pears'   |   7 | 0.1 |
        +-----------+-----+-----+

    The `outcols`, `limit` and `skip` arguments are handed unchanged to the
    view, which forwards them to the ctable's row iterator. No data is read
    here; the returned view does the work when it is iterated.

    .. versionadded:: 1.1.0

    """

    view_options = dict(
        expression=expression,
        outcols=outcols,
        limit=limit,
        skip=skip,
    )
    return BcolzView(source, **view_options)
class BcolzView(Table):
    """Lazy petl table backed by a bcolz ctable.

    `source` is either a string, which is opened read-only with
    `bcolz.open` on each iteration, or an object that is used directly as
    the ctable. The remaining arguments are stored and applied when the
    view is iterated.

    """

    def __init__(self, source, expression=None, outcols=None, limit=None,
                 skip=0):
        self.source, self.expression = source, expression
        self.outcols, self.limit, self.skip = outcols, limit, skip

    def __iter__(self):
        """Yield the header tuple, then each row produced by the ctable."""

        # resolve the ctable; bcolz is only imported when a path was given
        if not isinstance(self.source, string_types):
            ctbl = self.source
        else:
            import bcolz
            ctbl = bcolz.open(self.source, mode='r')

        # the header is the requested columns, or every column by default
        if self.outcols is not None:
            header = tuple(self.outcols)
            assert all(h in ctbl.names for h in header), 'invalid outcols'
        else:
            header = tuple(ctbl.names)
        yield header

        # the same selection arguments apply to filtered and plain reads
        selection = dict(outcols=self.outcols, skip=self.skip,
                         limit=self.limit)
        if self.expression is not None:
            rows = ctbl.where(self.expression, **selection)
        else:
            rows = ctbl.iter(**selection)

        for row in rows:
            yield row
def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Build a new bcolz ctable from the rows of `table`, e.g.::

        >>> import petl as etl
        >>>
        >>> def example_to_bcolz():
        ...     table = [('foo', 'bar', 'baz'),
        ...              ('apples', 1, 2.5),
        ...              ('oranges', 3, 4.4),
        ...              ('pears', 7, .1)]
        ...     return etl.tobcolz(table)
        >>>
        >>> ctbl = example_to_bcolz()  # doctest: +SKIP
        >>> ctbl  # doctest: +SKIP
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names  # doctest: +SKIP
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']  # doctest: +SKIP
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    `dtype` and a peek at the first `sample` items of the table are given
    to `construct_dtype` to obtain the record dtype. Any other keyword
    arguments go straight to the ctable constructor; `expectedlen`
    defaults to 1000000 and `mode` to ``'w'`` when not supplied.

    Returns the ctable after it has been filled and flushed.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    # peek ahead for dtype inference without consuming the iterator
    source_rows = iter(table)
    peek, source_rows = iterpeek(source_rows, sample)
    header = next(source_rows)

    # structured arrays want each record as a tuple, so coerce every row
    records = (tuple(row) for row in source_rows)

    field_names = [text_type(name) for name in header]
    dtype = construct_dtype(field_names, peek, dtype)

    # start from an empty ctable carrying the right dtype
    for option, default in (('expectedlen', 1000000), ('mode', 'w')):
        kwargs.setdefault(option, default)
    ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)

    # append in batches sized to the mean chunk length of the columns
    column_chunklens = [ctbl.cols[name].chunklen for name in ctbl.names]
    chunklen = sum(column_chunklens) // len(ctbl.names)
    more = True
    while more:
        batch = np.array(list(itertools.islice(records, chunklen)),
                         dtype=dtype)
        ctbl.append(batch)
        # a short batch means the input has been exhausted
        more = len(batch) >= chunklen

    ctbl.flush()
    return ctbl
def appendbcolz(table, obj, check_names=True):
    """Append data into a bcolz ctable. The `obj` argument can be either an
    existing ctable or the name of a directory where an on-disk ctable is
    stored.

    If `check_names` is true, the table's header (converted to text) must
    equal the ctable's column names, otherwise an `AssertionError` is
    raised. An `AssertionError` is also raised when `obj` is neither a
    string nor an object with `append` and `names` attributes.

    Returns the ctable after the rows have been appended and flushed.

    .. versionadded:: 1.1.0

    """

    import numpy as np

    if isinstance(obj, string_types):
        # bcolz is only needed here, to open an on-disk ctable by path
        import bcolz
        ctbl = bcolz.open(obj, mode='a')
    else:
        # wrap obj in a 1-tuple so that a tuple argument is formatted with
        # %r instead of being unpacked as the format arguments
        assert hasattr(obj, 'append') and hasattr(obj, 'names'), \
            'expected rootdir or ctable, found %r' % (obj,)
        ctbl = obj

    # setup
    dtype = ctbl.dtype
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    # check names match
    if check_names:
        assert tuple(flds) == tuple(ctbl.names), 'column names do not match'

    # numpy is fussy about having tuples for structured dtypes, so make
    # sure every row is one (rows from a table may be lists)
    it = (tuple(row) for row in it)

    # fill chunk-wise, using the mean chunk length of the columns
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        # a short chunk means the input has been exhausted
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl