-
-
Notifications
You must be signed in to change notification settings - Fork 212
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
309 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
# Changelog | ||
|
||
## [Unreleased] | ||
### Added | ||
- LowCardinality type. | ||
|
||
## [0.0.19] - 2019-03-31 | ||
### Added | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ Features | |
* UUID | ||
* Decimal | ||
* IPv4/IPv6 | ||
* LowCardinality(T) | ||
|
||
- Query progress information. | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
from math import log | ||
|
||
from ..reader import read_binary_uint64 | ||
from ..writer import write_binary_int64 | ||
from .base import Column | ||
from .intcolumn import UInt8Column, UInt16Column, UInt32Column, UInt64Column | ||
|
||
|
||
def create_low_cardinality_column(spec, column_by_spec_getter):
    """
    Builds a LowCardinalityColumn from a ``LowCardinality(T)`` type spec.

    The leading 15 characters (the length of ``LowCardinality(``) and the
    trailing closing parenthesis are cut off the spec; the remainder is
    resolved to the nested column by calling ``column_by_spec_getter``.
    """
    nested_spec = spec[len('LowCardinality('):-1]
    return LowCardinalityColumn(column_by_spec_getter(nested_spec))
|
||
|
||
class LowCardinalityColumn(Column):
    """
    Stores column as index (unique elements) and keys.
    Good for de-duplication of large values with low cardinality.

    On the wire the column is a dictionary of distinct values (the index),
    serialized by the nested column, followed by one integer key per item
    pointing into that dictionary.
    """
    # Key column class by the key type code stored in the low bits of the
    # serialization type.
    int_types = {
        0: UInt8Column,
        1: UInt16Column,
        2: UInt32Column,
        3: UInt64Column
    }

    # Need to read additional keys.
    # Additional keys are stored before indexes as value N and N keys
    # after them.
    has_additional_keys_bit = 1 << 9
    # Need to update dictionary.
    # It means that previous granule has different dictionary.
    need_update_dictionary = 1 << 10

    # Flags that are always set on written data; the key type code is OR-ed
    # into this value in _write_data.
    serialization_type = has_additional_keys_bit | need_update_dictionary

    def __init__(self, nested_column, **kwargs):
        """
        :param nested_column: column used to read and write the index values.
        :param kwargs: passed through to the base column constructor.
        """
        self.nested_column = nested_column
        super(LowCardinalityColumn, self).__init__(**kwargs)

    def read_state_prefix(self, buf):
        """Reads the UInt64 state prefix from ``buf`` and returns it."""
        return read_binary_uint64(buf)

    def write_state_prefix(self, buf):
        """Writes the state prefix (always 1) to ``buf``."""
        # KeysSerializationVersion. See ClickHouse docs.
        write_binary_int64(1, buf)

    def _write_data(self, items, buf):
        """
        Writes ``items`` to ``buf`` as an index of distinct values and keys.

        Layout: serialization type, index size, index values (written by the
        nested column without a null map), number of keys, keys.

        :param items: sized sequence of hashable values; ``None`` stands for
                      NULL when the nested column is nullable.
        :param buf: buffer to write to.
        """
        # Do not write anything for empty column.
        # May happen while writing empty arrays. _read_data consumes nothing
        # for zero items, so this must hold for nullable columns as well.
        if len(items) == 0:
            return

        nested = self.nested_column
        nullable = nested.nullable

        # First index element represents NULL if column is nullable, so keys
        # of real values are shifted by one.
        # NOTE(review): the NULL placeholder is the integer 0; presumably it
        # is only serializable by numeric nested columns - confirm for
        # String-like nested types.
        index = [0] if nullable else []
        shift = 1 if nullable else 0

        keys = []
        key_by_index_element = {}

        for x in items:
            if nullable and x is None:
                # Zero element for null.
                keys.append(0)
                continue

            key = key_by_index_element.get(x)
            # Get key from index or add it to index.
            if key is None:
                key = len(key_by_index_element)
                key_by_index_element[x] = key
                index.append(x)

            keys.append(key + shift)

        # Key width code is derived from the index size.
        int_type = int(log(len(index), 2) / 8)
        int_column = self.int_types[int_type]()

        serialization_type = self.serialization_type | int_type

        write_binary_int64(serialization_type, buf)
        write_binary_int64(len(index), buf)

        # Prevent null map writing: reset nested column nullable flag while
        # the index is written and restore it afterwards, so the column
        # object keeps handling NULLs if it is used again.
        nested.nullable = False
        try:
            nested.write_data(index, buf)
        finally:
            nested.nullable = nullable

        write_binary_int64(len(items), buf)
        int_column.write_data(keys, buf)

    def _read_data(self, n_items, buf, nulls_map=None):
        """
        Reads ``n_items`` values from ``buf``.

        :param n_items: number of items to read; nothing is consumed from
                        ``buf`` when it is zero.
        :param buf: buffer to read from.
        :param nulls_map: not used by this column.
        :return: tuple of values; NULLs are returned as ``None`` when the
                 nested column is nullable.
        """
        if not n_items:
            return tuple()

        serialization_type = read_binary_uint64(buf)

        # Lowest four bits contain info about key type.
        key_type = serialization_type & 0xf
        keys_column = self.int_types[key_type]()

        nested = self.nested_column
        nullable = nested.nullable
        # Prevent null map reading: reset nested column nullable flag while
        # the index is read and restore it afterwards.
        nested.nullable = False
        try:
            index_size = read_binary_uint64(buf)
            index = nested.read_data(index_size, buf)
        finally:
            nested.nullable = nullable

        if nullable:
            # First index element represents NULL.
            index = (None, ) + index[1:]

        read_binary_uint64(buf)  # number of keys
        keys = keys_column.read_data(n_items, buf)

        return tuple(index[x] for x in keys)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
from datetime import date, timedelta | ||
from decimal import Decimal | ||
|
||
from tests.testcase import BaseTestCase | ||
from tests.util import require_server_version | ||
|
||
|
||
class LowCardinalityTestCase(BaseTestCase):
    """
    Round-trip tests for LowCardinality columns.

    Every test creates a table with a single column ``a``, inserts rows
    through the client and checks what is selected back, either through
    ``self.emit_cli`` (text) or through ``self.client.execute`` (rows).
    """

    @require_server_version(19, 3, 3)
    def test_uint8(self):
        """Values 0..254 in LowCardinality(UInt8)."""
        with self.create_table('a LowCardinality(UInt8)'):
            rows = [(value, ) for value in range(255)]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            expected_text = '\n'.join(str(row[0]) for row in rows) + '\n'
            self.assertEqual(self.emit_cli(select), expected_text)

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_int8(self):
        """Values -127..127 in LowCardinality(Int8)."""
        with self.create_table('a LowCardinality(Int8)'):
            rows = [(value, ) for value in range(-127, 128)]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            expected_text = '\n'.join(str(row[0]) for row in rows) + '\n'
            self.assertEqual(self.emit_cli(select), expected_text)

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_nullable_int8(self):
        """NULLs mixed with values in LowCardinality(Nullable(Int8))."""
        with self.create_table('a LowCardinality(Nullable(Int8))'):
            rows = [(None, ), (-1, ), (0, ), (1, ), (None, )]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            self.assertEqual(self.emit_cli(select), '\\N\n-1\n0\n1\n\\N\n')

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_date(self):
        """300 consecutive dates starting at 1970-01-01."""
        with self.create_table('a LowCardinality(Date)'):
            epoch = date(1970, 1, 1)
            rows = [(epoch + timedelta(days=offset), )
                    for offset in range(300)]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            selected = self.client.execute('SELECT * FROM test')
            self.assertEqual(selected, rows)

    @require_server_version(19, 3, 3)
    def test_float(self):
        """300 distinct float values."""
        with self.create_table('a LowCardinality(Float)'):
            rows = [(float(value), ) for value in range(300)]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            selected = self.client.execute('SELECT * FROM test')
            self.assertEqual(selected, rows)

    @require_server_version(19, 3, 3)
    def test_decimal(self):
        """
        300 distinct Decimal values.

        The column is declared as LowCardinality(Float), so the selected
        rows are compared by value with the inserted Decimal objects.
        """
        with self.create_table('a LowCardinality(Float)'):
            rows = [(Decimal(value), ) for value in range(300)]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            selected = self.client.execute('SELECT * FROM test')
            self.assertEqual(selected, rows)

    @require_server_version(19, 3, 3)
    def test_array(self):
        """LowCardinality nested inside an array."""
        with self.create_table('a Array(LowCardinality(Int16))'):
            rows = [((100, 500), )]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            self.assertEqual(self.emit_cli(select), '[100,500]\n')

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_empty_array(self):
        """Empty array of LowCardinality values."""
        with self.create_table('a Array(LowCardinality(Int16))'):
            rows = [((), )]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            self.assertEqual(self.emit_cli(select), '[]\n')

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_string(self):
        """Repeated and empty strings in LowCardinality(String)."""
        with self.create_table('a LowCardinality(String)'):
            rows = [
                ('test', ), ('low', ), ('cardinality', ),
                ('test', ), ('test', ), ('', )
            ]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            expected_text = 'test\nlow\ncardinality\ntest\ntest\n\n'
            self.assertEqual(self.emit_cli(select), expected_text)

            self.assertEqual(self.client.execute(select), rows)

    @require_server_version(19, 3, 3)
    def test_fixed_string(self):
        """Repeated and empty strings in LowCardinality(FixedString(12))."""
        with self.create_table('a LowCardinality(FixedString(12))'):
            rows = [
                ('test', ), ('low', ), ('cardinality', ),
                ('test', ), ('test', ), ('', )
            ]
            self.client.execute('INSERT INTO test (a) VALUES', rows)

            select = 'SELECT * FROM test'
            # The text output shows every value padded to 12 bytes with \0.
            expected_text = (
                'test\\0\\0\\0\\0\\0\\0\\0\\0\n'
                'low\\0\\0\\0\\0\\0\\0\\0\\0\\0\n'
                'cardinality\\0\n'
                'test\\0\\0\\0\\0\\0\\0\\0\\0\n'
                'test\\0\\0\\0\\0\\0\\0\\0\\0\n'
                '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\n'
            )
            self.assertEqual(self.emit_cli(select), expected_text)

            self.assertEqual(self.client.execute(select), rows)