Commit
* read digital info sizes from env vars.
* implement utils to split iterables into batches.
* split large sql inserts into batches.
* document insert max size env var.

Co-authored-by: c0c0n3 <c0c0n3@users.noreply.github.com>
Showing 10 changed files with 473 additions and 45 deletions. (Only two of the changed files are rendered in this excerpt.)
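The commit message also mentions documenting the new env var; that documentation isn't rendered here. As a hedged sketch, configuring the cap before starting the service might look like this. The '2MiB' format is an assumption based on the bitmath-style BitSizeVar parser used in insert_splitter.py below; the tests further down use plain byte counts like '1024B'.

import os

# Hypothetical configuration: cap each SQL bulk insert at roughly 2 MiB.
# The exact accepted size formats depend on the BitSizeVar parser; plain
# byte counts such as '2097152B' are what the tests below use.
os.environ['INSERT_MAX_SIZE'] = '2MiB'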
translators/insert_splitter.py (new file)
@@ -0,0 +1,68 @@
import logging
from objsize import get_deep_size
from typing import Optional, Tuple

from utils.cfgreader import BitSizeVar, EnvReader
from utils.itersplit import IterCostSplitter


INSERT_MAX_SIZE_VAR = 'INSERT_MAX_SIZE'
"""
The name of the environment variable to configure the insert max size.
"""


def _log():
    return logging.getLogger(__name__)


def configured_insert_max_size_in_bytes() -> Optional[int]:
    """
    Read the insert max size env var and return its value in bytes if
    set to a parsable value, or ``None`` otherwise. Note that if a value
    is present but unparsable, we still return ``None``, but we also
    log a warning.

    :return: the max size in bytes if available, ``None`` otherwise.
    """
    env_reader = EnvReader(log=_log().debug)
    parsed = env_reader.safe_read(BitSizeVar(INSERT_MAX_SIZE_VAR, None))
    if parsed:
        return int(parsed.to_Byte())
    return None


def compute_row_size(r: Tuple) -> int:
    """
    Compute the memory size, in bytes, of the given row's components.

    :param r: the row to insert.
    :return: the size in bytes.
    """
    component_sizes = [get_deep_size(k) for k in r]
    return sum(component_sizes)


def to_insert_batches(rows: [Tuple]) -> [[Tuple]]:
    """
    Split the SQL rows to insert into batches so the Translator can insert
    each batch separately, i.e. issue a SQL insert statement for each batch
    as opposed to a single insert for the whole input lot. We do this since
    some backends (e.g. Crate) have a cap on how much data you can shovel
    in a single SQL (bulk) insert statement---see #445 about it.

    Split only if the insert max size env var holds a valid value. (If
    that's not the case, return a single batch with all input rows.)
    Splitting happens as explained in the ``IterCostSplitter`` docs, with
    ``compute_row_size`` as the cost function (so the cost of each input
    row is the number of bytes its components take up in memory) and the
    value of the env var as the maximum batch size, i.e. the maximum
    cost in bytes.

    :param rows: the rows the SQL translator lined up for an insert.
    :return: the insert batches.
    """
    config_max_cost = configured_insert_max_size_in_bytes()
    if config_max_cost is None:
        return [rows]
    splitter = IterCostSplitter(cost_fn=compute_row_size,
                                batch_max_cost=config_max_cost)
    return splitter.list_batches(rows)
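For illustration, a quick usage sketch of the function above. The rows are made up; the import path follows the test module's own import of translators.insert_splitter; and the exact batch boundaries depend on IterCostSplitter, so they are hedged in the comments.

import os
from translators.insert_splitter import to_insert_batches

# Hypothetical rows a translator might line up for a bulk insert.
rows = [('e1', 'Room', '2021-01-01T00:00:00Z', 21.5),
        ('e2', 'Room', '2021-01-01T00:00:00Z', 22.0)]

os.environ['INSERT_MAX_SIZE'] = '64B'   # tiny cap to force splitting
batches = to_insert_batches(rows)       # e.g. [[rows[0]], [rows[1]]]

os.environ['INSERT_MAX_SIZE'] = ''      # no valid value any more, so...
assert to_insert_batches(rows) == [rows]   # ...one batch with all rows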
New test module under translators/tests (exact file name not shown in this excerpt)
@@ -0,0 +1,123 @@
from itertools import takewhile
import os
import pytest
import sys

from translators.base_translator import TIME_INDEX_NAME
from translators.insert_splitter import INSERT_MAX_SIZE_VAR
from translators.tests.original_data_scenarios import full_table_name, \
    gen_tenant_id, gen_entity, OriginalDataScenarios
from translators.tests.test_original_data import translators, \
    with_crate, with_timescale
# NOTE. ^ your IDE is likely to tell you this is dead code, but it isn't:
# we actually need to bring those two fixtures into scope to use them
# with the lazy_fixture calls in 'translators'.


def set_insert_max_size(number_of_bytes: int):
    os.environ[INSERT_MAX_SIZE_VAR] = f"{number_of_bytes}B"


def clear_insert_max_size():
    os.environ[INSERT_MAX_SIZE_VAR] = ''


class DataGen:

    def __init__(self, insert_max_size: int, min_batches: int):
        self.insert_max_size = insert_max_size
        self.min_batches = min_batches
        self.unique_tenant_id = gen_tenant_id()

    @staticmethod
    def _compute_insert_vector_size_lower_bound(entity: dict) -> int:
        vs = entity['id'], entity['type'], entity[TIME_INDEX_NAME], \
            entity['a_number']['value'], entity['an_attr']['value']
        sz = [sys.getsizeof(v) for v in vs]
        return sum(sz)
        # NOTE. Lower bound since it doesn't include e.g. the FIWARE
        # service.

    def _next_entity(self) -> (dict, int):
        eid = 0
        size = 0
        while True:
            eid += 1
            e = gen_entity(entity_id=eid, attr_type='Number', attr_value=1)
            size += self._compute_insert_vector_size_lower_bound(e)
            yield e, size

    def generate_insert_payload(self) -> [dict]:
        """
        Generate enough data that when the SQL translator is configured
        with the given insert_max_size value, it'll have to split the
        payload into at least min_batches.

        :return: the entities to insert.
        """
        sz = self.insert_max_size * self.min_batches
        ts = takewhile(lambda t: t[1] <= sz, self._next_entity())
        return [t[0] for t in ts]
    # NOTE. The actual number of batches is >= min_batches. In fact, say
    # each entity row vector is actually 10 bytes, but our computed lower
    # bound is 5. Then with insert_max_size=10 and min_batches=3, the
    # generated payload will have 6 entities in it, for a total of 60
    # bytes, which the translator should then split into 6 batches. (The
    # greedy sketch after this listing checks this arithmetic.)


class TestDriver:

    def __init__(self, translator: OriginalDataScenarios,
                 test_data: DataGen):
        self.translator = translator
        self.data = test_data

    def _do_insert(self, entities: [dict]):
        try:
            tid = self.data.unique_tenant_id
            self.translator.insert_entities(tid, entities)
        finally:
            clear_insert_max_size()

    def _assert_row_count(self, expected: int):
        table = full_table_name(self.data.unique_tenant_id)
        stmt = f"select count(*) as count from {table}"
        r = self.translator.query(stmt)
        assert r[0]['count'] == expected

    def run(self, with_batches: bool):
        if with_batches:
            set_insert_max_size(self.data.insert_max_size)

        entities = self.data.generate_insert_payload()
        self._do_insert(entities)
        self._assert_row_count(len(entities))


@pytest.mark.parametrize('translator', translators,
                         ids=['timescale', 'crate'])
def test_insert_all_entities_in_one_go(translator):
    test_data = DataGen(insert_max_size=1024, min_batches=2)
    driver = TestDriver(translator, test_data)
    driver.run(with_batches=False)


@pytest.mark.parametrize('translator', translators,
                         ids=['timescale', 'crate'])
@pytest.mark.parametrize('min_batches', [2, 3, 4])
def test_insert_entities_in_batches(translator, min_batches):
    test_data = DataGen(insert_max_size=1024, min_batches=min_batches)
    driver = TestDriver(translator, test_data)
    driver.run(with_batches=True)


# NOTE. Couldn't reproduce #445.
# You can try this, but the exception I get is, weirdly enough, a
# connection exception. Python will crunch data in memory for about
# 30 mins, then the translator mysteriously fails with a connection
# exception, even though Crate is up and running...
#
# def test_huge_crate_insert(with_crate):
#     test_data = DataGen(insert_max_size=2*1024*1024, min_batches=1024)
#     # ^ should produce at least 2GiB worth of entities!!
#     driver = TestDriver(with_crate, test_data)
#     driver.run(with_batches=True)
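The IterCostSplitter implementation itself isn't part of this diff excerpt. Below is a minimal greedy sketch of the cost-based splitting idea, written only to check the arithmetic in the batching NOTE above; greedy_batches is a hypothetical stand-in, not the library's actual API.

from typing import Callable, Iterable, List


def greedy_batches(items: Iterable, cost_fn: Callable,
                   batch_max_cost: int) -> List[list]:
    # Start a new batch whenever adding the next item would push the
    # running cost past batch_max_cost. (A sketch of the idea only.)
    batches, current, cost = [], [], 0
    for item in items:
        c = cost_fn(item)
        if current and cost + c > batch_max_cost:
            batches.append(current)
            current, cost = [], 0
        current.append(item)
        cost += c
    if current:
        batches.append(current)
    return batches


# The NOTE's example: 6 rows of 10 bytes each, with a 10-byte cap per
# batch, should yield 6 batches of one row each.
rows = ['x' * 10] * 6                   # stand-in rows, "cost" 10 each
bs = greedy_batches(rows, cost_fn=len, batch_max_cost=10)
assert len(bs) == 6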