def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
    """
    Find the end of the JSON value that starts at `index`.

    Returns the index of the first byte that no longer belongs to the value.
    For strings, lists and dicts this is the position right after the closing
    quote or bracket. For bare scalars (numbers, `true`, `false`, `null`) it
    is the position of the comma, newline or enclosing closing bracket that
    terminates them.

    Args:
    - `json_bytes`: A bytes object containing valid JSON when decoded
    - `index`: The index at which the value starts; whitespace in front of
      the value is skipped

    Raises:
    - `TypeError`: If `json_bytes` ends before a terminator is found. A bare
      scalar that runs to the very end of `json_bytes` counts as unterminated.
    """

    # See https://www.json.org/json-en.html for the JSON syntax

    in_str, list_depth, dict_depth = False, 0, 0
    i, len_json_bytes = index, len(json_bytes)

    # Skip whitespace in front of the value. Without this, a value that
    # starts on the line after its key would hit the newline exit point
    # below and be reported as empty.
    leading_whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE, ord("\r"))
    while i < len_json_bytes and json_bytes[i] in leading_whitespace:
        i += 1

    while i < len_json_bytes:
        current = json_bytes[i]

        # If backslash, skip the next character
        if current == byte_codes.BACKSLASH:
            i += 1
        # If quote, toggle in_str
        elif current == byte_codes.QUOTE:
            in_str = not in_str
            # Possible exit point where string ends and nesting is zero
            if not in_str and list_depth == 0 and dict_depth == 0:
                return i + 1
        # If in string, skip
        elif in_str:
            pass

        # Invariant: Not in_str, not escaped

        # Handle opening brackets
        elif current == byte_codes.OPEN_SQUARE:
            list_depth += 1
        elif current == byte_codes.OPEN_CURLY:
            dict_depth += 1
        # Handle closing brackets
        elif current == byte_codes.CLOSE_SQUARE or current == byte_codes.CLOSE_CURLY:
            if current == byte_codes.CLOSE_SQUARE:
                list_depth -= 1
            else:
                dict_depth -= 1
            # The value itself was a list or dict and has just been closed
            if list_depth == 0 and dict_depth == 0:
                return i + 1
            # The bracket belongs to the enclosing container, so a bare
            # scalar ended right before it. Case: {"a": 1} or [1, 2]
            if (list_depth == 0 and dict_depth == -1) or (list_depth == -1 and dict_depth == 0):
                return i
        # A comma or newline outside of any nesting terminates a bare scalar
        elif (current == byte_codes.COMMA or current == byte_codes.NEWLINE) and list_depth == 0 and dict_depth == 0:
            return i

        i += 1

    raise TypeError("Invalid JSON")
import itertools
import orjson
from dictdatabase import utils, io_unsafe, byte_codes


def test_seek_index_through_value_bytes(use_test_dir):
    """Pin the end index returned for hand-written spaced and compact JSON."""
    v = b'{"a": 1, "b": {}}'
    vc = b'{"a":1,"b":{}}'

    assert utils.seek_index_through_value_bytes(v, 5) == 7
    assert utils.seek_index_through_value_bytes(v, 6) == 7
    assert utils.seek_index_through_value_bytes(vc, 5) == 6

    assert utils.seek_index_through_value_bytes(v, 13) == 16
    assert utils.seek_index_through_value_bytes(vc, 11) == 13

    n = b'{"a": 1234, "b": {"c": 2}}'
    assert utils.seek_index_through_value_bytes(n, 5) == 10
    assert utils.seek_index_through_value_bytes(n, 6) == 10


def load_with_orjson(bytes, key):
    """Reference implementation: parse the whole document and pick `key`."""
    return orjson.loads(bytes)[key]


def load_with_seeker(bytes, key):
    """
    Parse only the value of `key`, located with seek_index_through_value_bytes.

    Raises:
    - `KeyError`: If `"<key>":` does not occur in `bytes`
    """
    key_bytes = f"\"{key}\":".encode()
    key_start = bytes.find(key_bytes)
    # find() signals "not found" with -1; adding the key length to that would
    # silently yield a bogus start offset instead of failing.
    if key_start == -1:
        raise KeyError(key)
    a_val_start = key_start + len(key_bytes)
    if bytes[a_val_start] == byte_codes.SPACE:
        a_val_start += 1
    a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
    return orjson.loads(bytes[a_val_start:a_val_end])


def test_seek_index_through_value_bytes_2(use_test_dir):
    """Compare the seeker against orjson for every pair of sample values."""

    def orjson_dump_with_indent(data):
        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)

    def orjson_dump_without_indent(data):
        return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)

    orjson_dump_settings = [orjson_dump_with_indent, orjson_dump_without_indent]

    values = [
        # Lists
        [],
        [1, 2, 3],
        ["xs", "value", "c"],
        [1, "xs", 2, "value", 3, "c"],
        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
        [{}, {}, {}],
        [{"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
        # Dicts
        {},
        {"xs": 1},
        {"xs": 1, "value": 2},
        {"xs": 1, "value": 2, "c": 3},
        {"xs": []},
        {"xs": [], "value": []},
        {"xs": -3.3, "value": ""},
        # Numbers
        1,
        1234,
        1.3,
        -1.3,
        32.3,
        0,
        -0,
        # Strings
        "",
        "a",
        "hello",
        "a\\b",
        "\\",
        "\\\\",
        "\\\\\"",
        "\\\"\\",
        "\"\\\\",
        "\"",
        "\"\"",
        "\"\"\\",
        "\"\\\"",
        "\\\"\"",
    ]

    for dumper, v1, v2 in itertools.product(orjson_dump_settings, values, values):
        obj = {"a": v1, "b": v2}
        json_bytes = dumper(obj)

        a_from_orjson = load_with_orjson(json_bytes, "a")
        a_from_seeker = load_with_seeker(json_bytes, "a")

        b_from_orjson = load_with_orjson(json_bytes, "b")
        b_from_seeker = load_with_seeker(json_bytes, "b")

        assert a_from_orjson == a_from_seeker
        assert b_from_orjson == b_from_seeker
def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
    """
    Return the net change in curly-brace nesting between `start` and `end`.

    Every `{` outside of a string adds one and every `}` outside of a string
    subtracts one. Braces inside strings and any byte that follows a
    backslash are ignored; square brackets are not counted. Scanning assumes
    that `start` is not inside a string.

    Args:
    - `json_bytes`: A bytes object containing valid JSON when decoded
    - `start`: Index of the first byte to inspect (inclusive)
    - `end`: Index at which to stop (exclusive). Values beyond the length of
      `json_bytes` are clamped to it, like a slice bound would be.
    """

    # Clamp so an over-long `end` counts up to the end of the data instead
    # of raising IndexError
    end = min(end, len(json_bytes))

    in_str, nesting, i = False, 0, start
    while i < end:
        byte_i = json_bytes[i]
        if byte_i == byte_codes.BACKSLASH:
            # Skip the escaped character
            i += 1
        elif byte_i == byte_codes.QUOTE:
            in_str = not in_str
        elif in_str:
            pass
        elif byte_i == byte_codes.OPEN_CURLY:
            nesting += 1
        elif byte_i == byte_codes.CLOSE_CURLY:
            nesting -= 1
        i += 1
    return nesting