Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dictdatabase/byte_codes.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# See: https://www.charset.org/utf-8
BACKSLASH = 92
QUOTE = 34
OPEN_SQUARE = 91
Expand All @@ -7,3 +8,4 @@
SPACE = 32
TAB = 9
NEWLINE = 10
COMMA = 44
78 changes: 45 additions & 33 deletions dictdatabase/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,32 +52,47 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:

# See https://www.json.org/json-en.html for the JSON syntax

skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)

for i in range(index, len(json_bytes)):
if skip_next:
skip_next = False
continue
while i < len_json_bytes:
current = json_bytes[i]
# If backslash, skip the next character
if current == byte_codes.BACKSLASH:
skip_next = True
continue
if current == byte_codes.QUOTE:
i += 1
# If quote, toggle in_str
elif current == byte_codes.QUOTE:
in_str = not in_str
if in_str or current == byte_codes.SPACE:
continue
if current == byte_codes.OPEN_SQUARE:
# Possible exit point where string ends and nesting is zero
if not in_str and list_depth == 0 and dict_depth == 0:
return i + 1
# If in string, skip
elif in_str:
pass

# Invariant: Not in_str, not escaped

# Handle opening brackets
elif current == byte_codes.OPEN_SQUARE:
list_depth += 1
elif current == byte_codes.CLOSE_SQUARE:
list_depth -= 1
elif current == byte_codes.OPEN_CURLY:
dict_depth += 1
elif current == byte_codes.CLOSE_CURLY:
dict_depth -= 1
if list_depth == 0 and dict_depth == 0:
return i + 1

raise TypeError("Invalid JSON syntax")
# Handle closing brackets
elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
if current == byte_codes.CLOSE_SQUARE:
list_depth -= 1
if current == byte_codes.CLOSE_CURLY:
dict_depth -= 1
if list_depth == 0:
if dict_depth == 0:
return i + 1
if dict_depth == -1:
return i # Case: {"a": {}}
elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
# Handle commas and newline as exit points
return i
i += 1

raise TypeError("Invalid JSON")


def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
Expand All @@ -90,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
- `json_bytes`: A bytes object containing valid JSON when decoded
"""

skip_next, in_str, nesting = False, False, 0
for i in range(start, end):
if skip_next:
skip_next = False
continue
current = json_bytes[i]
if current == byte_codes.BACKSLASH:
skip_next = True
continue
if current == byte_codes.QUOTE:
in_str, nesting, i = False, 0, start
while i < end:
byte_i = json_bytes[i]
if byte_i == byte_codes.BACKSLASH:
i += 1
elif byte_i == byte_codes.QUOTE:
in_str = not in_str
if in_str or current == byte_codes.SPACE:
continue
elif current == byte_codes.OPEN_CURLY:
elif in_str:
pass
elif byte_i == byte_codes.OPEN_CURLY:
nesting += 1
elif current == byte_codes.CLOSE_CURLY:
elif byte_i == byte_codes.CLOSE_CURLY:
nesting -= 1
i += 1
return nesting


Expand Down
27 changes: 15 additions & 12 deletions tests/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):

def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
cases = [
r'{"a": "\\", "b": 2}',
r'{"a": "\\\\", "b": 2}',
r'{"a": "\\\\\"", "b": 2}',
r'{"a": "\\\"\\", "b": 2}',
r'{"a": "\"\\\\", "b": 2}',
r'{"a": "\"", "b": 2}',
r'{"a": "\"\"", "b": 2}',
r'{"a": "\"\"\\", "b": 2}',
r'{"a": "\"\\\"", "b": 2}',
r'{"a": "\\\"\"", "b": 2}',
r'{"a": "\\", "b": 0}',
r'{"a": "\\\\", "b": 1234}',
r'{"a": "\\\\\"", "b": 1234}',
r'{"a": "\\\"\\", "b": 1234}',
r'{"a": "\"\\\\", "b": 1234}',
r'{"a": "\"", "b": 1234}',
r'{"a": "\"\"", "b": 1234}',
r'{"a": "\"\"\\", "b": 1234}',
r'{"a": "\"\\\"", "b": 1234}',
r'{"a": "\\\"\"", "b": 1234}',
]

for case in cases:
with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
f.write(case)
dd = DDB.at("test_read_integrity", key="a").read()
assert dd == json.loads(case)["a"]
key_a = DDB.at("test_read_integrity", key="a").read()
key_b = DDB.at("test_read_integrity", key="b").read()
assert key_a == json.loads(case)["a"]
assert key_b == json.loads(case)["b"]




Expand Down
114 changes: 114 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import itertools
import orjson
from dictdatabase import utils, io_unsafe, byte_codes


def test_seek_index_through_value_bytes(use_test_dir):
    """Check the end index returned for hand-picked start offsets."""
    spaced = b'{"a": 1, "b": {}}'
    compact = b'{"a":1,"b":{}}'
    nested = b'{"a": 1234, "b": {"c": 2}}'

    # Each entry: (JSON bytes, index to start seeking from, expected result)
    expectations = [
        (spaced, 5, 7),
        (spaced, 6, 7),
        (compact, 5, 6),
        (spaced, 13, 16),
        (compact, 11, 13),
        (nested, 5, 10),
        (nested, 6, 10),
    ]

    for document, start, expected_end in expectations:
        assert utils.seek_index_through_value_bytes(document, start) == expected_end





def load_with_orjson(bytes, key):
    """Decode the whole JSON document with orjson and return the value at ``key``."""
    document = orjson.loads(bytes)
    return document[key]


def load_with_seeker(bytes, key):
    """Decode only the value of ``key``, locating its extent with the byte seeker.

    The key is searched for as the literal byte sequence ``"<key>":``; the
    value is taken to start right after it (skipping one optional space) and
    to end at the index returned by ``utils.seek_index_through_value_bytes``.

    Args:
        bytes: JSON document as a bytes object.
        key: Name of the key whose value should be decoded.

    Returns:
        The value decoded by ``orjson.loads`` from the located byte slice.

    Raises:
        KeyError: If ``"<key>":`` does not occur in ``bytes``.
    """
    key_bytes = f"\"{key}\":".encode()
    key_start = bytes.find(key_bytes)
    if key_start == -1:
        # find() reports "not found" as -1; without this check the helper
        # would seek from an unrelated offset instead of failing clearly.
        raise KeyError(key)
    a_val_start = key_start + len(key_bytes)
    # Skip a single space between the colon and the value, if present
    if bytes[a_val_start] == byte_codes.SPACE:
        a_val_start += 1
    a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
    return orjson.loads(bytes[a_val_start:a_val_end])


def test_seek_index_through_value_bytes_2(use_test_dir):
    """Compare seeker-based extraction with a full orjson decode.

    Every ordered pair of sample values is stored under the keys "a" and "b",
    serialized once with 2-space indentation and once without (keys sorted in
    both cases), and both keys are read back through both loaders.
    """
    # One option set per serialization layout: indented, then compact
    dump_options = (
        orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
        orjson.OPT_SORT_KEYS,
    )

    samples = [
        # Lists
        [],
        [1, 2, 3],
        ["xs", "value", "c"],
        [1, "xs", 2, "value", 3, "c"],
        [1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
        [{}, {}, {}],
        [{"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
        [{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
        # Dicts
        {},
        {"xs": 1},
        {"xs": 1, "value": 2},
        {"xs": 1, "value": 2, "c": 3},
        {"xs": []},
        {"xs": [], "value": []},
        {"xs": -3.3, "value": ""},
        # Numbers
        1,
        1234,
        1.3,
        -1.3,
        32.3,
        0,
        -0,
        # Strings
        "",
        "a",
        "hello",
        "a\\b",
        "\\",
        "\\\\",
        "\\\\\"",
        "\\\"\\",
        "\"\\\\",
        "\"",
        "\"\"",
        "\"\"\\",
        "\"\\\"",
        "\\\"\"",
    ]

    for options, first, second in itertools.product(dump_options, samples, samples):
        encoded = orjson.dumps({"a": first, "b": second}, option=options)

        # Load everything before asserting, so any loader error surfaces first
        expected_a = load_with_orjson(encoded, "a")
        actual_a = load_with_seeker(encoded, "a")
        expected_b = load_with_orjson(encoded, "b")
        actual_b = load_with_seeker(encoded, "b")

        assert expected_a == actual_a
        assert expected_b == actual_b