From 00fc600facf120f062360c52952d9ff05c9334a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= <kroeker.marcel@gmail.com>
Date: Sun, 20 Nov 2022 13:50:03 +0100
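Subject: [PATCH 1/2] Fix truncated partial read of multi-digit numbers

seek_index_through_value_bytes returned right after the first byte of
an unquoted value, so a partial read of key "b" in {"a": "x", "b": 1234}
only covered the first digit. The scan now continues until it reaches a
comma or newline at nesting depth zero, or the closing brace of the
enclosing object.

Add tests/test_utils.py, which compares the seeker against orjson for
combinations of lists, dicts, numbers and strings.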
---
 dictdatabase/byte_codes.py |   5 ++
 dictdatabase/utils.py      |  55 +++++++++++------
 tests/test_read.py         |  27 +++++----
 tests/test_utils.py        | 121 +++++++++++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 29 deletions(-)
 create mode 100644 tests/test_utils.py

diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py
index d5a85e4..5b8c4f5 100644
--- a/dictdatabase/byte_codes.py
+++ b/dictdatabase/byte_codes.py
@@ -7,3 +7,8 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
+DIGIT_0 = 48
+DIGIT_9 = 57
+DOT = 46
+MINUS = 45
+COMMA = 44
diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py
index 55de637..b8ee84f 100644
--- a/dictdatabase/utils.py
+++ b/dictdatabase/utils.py
@@ -54,30 +54,51 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
 
 	skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
 
-	for i in range(index, len(json_bytes)):
-		if skip_next:
-			skip_next = False
-			continue
+
+	i = index
+	while i < len(json_bytes):
 		current = json_bytes[i]
+
+		# If backslash, skip the next character
 		if current == byte_codes.BACKSLASH:
-			skip_next = True
-			continue
-		if current == byte_codes.QUOTE:
+			i += 1
+		# If quote, toggle in_str
+		elif current == byte_codes.QUOTE:
+			# Possible exit point where string ends and nesting is zero
 			in_str = not in_str
-		if in_str or current == byte_codes.SPACE:
-			continue
-		if current == byte_codes.OPEN_SQUARE:
+			if not in_str and list_depth == 0 and dict_depth == 0:
+				return i + 1
+		# If in string, skip
+		elif in_str:
+			pass
+
+		# Invariant: Not in_str, not escaped
+
+		# Handle opening brackets
+		elif current == byte_codes.OPEN_SQUARE:
 			list_depth += 1
-		elif current == byte_codes.CLOSE_SQUARE:
-			list_depth -= 1
 		elif current == byte_codes.OPEN_CURLY:
 			dict_depth += 1
-		elif current == byte_codes.CLOSE_CURLY:
-			dict_depth -= 1
-		if list_depth == 0 and dict_depth == 0:
-			return i + 1
 
-	raise TypeError("Invalid JSON syntax")
+		# Handle closing brackets
+		elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
+			if current == byte_codes.CLOSE_SQUARE:
+				list_depth -= 1
+			if current == byte_codes.CLOSE_CURLY:
+				dict_depth -= 1
+			if list_depth == 0 and dict_depth == 0:
+				return i + 1
+			if list_depth == 0 and dict_depth == -1:
+				return i
+
+		elif current == byte_codes.COMMA or current == byte_codes.NEWLINE:
+			if list_depth == 0 and dict_depth == 0:
+				return i
+		elif list_depth == 0 and dict_depth == -1:
+			return i
+		i += 1
+
+	raise TypeError("Invalid JSON")
 
 
 def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
diff --git a/tests/test_read.py b/tests/test_read.py
index 9c6cda3..0038c9a 100644
--- a/tests/test_read.py
+++ b/tests/test_read.py
@@ -30,23 +30,26 @@ def test_invalid_params(use_test_dir, use_compression, use_orjson, indent):
 
 def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
 	cases = [
-		r'{"a": "\\", "b": 2}',
-		r'{"a": "\\\\", "b": 2}',
-		r'{"a": "\\\\\"", "b": 2}',
-		r'{"a": "\\\"\\", "b": 2}',
-		r'{"a": "\"\\\\", "b": 2}',
-		r'{"a": "\"", "b": 2}',
-		r'{"a": "\"\"", "b": 2}',
-		r'{"a": "\"\"\\", "b": 2}',
-		r'{"a": "\"\\\"", "b": 2}',
-		r'{"a": "\\\"\"", "b": 2}',
+		r'{"a": "\\", "b": 0}',
+		r'{"a": "\\\\", "b": 1234}',
+		r'{"a": "\\\\\"", "b": 1234}',
+		r'{"a": "\\\"\\", "b": 1234}',
+		r'{"a": "\"\\\\", "b": 1234}',
+		r'{"a": "\"", "b": 1234}',
+		r'{"a": "\"\"", "b": 1234}',
+		r'{"a": "\"\"\\", "b": 1234}',
+		r'{"a": "\"\\\"", "b": 1234}',
+		r'{"a": "\\\"\"", "b": 1234}',
 	]
 
 	for case in cases:
 		with open(f"{DDB.config.storage_directory}/test_read_integrity.json", "w") as f:
 			f.write(case)
-		dd = DDB.at("test_read_integrity", key="a").read()
-		assert dd == json.loads(case)["a"]
+		key_a = DDB.at("test_read_integrity", key="a").read()
+		key_b = DDB.at("test_read_integrity", key="b").read()
+		assert key_a == json.loads(case)["a"]
+		# assert key_b == json.loads(case)["b"]
+
 
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..cc31853
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,121 @@
+import itertools
+import orjson
+from dictdatabase import utils, io_unsafe, byte_codes
+
+
+def test_seek_index_through_value_bytes(use_test_dir):
+	v = b'{"a": 1, "b": {}}'
+	vc = b'{"a":1,"b":{}}'
+
+	assert utils.seek_index_through_value_bytes(v, 5) == 7
+	assert utils.seek_index_through_value_bytes(v, 6) == 7
+	assert utils.seek_index_through_value_bytes(vc, 5) == 6
+
+	assert utils.seek_index_through_value_bytes(v, 13) == 16
+	assert utils.seek_index_through_value_bytes(vc, 11) == 13
+
+
+	n = b'{"a": 1234, "b": {"c": 2}}'
+	assert utils.seek_index_through_value_bytes(n, 5) == 10
+	assert utils.seek_index_through_value_bytes(n, 6) == 10
+
+
+test_seek_index_through_value_bytes(0)
+
+
+
+
+
+def load_with_orjson(bytes, key):
+	# print("load with orjson", bytes)
+	return orjson.loads(bytes)[key]
+
+
+def load_with_seeker(bytes, key):
+	key_bytes = f"\"{key}\":".encode()
+	a_val_start = bytes.find(key_bytes) + len(key_bytes)
+	if bytes[a_val_start] == byte_codes.SPACE:
+		a_val_start += 1
+	a_val_end = utils.seek_index_through_value_bytes(bytes, a_val_start)
+	return orjson.loads(bytes[a_val_start:a_val_end])
+
+
+def test_seek_index_through_value_bytes_2(use_test_dir):
+
+
+	def orjson_dump_with_indent(data):
+		return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
+
+	def orjson_dump_without_indent(data):
+		return orjson.dumps(data, option=orjson.OPT_SORT_KEYS)
+
+	orjson_dump_settings = [orjson_dump_with_indent, orjson_dump_without_indent]
+
+	values = [
+		# Lists
+		[],
+		[1, 2, 3],
+		["xs", "value", "c"],
+		[1, "xs", 2, "value", 3, "c"],
+		[1, "xs", 2, "value", 3, "c", [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+		[{}, {}, {}],
+		[{"xs": 1}, {"value": 2}, {"c": 3}],
+		[{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}],
+		[{"xs": 1}, {"value": 2}, {"c": 3}, {"xs": 1}, {"value": 2}, {"c": 3}, [1, 2, 3], [1, 2, 3], [1, 2, 3]],
+		# Dicts
+		{},
+		{"xs": 1},
+		{"xs": 1, "value": 2},
+		{"xs": 1, "value": 2, "c": 3},
+		{"xs": []},
+		{"xs": [], "value": []},
+		{"xs": -3.3, "value": ""},
+		# Numbers
+		1,
+		1234,
+		1.3,
+		-1.3,
+		32.3,
+		0,
+		-0,
+		# Strings
+		"",
+		"a",
+		"hello",
+		"a\\b",
+		"\\",
+		"\\\\",
+		"\\\\\"",
+		"\\\"\\",
+		"\"\\\\",
+		"\"",
+		"\"\"",
+		"\"\"\\",
+		"\"\\\"",
+		"\\\"\"",
+	]
+
+	for dumper, v1, v2 in itertools.product(orjson_dump_settings, values, values):
+
+		obj = {"a": v1, "b": v2}
+
+		json_bytes = dumper(obj)
+
+
+		a_from_orjson = load_with_orjson(json_bytes, "a")
+		a_from_seeker = load_with_seeker(json_bytes, "a")
+
+		b_from_orjson = load_with_orjson(json_bytes, "b")
+		b_from_seeker = load_with_seeker(json_bytes, "b")
+
+		# print("obj", obj)
+		# print("a_from_orjson", a_from_orjson)
+		# print("a_from_seeker", a_from_seeker)
+		assert a_from_orjson == a_from_seeker
+		# print("b_from_orjson", b_from_orjson)
+		# print("b_from_seeker", b_from_seeker)
+		assert b_from_orjson == b_from_seeker
+
+
+
+test_seek_index_through_value_bytes_2(0)

From e8075658978ff3134232fe813ae6c67ab92a2354 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= <kroeker.marcel@gmail.com>
Date: Sun, 20 Nov 2022 14:16:30 +0100
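Subject: [PATCH 2/2] Clean up byte-seeking helpers and their tests

- Remove the DIGIT_0, DIGIT_9, DOT and MINUS byte codes added in the
  previous patch.
- Condense the exit conditions in seek_index_through_value_bytes.
- Rewrite count_nesting_in_bytes as a while loop without the skip_next
  flag.
- Enable the key "b" assertion in test_read_integrity.
- Drop the module-level test invocations from tests/test_utils.py.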
---
 dictdatabase/byte_codes.py |  5 +---
 dictdatabase/utils.py      | 51 ++++++++++++++++----------------------
 tests/test_read.py         |  2 +-
 tests/test_utils.py        |  7 ------
 4 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py
index 5b8c4f5..5f50482 100644
--- a/dictdatabase/byte_codes.py
+++ b/dictdatabase/byte_codes.py
@@ -1,3 +1,4 @@
+# See: https://www.charset.org/utf-8
 BACKSLASH = 92
 QUOTE = 34
 OPEN_SQUARE = 91
@@ -7,8 +8,4 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
-DIGIT_0 = 48
-DIGIT_9 = 57
-DOT = 46
-MINUS = 45
 COMMA = 44
diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py
index b8ee84f..052c3cf 100644
--- a/dictdatabase/utils.py
+++ b/dictdatabase/utils.py
@@ -52,20 +52,17 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
 
 	# See https://www.json.org/json-en.html for the JSON syntax
 
-	skip_next, in_str, list_depth, dict_depth = False, False, 0, 0
+	in_str, list_depth, dict_depth, i, len_json_bytes = False, 0, 0, index, len(json_bytes)
 
-
-	i = index
-	while i < len(json_bytes):
+	while i < len_json_bytes:
 		current = json_bytes[i]
-
 		# If backslash, skip the next character
 		if current == byte_codes.BACKSLASH:
 			i += 1
 		# If quote, toggle in_str
 		elif current == byte_codes.QUOTE:
-			# Possible exit point where string ends and nesting is zero
 			in_str = not in_str
+			# Possible exit point where string ends and nesting is zero
 			if not in_str and list_depth == 0 and dict_depth == 0:
 				return i + 1
 		# If in string, skip
@@ -79,22 +76,19 @@ def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
 			list_depth += 1
 		elif current == byte_codes.OPEN_CURLY:
 			dict_depth += 1
-
 		# Handle closing brackets
 		elif current in [byte_codes.CLOSE_SQUARE, byte_codes.CLOSE_CURLY]:
 			if current == byte_codes.CLOSE_SQUARE:
 				list_depth -= 1
 			if current == byte_codes.CLOSE_CURLY:
 				dict_depth -= 1
-			if list_depth == 0 and dict_depth == 0:
-				return i + 1
-			if list_depth == 0 and dict_depth == -1:
-				return i
-
-		elif current == byte_codes.COMMA or current == byte_codes.NEWLINE:
-			if list_depth == 0 and dict_depth == 0:
-				return i
-		elif list_depth == 0 and dict_depth == -1:
+			if list_depth == 0:
+				if dict_depth == 0:
+					return i + 1
+				if dict_depth == -1:
+					return i  # Case: {"a": {}}
+		elif list_depth == 0 and ((dict_depth == 0 and current in [byte_codes.COMMA, byte_codes.NEWLINE]) or dict_depth == -1):
+			# Handle commas and newline as exit points
 			return i
 		i += 1
 
@@ -111,23 +105,20 @@ def count_nesting_in_bytes(json_bytes: bytes, start: int, end: int) -> int:
 	- `json_bytes`: A bytes object containing valid JSON when decoded
 	"""
 
-	skip_next, in_str, nesting = False, False, 0
-	for i in range(start, end):
-		if skip_next:
-			skip_next = False
-			continue
-		current = json_bytes[i]
-		if current == byte_codes.BACKSLASH:
-			skip_next = True
-			continue
-		if current == byte_codes.QUOTE:
+	in_str, nesting, i = False, 0, start
+	while i < end:
+		byte_i = json_bytes[i]
+		if byte_i == byte_codes.BACKSLASH:
+			i += 1
+		elif byte_i == byte_codes.QUOTE:
 			in_str = not in_str
-		if in_str or current == byte_codes.SPACE:
-			continue
-		elif current == byte_codes.OPEN_CURLY:
+		elif in_str:
+			pass
+		elif byte_i == byte_codes.OPEN_CURLY:
 			nesting += 1
-		elif current == byte_codes.CLOSE_CURLY:
+		elif byte_i == byte_codes.CLOSE_CURLY:
 			nesting -= 1
+		i += 1
 	return nesting
 
 
diff --git a/tests/test_read.py b/tests/test_read.py
index 0038c9a..4219af1 100644
--- a/tests/test_read.py
+++ b/tests/test_read.py
@@ -48,7 +48,7 @@ def test_read_integrity(use_test_dir, use_compression, use_orjson, indent):
 		key_a = DDB.at("test_read_integrity", key="a").read()
 		key_b = DDB.at("test_read_integrity", key="b").read()
 		assert key_a == json.loads(case)["a"]
-		# assert key_b == json.loads(case)["b"]
+		assert key_b == json.loads(case)["b"]
 
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cc31853..b8cfddf 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -20,9 +20,6 @@ def test_seek_index_through_value_bytes(use_test_dir):
 	assert utils.seek_index_through_value_bytes(n, 6) == 10
 
 
-test_seek_index_through_value_bytes(0)
-
-
 
 
 
@@ -115,7 +112,3 @@ def orjson_dump_without_indent(data):
 		# print("b_from_orjson", b_from_orjson)
 		# print("b_from_seeker", b_from_seeker)
 		assert b_from_orjson == b_from_seeker
-
-
-
-test_seek_index_through_value_bytes_2(0)