[test] Write exhaustive tests over the bjorn dfa and crockford utf8 decoders
oilshell · Apr 11, 2024 · d92f8e4 · d92f8e4
1 parent 991fd88
commit d92f8e4
Show file tree

Hide file tree

Showing 7 changed files with 367 additions and 11 deletions.
diff --git a/build/ninja_lib.py b/build/ninja_lib.py
@@ -429,6 +429,10 @@ def HeadersForBinary(self, main_cc):
       headers.extend(self.cc_libs[label].generated_headers)
     return headers
 
+  def run_script(self, script_path, outputs):
+    """Emit a build statement that produces `outputs` by running a script.
+
+    The statement uses the rule named "sh" with `script_path` as its only
+    explicit input, then newline() is called on the writer.
+
+    NOTE(review): this relies on an "sh" rule being declared on the same
+    writer elsewhere -- confirm at the call site.
+    """
+    writer = self.n
+    writer.build(outputs, "sh", [script_path])
+    writer.newline()
+
   def asdl_library(self, asdl_path, deps = None,
       pretty_print_methods=True):
 

diff --git a/data_lang/NINJA_subgraph.py b/data_lang/NINJA_subgraph.py
@@ -21,9 +21,18 @@ def NinjaGraph(ru):
     ru.cc_library('//data_lang/utf8_impls/utf8_decode',
                   srcs=['data_lang/utf8_impls/utf8_decode.c'])
 
+    # Generic rule: run the shell script given as the build statement's
+    # explicit input ($in).
+    n.rule(
+        'sh',
+        command='sh $in',
+        description='Run shell script passed to $in')
+    n.newline()
+    # Generate the exhaustive decoder tests. The Python generator is listed as
+    # an implicit dependency, so it is tracked for rebuilds but is not part of
+    # $in and is therefore not passed to `sh`.
+    n.build("_gen/test/utf8/decoder-exhaustive.inc", "sh", inputs=["data_lang/utf8_decoder_tests_gen.sh"], implicit=["data_lang/utf8_decoder_tests_gen.py"])
+    n.newline()
+
     ru.cc_binary(
         'data_lang/utf8_test.cc',
         deps=['//data_lang/utf8_impls/utf8_decode'],
+        implicit=["_gen/test/utf8/decoder-exhaustive.inc"],
         # Add tcmalloc for malloc_address_test
         matrix=ninja_lib.COMPILERS_VARIANTS + [('cxx', 'tcmalloc')])
 

diff --git a/data_lang/utf8_decoder_tests_gen.py b/data_lang/utf8_decoder_tests_gen.py
@@ -0,0 +1,172 @@
+# For generating the exhaustive utf8 decoder tests in data_lang/utf8_test.cc
+
+DECODERS = ["bjorn", "crockford"]
+
+
+def min_pat(pat):
+    """Return the smallest integer that matches a bit pattern.
+
+    `pat` is a string of binary digits and 'x' wildcards, such as '1010xxxx'.
+    Every 'x' is filled with a 0 bit before the string is parsed as binary.
+    """
+    lowest_bits = pat.replace('x', '0')
+    return int(lowest_bits, 2)
+
+
+def max_pat(pat):
+    """Return the largest integer that matches a bit pattern.
+
+    `pat` is a string of binary digits and 'x' wildcards, such as '1010xxxx'.
+    Every 'x' is filled with a 1 bit before the string is parsed as binary.
+    """
+    highest_bits = pat.replace('x', '1')
+    return int(highest_bits, 2)
+
+
+def test_patterns_fail(patterns):
+    """Generate C test code asserting that byte patterns fail to decode.
+
+    `patterns` is an iterable of strings. Each string is a whitespace-separated
+    sequence of up to four byte templates, such as
+    '1110_0000 100x_xxxx 10xx_xxxx'; underscores are ignored. Blank patterns
+    are skipped.
+
+    For every pattern the returned C source contains one nested `for` loop per
+    byte, running from min_pat(byte) to max_pat(byte) inclusive. The innermost
+    body passes the bytes, followed by a 0 byte, to DECODER_utf8_next and
+    asserts that the result is UTF8_DECODE_ERROR.
+
+    Raises ValueError if a pattern has more bytes than there are loop
+    variables.
+    """
+    iter_vars = 'xyzw'
+
+    parts = []
+    for pattern in patterns:
+        # Extract the byte templates from the pattern
+        byte_pats = pattern.strip().replace('_', '').split()
+        if not byte_pats:
+            # A blank pattern would emit `bytes[] = {, 0}`, which is not
+            # valid C.
+            continue
+
+        n = len(byte_pats)
+        if n > len(iter_vars):
+            raise ValueError('pattern %r has more than %d bytes' %
+                             (pattern, len(iter_vars)))
+
+        # Assume the patterns are continuous ranges (eg. 1001xxxx, not 10xx00xx)
+        for iter_var, byte_pat in zip(iter_vars, byte_pats):
+            lo = min_pat(byte_pat)
+            hi = max_pat(byte_pat) + 1  # exclusive upper bound
+            # The counter needs to be >= 16 bits: the exclusive bound can be
+            # 256, which a uint8_t could never reach.
+            parts.append('for (uint16_t %s = %d; %s < %d; ++%s) {' %
+                         (iter_var, lo, iter_var, hi, iter_var))
+
+        byte_args = ', '.join('(uint8_t)' + v for v in iter_vars[:n])
+        parts.append('''uint8_t bytes[] = {%s, 0};
+            const uint8_t* ptr = bytes;
+            uint32_t codepoint = 0;
+            utf8_decode_result result = DECODER_utf8_next(&ptr, &codepoint);
+
+            ASSERT_ENUM_EQ(result, UTF8_DECODE_ERROR, utf8_decode_result_str);
+            ''' % byte_args)
+
+        # \n to add an extra newline between for loops
+        parts.extend(['}\n'] * n)
+
+    return "\n".join(parts)
+
+
+# Overlongs: sequences whose payload bits would fit in a shorter encoding
+# (at most 0x7F in two bytes, 0x7FF in three, 0xFFFF in four).
+patterns = """
+  1100_000x 10xx_xxxx
+  1110_0000 100x_xxxx 10xx_xxxx
+  1111_0000 1000_xxxx 10xx_xxxx 10xx_xxxx
+"""
+# Drop the empty lines and trim the indentation from the rest.
+patterns = map(str.strip, filter(None, patterns.splitlines()))
+overlongs_test = test_patterns_fail(patterns)
+
+# Too Large: four-byte sequences whose payload is above U+10FFFF.
+# Lead byte 1111_0100 with a second byte of 1001_xxxx or 101x_xxxx covers
+# 0x110000..0x13FFFF; lead bytes 1111_0101..1111_0111 cover 0x140000 and up.
+patterns = """
+  1111_0100 1001_xxxx 10xx_xxxx 10xx_xxxx
+  1111_0100 101x_xxxx 10xx_xxxx 10xx_xxxx
+  1111_0101 10xx_xxxx 10xx_xxxx 10xx_xxxx
+  1111_0110 10xx_xxxx 10xx_xxxx 10xx_xxxx
+  1111_0111 10xx_xxxx 10xx_xxxx 10xx_xxxx
+"""
+# Skip blank lines (including whitespace-only ones) and trim the rest.
+patterns = (line.strip() for line in patterns.splitlines() if line.strip())
+too_large_test = test_patterns_fail(patterns)
+
+# Bad bit distributions: a continuation byte (10xx_xxxx) in lead position, a
+# lead byte followed by a non-continuation byte (0xxx_xxxx or 11xx_xxxx) at
+# each possible position, and lead bytes of the form 1111_1xxx.
+patterns = """
+  10xx_xxxx
+
+  110x_xxxx 0xxx_xxxx
+  110x_xxxx 11xx_xxxx
+
+  1110_xxxx 0xxx_xxxx
+  1110_xxxx 11xx_xxxx
+  1110_xxxx 10xx_xxxx 0xxx_xxxx
+  1110_xxxx 10xx_xxxx 11xx_xxxx
+
+  1111_0xxx 0xxx_xxxx
+  1111_0xxx 11xx_xxxx
+  1111_0xxx 10xx_xxxx 0xxx_xxxx
+  1111_0xxx 10xx_xxxx 11xx_xxxx
+  1111_0xxx 10xx_xxxx 10xx_xxxx 0xxx_xxxx
+  1111_0xxx 10xx_xxxx 10xx_xxxx 11xx_xxxx
+
+  1111_1xxx
+"""
+# Drop the empty lines and trim the indentation from the rest.
+patterns = map(str.strip, filter(None, patterns.splitlines()))
+bad_bits_test = test_patterns_fail(patterns)
+
+# C source for one decoder's test suite. The three %s slots receive the
+# generated overlong, too-large and bad-bit loops; "DECODER" is a placeholder
+# that is replaced with each decoder's name below.
+TEST_TEMPLATE = '''TEST DECODER_utf8_decoder_identity() {
+      // Check that forall unicode scalar values x, decode(encode(x)) = x
+
+      uint8_t bytes[5] = {0};
+      for (uint32_t codepoint = 0; codepoint <= 0x10FFFF; ++codepoint) {
+        // Skip surrogates
+        if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+          continue;
+        }
+
+        int length = utf8proc_encode_char(codepoint, bytes);
+        ASSERT(length != 0);
+        ASSERT(length < 5);
+        bytes[length] = 0;
+
+        const uint8_t* ptr = bytes;
+        uint32_t decoded = 0;
+        utf8_decode_result result = DECODER_utf8_next(&ptr, &decoded);
+        ASSERT_ENUM_EQ(result, UTF8_DECODE_OK, utf8_decode_result_str);
+        ASSERT_EQ(decoded, codepoint);
+        ASSERT_EQ(ptr - bytes, length);
+      }
+
+      PASS();
+    }
+
+    TEST DECODER_utf8_decoder_surrogates() {
+      // Check that decode(surrogate_sequence) => ERROR
+
+      uint8_t bytes[5] = {0};
+      for (uint32_t codepoint = 0xD800; codepoint <= 0xDFFF; ++codepoint) {
+        int length = utf8proc_encode_char(codepoint, bytes);
+        ASSERT(length != 0);
+        ASSERT(length < 5);
+        bytes[length] = 0;
+
+        const uint8_t* ptr = bytes;
+        uint32_t decoded = 0;
+        utf8_decode_result result = DECODER_utf8_next(&ptr, &decoded);
+        ASSERT_ENUM_EQ(result, UTF8_DECODE_ERROR, utf8_decode_result_str);
+      }
+
+      PASS();
+    }
+
+    TEST DECODER_utf8_decoder_overlong() {
+      // Check that all overlong encodings are rejected
+
+      %s
+
+      PASS();
+    }
+
+    TEST DECODER_utf8_decoder_too_large() {
+      // Check that all encodings of values over U+10FFFF are rejected
+
+      %s
+
+      PASS();
+    }
+
+    TEST DECODER_utf8_decoder_bad_bit_distribution() {
+      // Handle all cases of bad bit distribution.
+
+      %s
+
+      PASS();
+    }
+    '''
+
+# The generated loops are the same for every decoder, so fill them in once and
+# only substitute the decoder name per suite.
+filled_template = TEST_TEMPLATE % (overlongs_test, too_large_test, bad_bits_test)
+all_tests = [filled_template.replace("DECODER", name) for name in DECODERS]
+
+print("\n".join(all_tests))
diff --git a/data_lang/utf8_decoder_tests_gen.sh b/data_lang/utf8_decoder_tests_gen.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+#
+# Generate the exhaustive UTF-8 decoder tests and format them with
+# clang-format. All paths are relative to the current directory.
+
+set -e
+
+out=_gen/test/utf8/decoder-exhaustive.inc
+
+mkdir -p _gen/test/utf8
+
+# Process substitution, <(...), is not POSIX sh syntax, so go through a
+# temporary file instead. With set -e, a failing generator stops the script
+# before the output file is written.
+python3 data_lang/utf8_decoder_tests_gen.py >"$out.tmp"
+clang-format <"$out.tmp" >"$out"
+rm -f "$out.tmp"
diff --git a/data_lang/utf8_impls/utf8_decode.c b/data_lang/utf8_impls/utf8_decode.c
@@ -56,7 +56,7 @@ static int  the_index = 0;
 static int  the_length = 0;
 static int  the_char = 0;
 static int  the_byte = 0;
-static char* the_input;
+static const char* the_input;
 
 
 /*
@@ -88,7 +88,7 @@ static int cont() {
 /*
     Initialize the UTF-8 decoder. The decoder is not reentrant,
 */
-void utf8_decode_init(char p[], int length) {
+void utf8_decode_init(const char p[], int length) {
     the_index = 0;
     the_input = p;
     the_length = length;
@@ -104,6 +104,12 @@ int utf8_decode_at_byte() {
     return the_byte;
 }
 
+/*
+    Get the current index: the index of the byte that will be read next.
+    utf8_decode_init resets it to 0.
+*/
+int utf8_decode_at_index() {
+    return the_index;
+}
 
 /*
     Get the current character offset. This is generally used in error reporting.

diff --git a/data_lang/utf8_impls/utf8_decode.h b/data_lang/utf8_impls/utf8_decode.h
@@ -4,6 +4,7 @@
 #define UTF8_ERROR -2
 
 extern int  utf8_decode_at_byte();
+extern int  utf8_decode_at_index();
 extern int  utf8_decode_at_character();
-extern void utf8_decode_init(char p[], int length);
+extern void utf8_decode_init(const char p[], int length);
 extern int  utf8_decode_next();