Skip to content

Commit

Permalink
[cpp] Implement UTF-8 check with the Bjoern DFA
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy C committed Jan 6, 2024
1 parent 3e66096 commit 6772c74
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 2 deletions.
9 changes: 9 additions & 0 deletions cpp/NINJA_subgraph.py
Expand Up @@ -85,8 +85,17 @@ def NinjaGraph(ru):
srcs=[
'cpp/data_lang.cc',
],
deps=[
'//mycpp/runtime',
],
)

# Test binary built from cpp/data_lang_test.cc; it depends on the
# //cpp/data_lang library.
# NOTE(review): matrix= presumably selects the compiler/variant combinations
# to build for -- confirm against ninja_lib.COMPILERS_VARIANTS.
ru.cc_binary('cpp/data_lang_test.cc',
deps=[
'//cpp/data_lang',
],
matrix=ninja_lib.COMPILERS_VARIANTS)

# Note: depends on code generated by re2c
ru.cc_library(
'//cpp/frontend_match',
Expand Down
4 changes: 4 additions & 0 deletions cpp/TEST.sh
Expand Up @@ -63,6 +63,8 @@ unit() {

run-test-in-dir cpp/core_test '' $variant # has testdata

run-one-test cpp/data_lang_test '' $variant

run-one-test cpp/qsn_test '' $variant

run-one-test cpp/frontend_flag_spec_test '' $variant
Expand Down Expand Up @@ -97,6 +99,8 @@ coverage() {

run-test-in-dir cpp/core_test $compiler $variant # has testdata

run-one-test cpp/data_lang_test $compiler $variant

run-one-test cpp/qsn_test $compiler $variant

run-one-test cpp/frontend_flag_spec_test $compiler $variant
Expand Down
18 changes: 16 additions & 2 deletions cpp/data_lang.cc
Expand Up @@ -2,11 +2,25 @@

#include "cpp/data_lang.h"

#include "data_lang/utf8_impls/bjoern_dfa.h"

namespace pyj8 {

// Reports whether the bytes s->data_[start .. end) are accepted as UTF-8 by
// the Bjoern DFA decoder.
//
// Each byte in the half-open range is fed to decode().  The function returns
// false as soon as the DFA reaches UTF8_REJECT.  After the last byte it
// returns true only if the DFA is back in UTF8_ACCEPT; any other final state
// (presumably a multi-byte sequence cut off at `end`) is reported as not
// UTF-8.  An empty range (start == end) never leaves UTF8_ACCEPT and so
// returns true.
//
// No bounds checking is done here: the caller must pass a range that lies
// within the string's data.
bool PartIsUtf8(BigStr* s, int start, int end) {
  // Written by decode(); the decoded value itself is not used here.
  uint32_t codepoint = 0;
  uint32_t state = UTF8_ACCEPT;

  for (int i = start; i < end; ++i) {
    // This var or a static_cast<> is necessary.  Should really change BigStr*
    // to use unsigned type
    uint8_t c = s->data_[i];
    decode(&state, &codepoint, c);
    if (state == UTF8_REJECT) {
      return false;
    }
  }

  return state == UTF8_ACCEPT;
}

} // namespace pyj8
33 changes: 33 additions & 0 deletions cpp/data_lang_test.cc
@@ -0,0 +1,33 @@
#include "cpp/data_lang.h"

#include "vendor/greatest.h"

// Checks pyj8::PartIsUtf8 on a pure-ASCII string, on an empty range, and on
// sub-ranges of a string whose second byte is 0xff.
TEST part_is_utf8_test() {
  BigStr* ascii = StrFromC("hi");

  // The whole ASCII string is accepted.
  ASSERT(pyj8::PartIsUtf8(ascii, 0, 2));

  // A zero-length range is trivially UTF-8.
  ASSERT(pyj8::PartIsUtf8(ascii, 0, 0));

  BigStr* with_bad_byte = StrFromC("h\xff");

  // The full string contains the 0xff byte, so it is rejected.
  ASSERT(!pyj8::PartIsUtf8(with_bad_byte, 0, len(with_bad_byte)));

  // The range covering only the leading 'h' is accepted ...
  ASSERT(pyj8::PartIsUtf8(with_bad_byte, 0, 1));
  // ... while the range covering only the 0xff byte is not.
  ASSERT(!pyj8::PartIsUtf8(with_bad_byte, 1, 2));

  PASS();
}

// Emits the definitions the greatest test framework needs; it appears once at
// file scope in this test binary.
GREATEST_MAIN_DEFS();

// Entry point of the data_lang unit test binary: runs part_is_utf8_test under
// the greatest test runner.
int main(int argc, char** argv) {
// NOTE(review): presumably sets up the runner and consumes argc/argv, which
// are not referenced anywhere else in this function -- confirm in
// vendor/greatest.h.
GREATEST_MAIN_BEGIN();

RUN_TEST(part_is_utf8_test);

// NOTE(review): this macro likely reports results and may itself return from
// main with the pass/fail status, which would make the `return 0` below a
// fallback only -- confirm in vendor/greatest.h.
GREATEST_MAIN_END();
return 0;
}

0 comments on commit 6772c74

Please sign in to comment.