Skip to content

Commit

Permalink
Optimize character decoding in already-validated UTF-8
Browse files Browse the repository at this point in the history
The UTF-8 Everywhere scheme holds strings in memory as UTF-8 bytes that
have already been validated, so they are known to be good.  This means a
faster strategy can be used that counts the trailing bytes as it goes,
until a non-continuation byte is seen.
  • Loading branch information
hostilefork committed Aug 28, 2019
1 parent eaefdef commit 8a86454
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 2 deletions.
30 changes: 30 additions & 0 deletions src/include/datatypes/sys-char.h
Expand Up @@ -388,3 +388,33 @@ inline static const REBYTE *Back_Scan_UTF8_Char(

return bp + trail;
}


// Fast-path decoder for one multi-byte character in UTF-8 that is assumed to
// have been validated already.  Instead of working out the length up front,
// it keeps folding in bytes for as long as the *next* byte is a continuation
// byte, counting them along the way.
//
// Because it peeks at bp[1], callers are expected to handle bytes below 0x80
// themselves with a plain increment (the same split used around
// Back_Scan_UTF8_Char()); otherwise, when pointed at a `\0` terminator, it
// would go looking for continuation bytes past the end of the string.
//
// The decoded codepoint is written to `out`.  The returned pointer is at the
// LAST byte of the character (not one past it), so callers add one more.
//
inline static const REBYTE *Back_Scan_UTF8_Char_Unchecked(
    REBUNI *out,
    const REBYTE *bp
){
    REBUNI codepoint = *bp;  // lead byte seeds the accumulator
    uint_fast8_t num_trailing = 0;  // continuation bytes consumed so far

    for (; Is_Continuation_Byte_If_Utf8(bp[1]); ++num_trailing) {
        ++bp;  // step onto the continuation byte that was just peeked at
        codepoint = (codepoint << 6) + *bp;
    }
    assert(num_trailing <= 5);  // offsetsFromUTF8[] is indexed by this count

    // Bytes were added in unmasked, so the table entry for this trail count
    // (the "magic number") is subtracted to get the actual codepoint.
    //
    codepoint -= offsetsFromUTF8[num_trailing];

    assert(codepoint <= UNI_MAX_LEGAL_UTF32);
    assert(codepoint < UNI_SUR_HIGH_START or codepoint > UNI_SUR_LOW_END);

    *out = codepoint;
    return bp;
}
4 changes: 2 additions & 2 deletions src/include/datatypes/sys-string.h
Expand Up @@ -95,7 +95,7 @@
if (*bp < 0x80)
*codepoint_out = *bp;
else
bp = Back_Scan_UTF8_Char(codepoint_out, bp, NULL);
bp = Back_Scan_UTF8_Char_Unchecked(codepoint_out, bp);
return m_cast(REBYTE*, bp + 1);
}

Expand Down Expand Up @@ -195,7 +195,7 @@
if (*t < 0x80)
*out = *t;
else
t = Back_Scan_UTF8_Char(out, t, NULL);
t = Back_Scan_UTF8_Char_Unchecked(out, t);
return RebchrPtr {t + 1};
}

Expand Down

0 comments on commit 8a86454

Please sign in to comment.