Skip to content

Commit

Permalink
Optimize is_utf8
Browse files Browse the repository at this point in the history
Manually unroll the multibyte loops and optimize for
single-byte chars.
  • Loading branch information
glinscott committed Jul 11, 2013
1 parent 1796373 commit 5aee5a1
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
24 changes: 16 additions & 8 deletions src/libstd/str.rs
Expand Up @@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
let mut i = 0u;
let total = v.len();
while i < total {
let mut chsize = utf8_char_width(v[i]);
if chsize == 0u { return false; }
if i + chsize > total { return false; }
i += 1u;
while chsize > 1u {
if v[i] & 192u8 != TAG_CONT_U8 { return false; }
if v[i] < 128u8 {
i += 1u;
chsize -= 1u;
} else {
let w = utf8_char_width(v[i]);
if w == 0u { return false; }

let nexti = i + w;
if nexti > total { return false; }

if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
if w > 2 {
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
}

i = nexti;
}
}
return true;
true
}

/// Determines if a vector of `u16` contains valid UTF-16
Expand Down
11 changes: 11 additions & 0 deletions src/test/run-pass/utf8_chars.rs
Expand Up @@ -27,9 +27,20 @@ pub fn main() {
assert!(s.char_at(1u) == 'é');

assert!((str::is_utf8(s.as_bytes())));
// invalid prefix
assert!((!str::is_utf8(~[0x80_u8])));
// invalid 2 byte prefix
assert!((!str::is_utf8(~[0xc0_u8])));
assert!((!str::is_utf8(~[0xc0_u8, 0x10_u8])));
// invalid 3 byte prefix
assert!((!str::is_utf8(~[0xe0_u8])));
assert!((!str::is_utf8(~[0xe0_u8, 0x10_u8])));
assert!((!str::is_utf8(~[0xe0_u8, 0xff_u8, 0x10_u8])));
// invalid 4 byte prefix
assert!((!str::is_utf8(~[0xf0_u8])));
assert!((!str::is_utf8(~[0xf0_u8, 0x10_u8])));
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0x10_u8])));
assert!((!str::is_utf8(~[0xf0_u8, 0xff_u8, 0xff_u8, 0x10_u8])));

let mut stack = ~"a×c€";
assert_eq!(stack.pop_char(), '€');
Expand Down

0 comments on commit 5aee5a1

Please sign in to comment.