Skip to content

Commit

Permalink
Optimized UTF8 validation (thanks to Joergen von Bargen).
Browse files Browse the repository at this point in the history
  • Loading branch information
pkulchenko committed Jul 14, 2015
1 parent 09f7a56 commit 789078f
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 9 deletions.
18 changes: 9 additions & 9 deletions src/util.lua
Expand Up @@ -358,15 +358,15 @@ end
function FixUTF8(s, repl)
local p, len, invalid = 1, #s, {}
while p <= len do
if p == s:find("[%z\1-\127]", p) then p = p + 1
elseif p == s:find("[\194-\223][\128-\191]", p) then p = p + 2
elseif p == s:find( "\224[\160-\191][\128-\191]", p)
or p == s:find("[\225-\236][\128-\191][\128-\191]", p)
or p == s:find( "\237[\128-\159][\128-\191]", p)
or p == s:find("[\238-\239][\128-\191][\128-\191]", p) then p = p + 3
elseif p == s:find( "\240[\144-\191][\128-\191][\128-\191]", p)
or p == s:find("[\241-\243][\128-\191][\128-\191][\128-\191]", p)
or p == s:find( "\244[\128-\143][\128-\191][\128-\191]", p) then p = p + 4
if s:find("^[%z\1-\127]", p) then p = p + 1
elseif s:find("^[\194-\223][\128-\191]", p) then p = p + 2
elseif s:find( "^\224[\160-\191][\128-\191]", p)
or s:find("^[\225-\236][\128-\191][\128-\191]", p)
or s:find( "^\237[\128-\159][\128-\191]", p)
or s:find("^[\238-\239][\128-\191][\128-\191]", p) then p = p + 3
elseif s:find( "^\240[\144-\191][\128-\191][\128-\191]", p)
or s:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", p)
or s:find( "^\244[\128-\143][\128-\191][\128-\191]", p) then p = p + 4
else
local repl = type(repl) == 'function' and repl(s:sub(p,p)) or repl
s = s:sub(1, p-1)..repl..s:sub(p+1)
Expand Down
15 changes: 15 additions & 0 deletions t/9-misc.lua
@@ -1,3 +1,18 @@
-- Invalid input: three stray continuation bytes (\128, \129, \130) and a
-- two-byte lead (\194) that is followed by an ASCII byte instead of a
-- continuation byte. Each offending byte is expected to come back as "+",
-- while the valid \127 that follows \194 is expected to be kept.
local fixed, invalid = FixUTF8("+\128\129\130+\194\127+", "+")
is(fixed, "++++++\127+", "Invalid UTF8 is fixed (1/2).")
-- The second result is expected to hold four entries, matching the four
-- bytes replaced above.
is(#invalid, 4, "Invalid UTF8 is fixed (2/2).")

-- Well-formed samples, one per encoded length exercised by this test.
local UTF8s = {
  "ABCDE", -- 1 byte codes
  "\194\160\194\161\194\162\194\163\194\164", -- 2 byte codes
  "\225\160\160\225\161\161\225\162\162\225\163\163\225\164\164", -- 3 byte codes
}

-- Every well-formed sample must come back unchanged.
for n = 1, #UTF8s do
  local code = UTF8s[n]
  local message = string.format("Valid UTF8 code is left unmodified (%d/%d).", n, #UTF8s)
  -- Parentheses keep only the first result of FixUTF8, as in an argument
  -- list where the call is not the last expression.
  is((FixUTF8(code)), code, message)
end


local editor = NewFile()

for _, tst in ipairs({
Expand Down

0 comments on commit 789078f

Please sign in to comment.