diff --git a/src/util.lua b/src/util.lua index e10c21fac..b4537d6d6 100644 --- a/src/util.lua +++ b/src/util.lua @@ -358,15 +358,15 @@ end function FixUTF8(s, repl) local p, len, invalid = 1, #s, {} while p <= len do - if p == s:find("[%z\1-\127]", p) then p = p + 1 - elseif p == s:find("[\194-\223][\128-\191]", p) then p = p + 2 - elseif p == s:find( "\224[\160-\191][\128-\191]", p) - or p == s:find("[\225-\236][\128-\191][\128-\191]", p) - or p == s:find( "\237[\128-\159][\128-\191]", p) - or p == s:find("[\238-\239][\128-\191][\128-\191]", p) then p = p + 3 - elseif p == s:find( "\240[\144-\191][\128-\191][\128-\191]", p) - or p == s:find("[\241-\243][\128-\191][\128-\191][\128-\191]", p) - or p == s:find( "\244[\128-\143][\128-\191][\128-\191]", p) then p = p + 4 + if s:find("^[%z\1-\127]", p) then p = p + 1 + elseif s:find("^[\194-\223][\128-\191]", p) then p = p + 2 + elseif s:find( "^\224[\160-\191][\128-\191]", p) + or s:find("^[\225-\236][\128-\191][\128-\191]", p) + or s:find( "^\237[\128-\159][\128-\191]", p) + or s:find("^[\238-\239][\128-\191][\128-\191]", p) then p = p + 3 + elseif s:find( "^\240[\144-\191][\128-\191][\128-\191]", p) + or s:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", p) + or s:find( "^\244[\128-\143][\128-\191][\128-\191]", p) then p = p + 4 else local repl = type(repl) == 'function' and repl(s:sub(p,p)) or repl s = s:sub(1, p-1)..repl..s:sub(p+1) diff --git a/t/9-misc.lua b/t/9-misc.lua index 3bd00fc01..b8d484269 100644 --- a/t/9-misc.lua +++ b/t/9-misc.lua @@ -1,3 +1,18 @@ +local fixed, invalid = FixUTF8("+\128\129\130+\194\127+", "+") +is(fixed, "++++++\127+", "Invalid UTF8 is fixed (1/2).") +is(#invalid, 4, "Invalid UTF8 is fixed (2/2).") + +local UTF8s = { + "ABCDE", -- 1 byte codes + "\194\160\194\161\194\162\194\163\194\164", -- 2 byte codes + "\225\160\160\225\161\161\225\162\162\225\163\163\225\164\164", -- 3 byte codes +} + +for n, code in ipairs(UTF8s) do + is(FixUTF8(code), code, ("Valid UTF8 code is left unmodified (%d/%d)."):format(n, #UTF8s)) +end + + local editor = NewFile() for _, tst in ipairs({