Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

317 lines (288 sloc) 11.508 kb
%% @copyright 2010 Mochi Media, Inc.
%% @author Bob Ippolito <bob@mochimedia.com>
%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring
%% invalid bytes.
-module(mochiutf8).
-export([valid_utf8_bytes/1, codepoint_to_bytes/1, bytes_to_codepoints/1]).
-export([bytes_foldl/3, codepoint_foldl/3, read_codepoint/1, len/1]).
%% External API
-type unichar_low() :: 0..16#d7ff.
-type unichar_high() :: 16#e000..16#10ffff.
-type unichar() :: unichar_low() | unichar_high().
-spec codepoint_to_bytes(unichar()) -> binary().
%% @doc Convert a unicode codepoint to UTF-8 bytes.
codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) ->
%% U+0000 - U+007F - 7 bits
<<C>>;
codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) ->
%% U+0080 - U+07FF - 11 bits
<<0:5, B1:5, B0:6>> = <<C:16>>,
<<2#110:3, B1:5,
2#10:2, B0:6>>;
codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso
(C < 16#D800 orelse C > 16#DFFF) ->
%% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
<<B2:4, B1:6, B0:6>> = <<C:16>>,
<<2#1110:4, B2:4,
2#10:2, B1:6,
2#10:2, B0:6>>;
codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) ->
%% U+10000 - U+10FFFF - 21 bits
<<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>,
<<2#11110:5, B3:3,
2#10:2, B2:6,
2#10:2, B1:6,
2#10:2, B0:6>>.
-spec codepoints_to_bytes([unichar()]) -> binary().
%% @doc Convert a list of codepoints to a UTF-8 binary.
codepoints_to_bytes(L) ->
<<<<(codepoint_to_bytes(C))/binary>> || C <- L>>.
-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}.
read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) ->
%% U+0000 - U+007F - 7 bits
<<B:1/binary, _/binary>> = Bin,
{C, B, Rest};
read_codepoint(Bin = <<2#110:3, B1:5,
2#10:2, B0:6,
Rest/binary>>) ->
%% U+0080 - U+07FF - 11 bits
case <<B1:5, B0:6>> of
<<C:11>> when C >= 16#80 ->
<<B:2/binary, _/binary>> = Bin,
{C, B, Rest}
end;
read_codepoint(Bin = <<2#1110:4, B2:4,
2#10:2, B1:6,
2#10:2, B0:6,
Rest/binary>>) ->
%% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
case <<B2:4, B1:6, B0:6>> of
<<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso
(C < 16#D800 orelse C > 16#DFFF) ->
<<B:3/binary, _/binary>> = Bin,
{C, B, Rest}
end;
read_codepoint(Bin = <<2#11110:5, B3:3,
2#10:2, B2:6,
2#10:2, B1:6,
2#10:2, B0:6,
Rest/binary>>) ->
%% U+10000 - U+10FFFF - 21 bits
case <<B3:3, B2:6, B1:6, B0:6>> of
<<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) ->
<<B:4/binary, _/binary>> = Bin,
{C, B, Rest}
end.
-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _.
codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) ->
Acc;
codepoint_foldl(F, Acc, Bin) ->
{C, _, Rest} = read_codepoint(Bin),
codepoint_foldl(F, F(C, Acc), Rest).
-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _.
bytes_foldl(F, Acc, <<>>) when is_function(F, 2) ->
Acc;
bytes_foldl(F, Acc, Bin) ->
{_, B, Rest} = read_codepoint(Bin),
bytes_foldl(F, F(B, Acc), Rest).
-spec bytes_to_codepoints(binary()) -> [unichar()].
bytes_to_codepoints(B) ->
lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)).
-spec len(binary()) -> non_neg_integer().
len(<<>>) ->
0;
len(B) ->
{_, _, Rest} = read_codepoint(B),
1 + len(Rest).
-spec valid_utf8_bytes(B::binary()) -> binary().
%% @doc Return only the bytes in B that represent valid UTF-8. Uses
%% the following recursive algorithm: skip one byte if B does not
%% follow UTF-8 syntax (a 1-4 byte encoding of some number),
%% skip sequence of 2-4 bytes if it represents an overlong encoding
%% or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF).
valid_utf8_bytes(B) when is_binary(B) ->
binary_skip_bytes(B, invalid_utf8_indexes(B)).
%% Internal API
-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary().
%% @doc Return B, but skipping the 0-based indexes in L.
binary_skip_bytes(B, []) ->
B;
binary_skip_bytes(B, L) ->
binary_skip_bytes(B, L, 0, []).
%% @private
-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary().
binary_skip_bytes(B, [], _N, Acc) ->
iolist_to_binary(lists:reverse([B | Acc]));
binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) ->
binary_skip_bytes(RestB, RestL, 1 + N, Acc);
binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) ->
binary_skip_bytes(RestB, L, 1 + N, [C | Acc]).
-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()].
%% @doc Return the 0-based indexes in B that are not valid UTF-8.
invalid_utf8_indexes(B) ->
invalid_utf8_indexes(B, 0, []).
%% @private.
-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()].
invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 ->
%% U+0000 - U+007F - 7 bits
invalid_utf8_indexes(Rest, 1 + N, Acc);
invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc)
when C1 band 16#E0 =:= 16#C0,
C2 band 16#C0 =:= 16#80 ->
%% U+0080 - U+07FF - 11 bits
case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
C when C < 16#80 ->
%% Overlong encoding.
invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]);
_ ->
%% Upper bound U+07FF does not need to be checked
invalid_utf8_indexes(Rest, 2 + N, Acc)
end;
invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc)
when C1 band 16#F0 =:= 16#E0,
C2 band 16#C0 =:= 16#80,
C3 band 16#C0 =:= 16#80 ->
%% U+0800 - U+FFFF - 16 bits
case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
(C3 band 16#3F) of
C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) ->
%% Overlong encoding or surrogate.
invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]);
_ ->
%% Upper bound U+FFFF does not need to be checked
invalid_utf8_indexes(Rest, 3 + N, Acc)
end;
invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc)
when C1 band 16#F8 =:= 16#F0,
C2 band 16#C0 =:= 16#80,
C3 band 16#C0 =:= 16#80,
C4 band 16#C0 =:= 16#80 ->
%% U+10000 - U+10FFFF - 21 bits
case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
(C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
C when (C < 16#10000) orelse (C > 16#10FFFF) ->
%% Overlong encoding or invalid code point.
invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]);
_ ->
invalid_utf8_indexes(Rest, 4 + N, Acc)
end;
invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) ->
%% Invalid char
invalid_utf8_indexes(Rest, 1 + N, [N | Acc]);
invalid_utf8_indexes(<<>>, _N, Acc) ->
lists:reverse(Acc).
%%
%% Tests
%%
-include_lib("eunit/include/eunit.hrl").
-ifdef(TEST).
binary_skip_bytes_test() ->
?assertEqual(<<"foo">>,
binary_skip_bytes(<<"foo">>, [])),
?assertEqual(<<"foobar">>,
binary_skip_bytes(<<"foo bar">>, [3])),
?assertEqual(<<"foo">>,
binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])),
?assertEqual(<<"oo bar">>,
binary_skip_bytes(<<"foo bar">>, [0])),
ok.
invalid_utf8_indexes_test() ->
?assertEqual(
[],
invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)),
?assertEqual(
[0],
invalid_utf8_indexes(<<128>>)),
?assertEqual(
[57,59,60,64,66,67],
invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
ok.
codepoint_to_bytes_test() ->
%% U+0000 - U+007F - 7 bits
%% U+0080 - U+07FF - 11 bits
%% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
%% U+10000 - U+10FFFF - 21 bits
?assertEqual(
<<"a">>,
codepoint_to_bytes($a)),
?assertEqual(
<<16#c2, 16#80>>,
codepoint_to_bytes(16#80)),
?assertEqual(
<<16#df, 16#bf>>,
codepoint_to_bytes(16#07ff)),
?assertEqual(
<<16#ef, 16#bf, 16#bf>>,
codepoint_to_bytes(16#ffff)),
?assertEqual(
<<16#f4, 16#8f, 16#bf, 16#bf>>,
codepoint_to_bytes(16#10ffff)),
ok.
bytes_foldl_test() ->
?assertEqual(
<<"abc">>,
bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)),
?assertEqual(
<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>,
<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
ok.
bytes_to_codepoints_test() ->
?assertEqual(
"abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff],
bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
ok.
codepoint_foldl_test() ->
?assertEqual(
"cba",
codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)),
?assertEqual(
[16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"],
codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [],
<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
ok.
len_test() ->
?assertEqual(
29,
len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)),
ok.
codepoints_to_bytes_test() ->
?assertEqual(
iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))),
codepoints_to_bytes(lists:seq(1, 1000))),
ok.
valid_utf8_bytes_test() ->
?assertEqual(
<<"invalid U+11ffff: ">>,
valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)),
?assertEqual(
<<"U+10ffff: ", 244, 143, 191, 191>>,
valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)),
?assertEqual(
<<"overlong 2-byte encoding (a): ">>,
valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)),
?assertEqual(
<<"overlong 2-byte encoding (!): ">>,
valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)),
?assertEqual(
<<"mu: ", 194, 181>>,
valid_utf8_bytes(<<"mu: ", 194, 181>>)),
?assertEqual(
<<"bad coding bytes: ">>,
valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)),
?assertEqual(
<<"low surrogate (unpaired): ">>,
valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)),
?assertEqual(
<<"high surrogate (unpaired): ">>,
valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)),
?assertEqual(
<<"unicode snowman for you: ", 226, 152, 131>>,
valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)),
?assertEqual(
<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>,
valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
ok.
-endif.
Jump to Line
Something went wrong with that request. Please try again.