From 5a70cdabb5d8edd5da4e9532a59731390e31e622 Mon Sep 17 00:00:00 2001
From: Bob Ippolito
Date: Mon, 8 Feb 2016 18:17:53 -0800
Subject: [PATCH] Support parsing UTF-16 surrogate pairs in mochiweb_html #164

---
 CHANGES.md                   |  5 +++-
 src/mochiweb_html.erl        | 45 ++++++++++++++++++++++++++----------
 test/mochiweb_html_tests.erl |  6 +++++
 3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 05bf6946..af80a198 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,8 @@
-Version 2.13.0 released XXXX-XX-XX
+Version 2.13.0 released 2016-02-08
 
+* Support parsing of UTF-16 surrogate pairs encoded as character
+  references in mochiweb_html
+  https://github.com/mochi/mochiweb/issues/164
 * Avoid swallowing messages that are not related to the socket
   during request parsing
   https://github.com/mochi/mochiweb/pull/161
diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl
index 3fd93d0f..3c5c4f91 100644
--- a/src/mochiweb_html.erl
+++ b/src/mochiweb_html.erl
@@ -639,13 +639,42 @@ find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
 
 tokenize_charref(Bin, S=#decoder{offset=O}) ->
     try
-        tokenize_charref(Bin, S, O)
+        case tokenize_charref_raw(Bin, S, O) of
+            {C1, S1} when C1 >= 16#D800 andalso C1 =< 16#DFFF ->
+                %% Surrogate pair
+                tokeninize_charref_surrogate_pair(Bin, S1, C1);
+            {Unichar, S1} when is_integer(Unichar) ->
+                {{data, mochiutf8:codepoint_to_bytes(Unichar), false}, S1};
+            {Unichars, S1} when is_list(Unichars) ->
+                {{data, unicode:characters_to_binary(Unichars), false}, S1};
+            {undefined, _} -> % unknown charref: fall back to a literal "&"
+                throw(invalid_charref)
+        end
     catch
         throw:invalid_charref ->
             {{data, <<"&">>, false}, S}
     end.
 
-tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
+tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
+    case Bin of
+        <<_:O/binary, $&, _/binary>> ->
+            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of % skip "&"
+                {C2, S1} when C1 =< 16#DBFF, C2 >= 16#DC00, C2 =< 16#DFFF ->
+                    {{data,
+                      unicode:characters_to_binary(
+                        <<C1:16, C2:16>>,
+                        utf16,
+                        utf8),
+                      false},
+                     S1};
+                _ ->
+                    throw(invalid_charref)
+            end;
+        _ ->
+            throw(invalid_charref)
+    end.
+
+tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
     case Bin of
         <<_:O/binary>> ->
             throw(invalid_charref);
@@ -658,17 +687,9 @@ tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
         <<_:O/binary, $;, _/binary>> ->
             Len = O - Start,
             <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
-            Data = case mochiweb_charref:charref(Raw) of
-                       undefined ->
-                           throw(invalid_charref);
-                       Unichar when is_integer(Unichar) ->
-                           mochiutf8:codepoint_to_bytes(Unichar);
-                       Unichars when is_list(Unichars) ->
-                           unicode:characters_to_binary(Unichars)
-                   end,
-            {{data, Data, false}, ?INC_COL(S)};
+            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
         _ ->
-            tokenize_charref(Bin, ?INC_COL(S), Start)
+            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
     end.
 
 tokenize_doctype(Bin, S) ->
diff --git a/test/mochiweb_html_tests.erl b/test/mochiweb_html_tests.erl
index 3d35400a..f67759a1 100644
--- a/test/mochiweb_html_tests.erl
+++ b/test/mochiweb_html_tests.erl
@@ -126,6 +126,12 @@ tokens_test() ->
        mochiweb_html:tokens(<<"not html < at all">>)),
     ok.
 
+surrogate_test() ->
+    %% https://github.com/mochi/mochiweb/issues/164
+    ?assertEqual(
+       [{data,<<240,159,152,138>>,false}],
+       mochiweb_html:tokens(<<"&#55357;&#56842;">>)).
+
 parse_test() ->
     D0 = <<"