Permalink
Browse files

http://code.google.com/p/mochiweb/issues/detail?id=47

  • Loading branch information...
1 parent 6d300db commit de900ba7216e354fe274a7959937e87072d03d00 @etrepum etrepum committed Nov 15, 2009
Showing with 42 additions and 2 deletions.
  1. +42 −2 src/mochijson2.erl
View
@@ -354,10 +354,24 @@ tokenize_string_fast(B, O) ->
case B of
<<_:O/binary, ?Q, _/binary>> ->
O;
- <<_:O/binary, C, _/binary>> when C =/= $\\ ->
+ <<_:O/binary, $\\, _/binary>> ->
+ {escape, O};
+ <<_:O/binary, C1, _/binary>> when C1 < 128 ->
tokenize_string_fast(B, 1 + O);
+ <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
+ C2 >= 128, C2 =< 191 ->
+ tokenize_string_fast(B, 2 + O);
+ <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
+ C2 >= 128, C2 =< 191,
+ C3 >= 128, C3 =< 191 ->
+ tokenize_string_fast(B, 3 + O);
+ <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
+ C2 >= 128, C2 =< 191,
+ C3 >= 128, C3 =< 191,
+ C4 >= 128, C4 =< 191 ->
+ tokenize_string_fast(B, 4 + O);
_ ->
- {escape, O}
+ throw(invalid_utf8)
end.
tokenize_string(B, S=#decoder{offset=O}, Acc) ->
@@ -556,6 +570,7 @@ test_all() ->
[1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
<<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]),
test_encoder_utf8(),
+ test_input_validation(),
test_one(e2j_test_vec(utf8), 1).
test_one([], _N) ->
@@ -621,3 +636,28 @@ test_encoder_utf8() ->
Enc = mochijson2:encoder([{utf8, true}]),
[34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] =
Enc(<<1,"\321\202\320\265\321\201\321\202">>).
+
+test_input_validation() ->
+ Good = [
+ {16#00A3, <<?Q, 16#C2, 16#A3, ?Q>>}, % pound
+ {16#20AC, <<?Q, 16#E2, 16#82, 16#AC, ?Q>>}, % euro
+ {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} % denarius
+ ],
+ lists:foreach(fun({CodePoint, UTF8}) ->
+ Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)),
+ Expect = decode(UTF8)
+ end, Good),
+
+ Bad = [
+ % 2nd, 3rd, or 4th byte of a multi-byte sequence w/o leading byte
+ <<?Q, 16#80, ?Q>>,
+ % missing continuations, last byte in each should be 80-BF
+ <<?Q, 16#C2, 16#7F, ?Q>>,
+ <<?Q, 16#E0, 16#80,16#7F, ?Q>>,
+ <<?Q, 16#F0, 16#80, 16#80, 16#7F, ?Q>>,
+ % we don't support code points > 10FFFF per RFC 3629
+ <<?Q, 16#F5, 16#80, 16#80, 16#80, ?Q>>
+ ],
+ lists:foreach(fun(X) ->
+ ok = try decode(X) catch invalid_utf8 -> ok end
+ end, Bad).

0 comments on commit de900ba

Please sign in to comment.