Skip to content
This repository
Browse code

http://code.google.com/p/mochiweb/issues/detail?id=47

  • Loading branch information...
commit de900ba7216e354fe274a7959937e87072d03d00 1 parent 6d300db
Bob Ippolito authored November 15, 2009

Showing 1 changed file with 42 additions and 2 deletions. Show diff stats Hide diff stats

  1. 44  src/mochijson2.erl
44  src/mochijson2.erl
@@ -354,10 +354,24 @@ tokenize_string_fast(B, O) ->
354 354
     case B of
355 355
         <<_:O/binary, ?Q, _/binary>> ->
356 356
             O;
357  
-        <<_:O/binary, C, _/binary>> when C =/= $\\ ->
  357
+        <<_:O/binary, $\\, _/binary>> ->
  358
+            {escape, O};
  359
+        <<_:O/binary, C1, _/binary>> when C1 < 128 ->
358 360
             tokenize_string_fast(B, 1 + O);
  361
+        <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
  362
+                C2 >= 128, C2 =< 191 ->
  363
+            tokenize_string_fast(B, 2 + O);
  364
+        <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
  365
+                C2 >= 128, C2 =< 191,
  366
+                C3 >= 128, C3 =< 191 ->
  367
+            tokenize_string_fast(B, 3 + O);
  368
+        <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
  369
+                C2 >= 128, C2 =< 191,
  370
+                C3 >= 128, C3 =< 191,
  371
+                C4 >= 128, C4 =< 191 ->
  372
+            tokenize_string_fast(B, 4 + O);
359 373
         _ ->
360  
-            {escape, O}
  374
+            throw(invalid_utf8)
361 375
     end.
362 376
 
363 377
 tokenize_string(B, S=#decoder{offset=O}, Acc) ->
@@ -556,6 +570,7 @@ test_all() ->
556 570
     [1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
557 571
     <<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]),
558 572
     test_encoder_utf8(),
  573
+    test_input_validation(),
559 574
     test_one(e2j_test_vec(utf8), 1).
560 575
 
561 576
 test_one([], _N) ->
@@ -621,3 +636,28 @@ test_encoder_utf8() ->
621 636
     Enc = mochijson2:encoder([{utf8, true}]),
622 637
     [34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] =
623 638
         Enc(<<1,"\321\202\320\265\321\201\321\202">>).
  639
+
  640
+test_input_validation() -> % Exercises decode/1 on well-formed and malformed UTF-8 byte sequences wrapped in ?Q.
  641
+    Good = [ % {CodePoint, Input} pairs that decode/1 must accept
  642
+        {16#00A3, <<?Q, 16#C2, 16#A3, ?Q>>}, % pound (2-byte sequence)
  643
+        {16#20AC, <<?Q, 16#E2, 16#82, 16#AC, ?Q>>}, % euro (3-byte sequence)
  644
+        {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} % denarius (4-byte sequence)
  645
+    ],
  646
+    lists:foreach(fun({CodePoint, UTF8}) ->
  647
+        Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)), % reference encoding built by xmerl_ucs
  648
+        Expect = decode(UTF8) % Expect is already bound, so this match fails with badmatch on any other result
  649
+    end, Good),
  650
+    
  651
+    Bad = [ % inputs for which decode/1 is expected to throw invalid_utf8
  652
+        % 2nd, 3rd, or 4th byte of a multi-byte sequence w/o leading byte
  653
+        <<?Q, 16#80, ?Q>>,
  654
+        % missing continuations, last byte in each should be 80-BF
  655
+        <<?Q, 16#C2, 16#7F, ?Q>>,
  656
+        <<?Q, 16#E0, 16#80,16#7F, ?Q>>,
  657
+        <<?Q, 16#F0, 16#80, 16#80, 16#7F, ?Q>>,
  658
+        % we don't support code points > 10FFFF per RFC 3629
  659
+        <<?Q, 16#F5, 16#80, 16#80, 16#80, ?Q>>
  660
+    ],
  661
+    lists:foreach(fun(X) ->
  662
+        ok = try decode(X) catch invalid_utf8 -> ok end % only a thrown invalid_utf8 is caught; a normal return other than ok fails the match
  663
+    end, Bad).

0 notes on commit de900ba

Please sign in to comment.
Something went wrong with that request. Please try again.