Permalink
Browse files

Support all HTML5 entities

  • Loading branch information...
1 parent 57f6d12 commit 38875c59213d7f41f51a54fe1c094d56e533d710 @etrepum etrepum committed Oct 15, 2011
Showing with 2,194 additions and 272 deletions.
  1. +45 −0 scripts/entities.erl
  2. +2,130 −255 src/mochiweb_charref.erl
  3. +19 −17 src/mochiweb_html.erl
View
@@ -0,0 +1,45 @@
+#!/usr/bin/env escript
+%% -*- mode: erlang -*-
+-export([main/1]).
+
+%% @doc Script used to generate mochiweb_charref.erl table.
+
+%% Escript entry point; the command-line arguments are ignored.
+%% Downloads the W3C named character reference page, parses it with
+%% mochiweb_html:parse/1, collects the entities with search/1 and prints
+%% the sorted result as Erlang clauses via print/1.
+main(_) ->
+    %% The result is not matched, so a failure to start inets (for example
+    %% because it is already running) does not abort the script here.
+    application:start(inets),
+    %% Presumably makes a locally built mochiweb_html loadable - confirm.
+    code:add_patha("ebin"),
+    Url = "http://www.w3.org/TR/html5/named-character-references.html",
+    %% Insist on a 200 response: the body of any other status would be
+    %% parsed like the real page and silently yield a table containing
+    %% nothing but the catch-all clause.
+    {ok, {{_, 200, _}, _, HTML}} = httpc:request(Url),
+    print(lists:sort(search(mochiweb_html:parse(HTML)))).
+
+%% Write one "entity(...) -> ...;" line per element of Entities using
+%% clause/1, followed by the final catch-all clause. Returns ok.
+print(Entities) ->
+    lists:foreach(fun (Entity) ->
+                          io:put_chars([clause(Entity), ";\n"])
+                  end,
+                  Entities),
+    io:put_chars(["entity(_) -> undefined.\n"]),
+    ok.
+
+%% Build the text of one entity/1 clause (without the trailing ";") as an
+%% iolist. Name is the entity name and Codepoints a non-empty list of
+%% hexadecimal code point strings. A single code point is rendered as a
+%% bare 16#... integer, several code points as a list of such integers.
+clause({Name, Codepoints}) ->
+    case Codepoints of
+        [Only] ->
+            ["entity(\"", Name, "\") -> 16#", Only];
+        [Head | Tail] ->
+            More = lists:map(fun (Hex) -> [", 16#", Hex] end, Tail),
+            ["entity(\"", Name, "\") -> [16#", Head, More, "]"]
+    end.
+
+
+%% Collect every entity row found in a parsed HTML tree. Returns a list
+%% of {Name, HexCodepoints} tuples, where both the name and each code
+%% point are binaries. Rows are prepended as they are found, so the
+%% result is in reverse document order.
+search(Tree) ->
+    search(Tree, []).
+
+%% A <tr> whose first attribute is an id starting with "entity-" is an
+%% entity row: its first cell holds the name in a <code> element and its
+%% second cell the raw code point text.
+search({<<"tr">>, [{<<"id">>, <<"entity-", _/binary>>} | _], Cells}, Found) ->
+    [NameCell, CodepointCell | _] = Cells,
+    {<<"td">>, _, [{<<"code">>, _, [NameSemi]}]} = NameCell,
+    {<<"td">>, [], [RawCodepoints]} = CodepointCell,
+    %% The name must end in ";", which is stripped here.
+    NameLen = byte_size(NameSemi) - 1,
+    <<Name:NameLen/binary, $;>> = NameSemi,
+    %% HTML5 charrefs can have more than one code point(!), hence the
+    %% global match collecting every U+XXXX occurrence.
+    {match, Hits} = re:run(RawCodepoints, "(?:\\s*U\\+)([a-fA-F0-9]+)",
+                           [{capture, all, binary}, global]),
+    [{Name, [Hex || [_Whole, Hex] <- Hits]} | Found];
+%% Any other element: visit its children from left to right.
+search({_Tag, _Attrs, Children}, Found) ->
+    lists:foldl(fun (Child, Acc) -> search(Child, Acc) end, Found, Children);
+%% Text nodes contribute nothing.
+search(<<_/binary>>, Found) ->
+    Found.
Oops, something went wrong.

0 comments on commit 38875c5

Please sign in to comment.