Skip to content

Commit

Permalink
Convert HTML/XML entities by default
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris00 committed Jan 24, 2015
1 parent 4cd8af6 commit 13bda20
Show file tree
Hide file tree
Showing 2 changed files with 278 additions and 1 deletion.
268 changes: 267 additions & 1 deletion lib/xml.ml
Expand Up @@ -41,7 +41,273 @@ let _input_tree input : t =
let data str : t = [`Data str] in
input_tree ~el ~data input

let of_string ?entity ?enc str =

(** Table mapping the name of each standard HTML/XML character entity
    (the bare name, e.g. [amp], without the leading ampersand and the
    trailing semicolon) to its replacement text encoded in UTF-8.

    The table is built once, when the module is initialised.  Every
    name is bound exactly once.  Characters that are invisible or easy
    to confuse in an editor (spaces, joiners, directional marks) are
    written with decimal byte escapes so that the exact bytes are
    unambiguous. *)
let entity_to_utf8_table =
  (* Initial size hint only: the table holds roughly 250 bindings and
     would grow as needed anyway. *)
  let h = Hashtbl.create 256 in
  let open Hashtbl in
  (* From https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references *)
  add h "quot" "\"";
  add h "amp" "&";
  add h "apos" "'";
  add h "lt" "<";
  add h "gt" ">";
  add h "nbsp" "\194\160"; (* NO-BREAK SPACE, U+00A0 *)
  add h "iexcl" "¡";
  add h "cent" "¢";
  add h "pound" "£";
  add h "curren" "¤";
  add h "yen" "¥";
  add h "brvbar" "¦";
  add h "sect" "§";
  add h "uml" "¨";
  add h "copy" "©";
  add h "ordf" "ª";
  add h "laquo" "«";
  add h "not" "¬";
  add h "shy" "\194\173"; (* SOFT HYPHEN, U+00AD *)
  add h "reg" "®";
  add h "macr" "¯";
  add h "deg" "°";
  add h "plusmn" "±";
  add h "sup2" "²";
  add h "sup3" "³";
  add h "acute" "´";
  add h "micro" "µ";
  add h "para" "¶";
  add h "middot" "·";
  add h "cedil" "¸";
  add h "sup1" "¹";
  add h "ordm" "º";
  add h "raquo" "»";
  add h "frac14" "¼";
  add h "frac12" "½";
  add h "frac34" "¾";
  add h "iquest" "¿";
  add h "Agrave" "À";
  add h "Aacute" "Á";
  add h "Acirc" "Â";
  add h "Atilde" "Ã";
  add h "Auml" "Ä";
  add h "Aring" "Å";
  add h "AElig" "Æ";
  add h "Ccedil" "Ç";
  add h "Egrave" "È";
  add h "Eacute" "É";
  add h "Ecirc" "Ê";
  add h "Euml" "Ë";
  add h "Igrave" "Ì";
  add h "Iacute" "Í";
  add h "Icirc" "Î";
  add h "Iuml" "Ï";
  add h "ETH" "Ð";
  add h "Ntilde" "Ñ";
  add h "Ograve" "Ò";
  add h "Oacute" "Ó";
  add h "Ocirc" "Ô";
  add h "Otilde" "Õ";
  add h "Ouml" "Ö";
  add h "times" "×";
  add h "Oslash" "Ø";
  add h "Ugrave" "Ù";
  add h "Uacute" "Ú";
  add h "Ucirc" "Û";
  add h "Uuml" "Ü";
  add h "Yacute" "Ý";
  add h "THORN" "Þ";
  add h "szlig" "ß";
  add h "agrave" "à";
  add h "aacute" "á";
  add h "acirc" "â";
  add h "atilde" "ã";
  add h "auml" "ä";
  add h "aring" "å";
  add h "aelig" "æ";
  add h "ccedil" "ç";
  add h "egrave" "è";
  add h "eacute" "é";
  add h "ecirc" "ê";
  add h "euml" "ë";
  add h "igrave" "ì";
  add h "iacute" "í";
  add h "icirc" "î";
  add h "iuml" "ï";
  add h "eth" "ð";
  add h "ntilde" "ñ";
  add h "ograve" "ò";
  add h "oacute" "ó";
  add h "ocirc" "ô";
  add h "otilde" "õ";
  add h "ouml" "ö";
  add h "divide" "÷";
  add h "oslash" "ø";
  add h "ugrave" "ù";
  add h "uacute" "ú";
  add h "ucirc" "û";
  add h "uuml" "ü";
  add h "yacute" "ý";
  add h "thorn" "þ";
  add h "yuml" "ÿ";
  add h "OElig" "Œ";
  add h "oelig" "œ";
  add h "Scaron" "Š";
  add h "scaron" "š";
  add h "Yuml" "Ÿ";
  add h "fnof" "ƒ";
  add h "circ" "ˆ";
  add h "tilde" "˜";
  add h "Alpha" "Α";
  add h "Beta" "Β";
  add h "Gamma" "Γ";
  add h "Delta" "Δ";
  add h "Epsilon" "Ε";
  add h "Zeta" "Ζ";
  add h "Eta" "Η";
  add h "Theta" "Θ";
  add h "Iota" "Ι";
  add h "Kappa" "Κ";
  add h "Lambda" "Λ";
  add h "Mu" "Μ";
  add h "Nu" "Ν";
  add h "Xi" "Ξ";
  add h "Omicron" "Ο";
  add h "Pi" "Π";
  add h "Rho" "Ρ";
  add h "Sigma" "Σ";
  add h "Tau" "Τ";
  add h "Upsilon" "Υ";
  add h "Phi" "Φ";
  add h "Chi" "Χ";
  add h "Psi" "Ψ";
  add h "Omega" "Ω";
  add h "alpha" "α";
  add h "beta" "β";
  add h "gamma" "γ";
  add h "delta" "δ";
  add h "epsilon" "ε";
  add h "zeta" "ζ";
  add h "eta" "η";
  add h "theta" "θ";
  add h "iota" "ι";
  add h "kappa" "κ";
  add h "lambda" "λ";
  add h "mu" "μ";
  add h "nu" "ν";
  add h "xi" "ξ";
  add h "omicron" "ο";
  add h "pi" "π";
  add h "rho" "ρ";
  add h "sigmaf" "ς";
  add h "sigma" "σ";
  add h "tau" "τ";
  add h "upsilon" "υ";
  add h "phi" "φ";
  add h "chi" "χ";
  add h "psi" "ψ";
  add h "omega" "ω";
  add h "thetasym" "ϑ";
  add h "upsih" "ϒ";
  add h "piv" "ϖ";
  add h "ensp" "\226\128\130"; (* EN SPACE, U+2002 *)
  add h "emsp" "\226\128\131"; (* EM SPACE, U+2003 *)
  add h "thinsp" "\226\128\137"; (* THIN SPACE, U+2009 *)
  add h "zwnj" "\226\128\140"; (* ZERO WIDTH NON-JOINER, U+200C *)
  add h "zwj" "\226\128\141"; (* ZERO WIDTH JOINER, U+200D *)
  add h "lrm" "\226\128\142"; (* LEFT-TO-RIGHT MARK, U+200E *)
  add h "rlm" "\226\128\143"; (* RIGHT-TO-LEFT MARK, U+200F *)
  add h "ndash" "–";
  add h "mdash" "—";
  add h "lsquo" "‘";
  add h "rsquo" "’";
  add h "sbquo" "‚";
  add h "ldquo" "“";
  add h "rdquo" "”";
  add h "bdquo" "„";
  add h "dagger" "†";
  add h "Dagger" "‡";
  add h "bull" "•";
  add h "hellip" "…";
  add h "permil" "‰";
  add h "prime" "′";
  add h "Prime" "″";
  add h "lsaquo" "‹";
  add h "rsaquo" "›";
  add h "oline" "‾";
  add h "frasl" "⁄";
  add h "euro" "€";
  add h "image" "ℑ";
  add h "weierp" "℘";
  add h "real" "ℜ";
  add h "trade" "™";
  add h "alefsym" "ℵ";
  add h "larr" "←";
  add h "uarr" "↑";
  add h "rarr" "→";
  add h "darr" "↓";
  add h "harr" "↔";
  add h "crarr" "↵";
  add h "lArr" "⇐";
  add h "uArr" "⇑";
  add h "rArr" "⇒";
  add h "dArr" "⇓";
  add h "hArr" "⇔";
  add h "forall" "∀";
  add h "part" "∂";
  add h "exist" "∃";
  add h "empty" "∅";
  add h "nabla" "∇";
  add h "isin" "∈";
  add h "notin" "∉";
  add h "ni" "∋";
  add h "prod" "∏";
  add h "sum" "∑";
  add h "minus" "−";
  add h "lowast" "∗";
  add h "radic" "√";
  add h "prop" "∝";
  add h "infin" "∞";
  add h "ang" "∠";
  add h "and" "∧";
  add h "or" "∨";
  add h "cap" "∩";
  add h "cup" "∪";
  add h "int" "∫";
  add h "there4" "∴";
  add h "sim" "∼";
  add h "cong" "≅";
  add h "asymp" "≈";
  add h "ne" "≠";
  add h "equiv" "≡";
  add h "le" "≤";
  add h "ge" "≥";
  add h "sub" "⊂";
  add h "sup" "⊃";
  add h "nsub" "⊄";
  add h "sube" "⊆";
  add h "supe" "⊇";
  add h "oplus" "⊕";
  add h "otimes" "⊗";
  add h "perp" "⊥";
  add h "sdot" "⋅";
  add h "vellip" "⋮";
  add h "lceil" "⌈";
  add h "rceil" "⌉";
  add h "lfloor" "⌊";
  add h "rfloor" "⌋";
  add h "lang" "⟨"; (* U+27E8 as in HTML5; HTML 4 used U+2329 *)
  add h "rang" "⟩"; (* U+27E9 as in HTML5; HTML 4 used U+232A *)
  add h "loz" "◊";
  add h "spades" "♠";
  add h "clubs" "♣";
  add h "hearts" "♥";
  add h "diams" "♦";
  h

(** [entity_to_utf8 entity] looks [entity] up in
    {!entity_to_utf8_table} and returns [Some s], where [s] is the
    UTF-8 replacement text bound to that name, or [None] when the name
    is not in the table.  The name is used as-is as the lookup key
    (the table keys are bare names such as [amp]). *)
let entity_to_utf8 entity =
  (* Only the absence of a binding means an unknown entity; any other
     exception is left to propagate instead of being silently turned
     into [None]. *)
  try Some (Hashtbl.find entity_to_utf8_table entity)
  with Not_found -> None


let of_string ?(entity=entity_to_utf8) ?enc str =
(* XXX: ugly hack to manually remove the DTD *)
let remove_dtd str =
let xml_decl = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" in
Expand Down
11 changes: 11 additions & 0 deletions lib/xml.mli
Expand Up @@ -26,3 +26,14 @@ val of_string :
?entity:(string -> string option) ->
?enc:encoding ->
string -> t
(** [of_string s] returns the XML tree described by [s].
@param entity is called to resolve non-predefined entity
references such as "&nbsp;". It must return a UTF-8 string
corresponding to the replacement character data. By default, the
{{:https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references}
standard entities} are recognized.
@param enc The encoding of the document. Default [None] which
means that one does not know the encoding. *)
;;

0 comments on commit 13bda20

Please sign in to comment.