diff --git a/lib/xml.ml b/lib/xml.ml
index 165b223..ac9d48b 100644
--- a/lib/xml.ml
+++ b/lib/xml.ml
@@ -41,7 +41,278 @@ let _input_tree input : t =
   let data str : t = [`Data str] in
   input_tree ~el ~data input
 
-let of_string ?entity ?enc str =
+
+(** Table mapping the names of the standard XML and HTML character
+    entities (written without the surrounding [&] and [;]) to their
+    UTF-8 encoded replacement text. *)
+let entity_to_utf8_table =
+  let h = Hashtbl.create 256 in
+  let open Hashtbl in
+  (* From https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references *)
+  add h "quot" "\"";
+  add h "amp" "&";
+  add h "apos" "'";
+  add h "lt" "<";
+  add h "gt" ">";
+  add h "nbsp" "\xC2\xA0"; (* NO-BREAK SPACE, U+00A0 *)
+  add h "iexcl" "¡";
+  add h "cent" "¢";
+  add h "pound" "£";
+  add h "curren" "¤";
+  add h "yen" "¥";
+  add h "brvbar" "¦";
+  add h "sect" "§";
+  add h "uml" "¨";
+  add h "copy" "©";
+  add h "ordf" "ª";
+  add h "laquo" "«";
+  add h "not" "¬";
+  add h "shy" "\xC2\xAD"; (* SOFT HYPHEN, U+00AD *)
+  add h "reg" "®";
+  add h "macr" "¯";
+  add h "deg" "°";
+  add h "plusmn" "±";
+  add h "sup2" "²";
+  add h "sup3" "³";
+  add h "acute" "´";
+  add h "micro" "µ";
+  add h "para" "¶";
+  add h "middot" "·";
+  add h "cedil" "¸";
+  add h "sup1" "¹";
+  add h "ordm" "º";
+  add h "raquo" "»";
+  add h "frac14" "¼";
+  add h "frac12" "½";
+  add h "frac34" "¾";
+  add h "iquest" "¿";
+  add h "Agrave" "À";
+  add h "Aacute" "Á";
+  add h "Acirc" "Â";
+  add h "Atilde" "Ã";
+  add h "Auml" "Ä";
+  add h "Aring" "Å";
+  add h "AElig" "Æ";
+  add h "Ccedil" "Ç";
+  add h "Egrave" "È";
+  add h "Eacute" "É";
+  add h "Ecirc" "Ê";
+  add h "Euml" "Ë";
+  add h "Igrave" "Ì";
+  add h "Iacute" "Í";
+  add h "Icirc" "Î";
+  add h "Iuml" "Ï";
+  add h "ETH" "Ð";
+  add h "Ntilde" "Ñ";
+  add h "Ograve" "Ò";
+  add h "Oacute" "Ó";
+  add h "Ocirc" "Ô";
+  add h "Otilde" "Õ";
+  add h "Ouml" "Ö";
+  add h "times" "×";
+  add h "Oslash" "Ø";
+  add h "Ugrave" "Ù";
+  add h "Uacute" "Ú";
+  add h "Ucirc" "Û";
+  add h "Uuml" "Ü";
+  add h "Yacute" "Ý";
+  add h "THORN" "Þ";
+  add h "szlig" "ß";
+  add h "agrave" "à";
+  add h "aacute" "á";
+  add h "acirc" "â";
+  add h "atilde" "ã";
+  add h "auml" "ä";
+  add h "aring" "å";
+  add h "aelig" "æ";
+  add h "ccedil" "ç";
+  add h "egrave" "è";
+  add h "eacute" "é";
+  add h "ecirc" "ê";
+  add h "euml" "ë";
+  add h "igrave" "ì";
+  add h "iacute" "í";
+  add h "icirc" "î";
+  add h "iuml" "ï";
+  add h "eth" "ð";
+  add h "ntilde" "ñ";
+  add h "ograve" "ò";
+  add h "oacute" "ó";
+  add h "ocirc" "ô";
+  add h "otilde" "õ";
+  add h "ouml" "ö";
+  add h "divide" "÷";
+  add h "oslash" "ø";
+  add h "ugrave" "ù";
+  add h "uacute" "ú";
+  add h "ucirc" "û";
+  add h "uuml" "ü";
+  add h "yacute" "ý";
+  add h "thorn" "þ";
+  add h "yuml" "ÿ";
+  add h "OElig" "Œ";
+  add h "oelig" "œ";
+  add h "Scaron" "Š";
+  add h "scaron" "š";
+  add h "Yuml" "Ÿ";
+  add h "fnof" "ƒ";
+  add h "circ" "ˆ";
+  add h "tilde" "˜";
+  add h "Alpha" "Α";
+  add h "Beta" "Β";
+  add h "Gamma" "Γ";
+  add h "Delta" "Δ";
+  add h "Epsilon" "Ε";
+  add h "Zeta" "Ζ";
+  add h "Eta" "Η";
+  add h "Theta" "Θ";
+  add h "Iota" "Ι";
+  add h "Kappa" "Κ";
+  add h "Lambda" "Λ";
+  add h "Mu" "Μ";
+  add h "Nu" "Ν";
+  add h "Xi" "Ξ";
+  add h "Omicron" "Ο";
+  add h "Pi" "Π";
+  add h "Rho" "Ρ";
+  add h "Sigma" "Σ";
+  add h "Tau" "Τ";
+  add h "Upsilon" "Υ";
+  add h "Phi" "Φ";
+  add h "Chi" "Χ";
+  add h "Psi" "Ψ";
+  add h "Omega" "Ω";
+  add h "alpha" "α";
+  add h "beta" "β";
+  add h "gamma" "γ";
+  add h "delta" "δ";
+  add h "epsilon" "ε";
+  add h "zeta" "ζ";
+  add h "eta" "η";
+  add h "theta" "θ";
+  add h "iota" "ι";
+  add h "kappa" "κ";
+  add h "lambda" "λ";
+  add h "mu" "μ";
+  add h "nu" "ν";
+  add h "xi" "ξ";
+  add h "omicron" "ο";
+  add h "pi" "π";
+  add h "rho" "ρ";
+  add h "sigmaf" "ς";
+  add h "sigma" "σ";
+  add h "tau" "τ";
+  add h "upsilon" "υ";
+  add h "phi" "φ";
+  add h "chi" "χ";
+  add h "psi" "ψ";
+  add h "omega" "ω";
+  add h "thetasym" "ϑ";
+  add h "upsih" "ϒ";
+  add h "piv" "ϖ";
+  add h "ensp" "\xE2\x80\x82"; (* EN SPACE, U+2002 *)
+  add h "emsp" "\xE2\x80\x83"; (* EM SPACE, U+2003 *)
+  add h "thinsp" "\xE2\x80\x89"; (* THIN SPACE, U+2009 *)
+  add h "zwnj" "\xE2\x80\x8C"; (* ZERO WIDTH NON-JOINER, U+200C *)
+  add h "zwj" "\xE2\x80\x8D"; (* ZERO WIDTH JOINER, U+200D *)
+  add h "lrm" "\xE2\x80\x8E"; (* LEFT-TO-RIGHT MARK, U+200E *)
+  add h "rlm" "\xE2\x80\x8F"; (* RIGHT-TO-LEFT MARK, U+200F *)
+  add h "ndash" "–";
+  add h "mdash" "—";
+  add h "lsquo" "‘";
+  add h "rsquo" "’";
+  add h "sbquo" "‚";
+  add h "ldquo" "“";
+  add h "rdquo" "”";
+  add h "bdquo" "„";
+  add h "dagger" "†";
+  add h "Dagger" "‡";
+  add h "bull" "•";
+  add h "hellip" "…";
+  add h "permil" "‰";
+  add h "prime" "′";
+  add h "Prime" "″";
+  add h "lsaquo" "‹";
+  add h "rsaquo" "›";
+  add h "oline" "‾";
+  add h "frasl" "⁄";
+  add h "euro" "€";
+  add h "image" "ℑ";
+  add h "weierp" "℘";
+  add h "real" "ℜ";
+  add h "trade" "™";
+  add h "alefsym" "ℵ";
+  add h "larr" "←";
+  add h "uarr" "↑";
+  add h "rarr" "→";
+  add h "darr" "↓";
+  add h "harr" "↔";
+  add h "crarr" "↵";
+  add h "lArr" "⇐";
+  add h "uArr" "⇑";
+  add h "rArr" "⇒";
+  add h "dArr" "⇓";
+  add h "hArr" "⇔";
+  add h "forall" "∀";
+  add h "part" "∂";
+  add h "exist" "∃";
+  add h "empty" "∅";
+  add h "nabla" "∇";
+  add h "isin" "∈";
+  add h "notin" "∉";
+  add h "ni" "∋";
+  add h "prod" "∏";
+  add h "sum" "∑";
+  add h "minus" "−";
+  add h "lowast" "∗";
+  add h "radic" "√";
+  add h "prop" "∝";
+  add h "infin" "∞";
+  add h "ang" "∠";
+  add h "and" "∧";
+  add h "or" "∨";
+  add h "cap" "∩";
+  add h "cup" "∪";
+  add h "int" "∫";
+  add h "there4" "∴";
+  add h "sim" "∼";
+  add h "cong" "≅";
+  add h "asymp" "≈";
+  add h "ne" "≠";
+  add h "equiv" "≡";
+  add h "le" "≤";
+  add h "ge" "≥";
+  add h "sub" "⊂";
+  add h "sup" "⊃";
+  add h "nsub" "⊄";
+  add h "sube" "⊆";
+  add h "supe" "⊇";
+  add h "oplus" "⊕";
+  add h "otimes" "⊗";
+  add h "perp" "⊥";
+  add h "sdot" "⋅";
+  add h "vellip" "⋮";
+  add h "lceil" "⌈";
+  add h "rceil" "⌉";
+  add h "lfloor" "⌊";
+  add h "rfloor" "⌋";
+  add h "lang" "〈";
+  add h "rang" "〉";
+  add h "loz" "◊";
+  add h "spades" "♠";
+  add h "clubs" "♣";
+  add h "hearts" "♥";
+  add h "diams" "♦";
+  h
+
+(** [entity_to_utf8 name] is [Some s] when [name] is bound to [s] in
+    [entity_to_utf8_table], and [None] when [name] is not bound. *)
+let entity_to_utf8 entity =
+  try Some (Hashtbl.find entity_to_utf8_table entity)
+  with Not_found -> None
+
+
+let of_string ?(entity=entity_to_utf8) ?enc str =
   (* XXX: ugly hack to manually remove the DTD *)
   let remove_dtd str =
     let xml_decl = "\n" in
diff --git a/lib/xml.mli b/lib/xml.mli
index 2f009a4..4b8ee6b 100644
--- a/lib/xml.mli
+++ b/lib/xml.mli
@@ -26,3 +26,14 @@ val of_string :
   ?entity:(string -> string option) ->
   ?enc:encoding ->
   string -> t
+(** [of_string s] returns the XML tree described by [s].
+
+    @param entity is called to resolve non predefined entity
+    references such as "&nbsp;". It must return a UTF-8 string
+    corresponding to the replacement character data. By default, the
+    {{:https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references}
+    standard entities} are recognized.
+
+    @param enc The encoding of the document. Default [None] which
+    means that one does not know the encoding. *)
+;;