diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..432f863 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# The directory Mix will write compiled artifacts to. +/_build + +# If you run "mix test --cover", coverage assets end up here. +/cover + +# The directory Mix downloads your dependencies sources to. +/deps + +# Where 3rd-party dependencies like ExDoc output generated docs. +/doc + +# Ignore .fetch files in case you like to edit your project deps locally. +/.fetch + +# If the VM crashes, it generates a dump, let's ignore it too. +erl_crash.dump + +# Also ignore archive artifacts (built via "mix archive.build"). +*.ez + +/priv/native + +/native/*/target diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..f7f20dc --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2017 hansihe + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..f55543c --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +# Meeseeks_Html5ever + +Fork of [html5ever_elixir](https://github.com/hansihe/html5ever_elixir) that is more tightly coupled with [Meeseeks](https://github.com/mischov/meeseeks). + +## Installation + +The package can be installed by adding `meeseeks_html5ever` to your list of dependencies in `mix.exs`: + +```elixir +def deps do + [{:meeseeks_html5ever, "~> 0.4.0"}] +end +``` + +## License + +Licensed under either of + + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. diff --git a/config/config.exs b/config/config.exs new file mode 100644 index 0000000..abd42cb --- /dev/null +++ b/config/config.exs @@ -0,0 +1,30 @@ +# This file is responsible for configuring your application +# and its dependencies with the aid of the Mix.Config module. +use Mix.Config + +# This configuration is loaded before any dependency and is restricted +# to this project. If another project depends on this project, this +# file won't be loaded nor affect the parent project. For this reason, +# if you want to provide default values for your application for +# 3rd-party users, it should be done in your "mix.exs" file. + +# You can configure for your application as: +# +# config :meeseeks_html5ever, key: :value +# +# And access this configuration in your application as: +# +# Application.get_env(:meeseeks_html5ever, :key) +# +# Or configure a 3rd-party app: +# +# config :logger, level: :info +# + +# It is also possible to import configuration files, relative to this +# directory. For example, you can emulate configuration per environment +# by uncommenting the line below and defining dev.exs, test.exs and such. 
+# Configuration from the imported file will override the ones defined +# here (which is why it is important to import them last). +# +# import_config "#{Mix.env}.exs" diff --git a/lib/meeseeks_html5ever.ex b/lib/meeseeks_html5ever.ex new file mode 100644 index 0000000..c5b8900 --- /dev/null +++ b/lib/meeseeks_html5ever.ex @@ -0,0 +1,32 @@ +defmodule MeeseeksHtml5ever do + @moduledoc """ + Documentation for MeeseeksHtml5ever. + """ + + def parse(html) when byte_size(html) > 500 do + parse_async(html) + end + def parse(html) do + parse_sync(html) + end + + defp parse_async(html) do + MeeseeksHtml5ever.Native.parse_async(html) + receive do + {:html5ever_nif_result, :ok, result} -> + {:ok, result} + {:html5ever_nif_result, :error, err} -> + {:error, err} + end + end + + defp parse_sync(html) do + case MeeseeksHtml5ever.Native.parse_sync(html) do + {:html5ever_nif_result, :ok, result} -> + {:ok, result} + {:html5ever_nif_result, :error, err} -> + {:error, err} + end + end + +end diff --git a/lib/meeseeks_html5ever/native.ex b/lib/meeseeks_html5ever/native.ex new file mode 100644 index 0000000..a6725d0 --- /dev/null +++ b/lib/meeseeks_html5ever/native.ex @@ -0,0 +1,14 @@ +defmodule NifNotLoadedError do + defexception message: "nif not loaded" +end + +defmodule MeeseeksHtml5ever.Native do + use Rustler, otp_app: :meeseeks_html5ever, crate: "meeseeks_html5ever_nif" + + def parse_async(_binary), do: err() + def parse_sync(_binary), do: err() + + defp err() do + throw NifNotLoadedError + end +end diff --git a/mix.exs b/mix.exs new file mode 100644 index 0000000..56c29a9 --- /dev/null +++ b/mix.exs @@ -0,0 +1,67 @@ +defmodule Html5ever.Mixfile do + use Mix.Project + + def project do + [app: :meeseeks_html5ever, + version: "0.3.0", + elixir: "~> 1.4", + build_embedded: Mix.env == :prod, + start_permanent: Mix.env == :prod, + compilers: [:rustler] ++ Mix.compilers(), + rustler_crates: rustler_crates(), + deps: deps(), + description: description(), + package: package()] + end + + def 
rustler_crates do + [ + meeseeks_html5ever_nif: [ + path: "native/meeseeks_html5ever_nif", + cargo: :system, + default_features: false, + features: [], + mode: :release, + # mode: (if Mix.env == :prod, do: :release, else: :debug), + ] + ] + end + + # Configuration for the OTP application + # + # Type "mix help compile.app" for more information + def application do + # Specify extra applications you'll use from Erlang/Elixir + [extra_applications: [:logger]] + end + + # Dependencies can be Hex packages: + # + # {:my_dep, "~> 0.3.0"} + # + # Or git/path repositories: + # + # {:my_dep, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"} + # + # Type "mix help deps" for more examples and options + defp deps do + [{:rustler, "~> 0.9"}, + {:ex_doc, ">= 0.0.0", only: :dev}] + end + + defp description do + """ + Meeseeks specific NIF binding of html5ever using rustler. + """ + end + + defp package do + [ + files: ["lib", "native", "mix.exs", "README.md"], + maintainers: ["Mischov"], + licenses: ["MIT", "Apache-2.0"], + links: %{"GitHub" => "https://github.com/mischov/meeseeks_html5ever"}, + ] + end + +end diff --git a/mix.lock b/mix.lock new file mode 100644 index 0000000..8d57e85 --- /dev/null +++ b/mix.lock @@ -0,0 +1,3 @@ +%{"earmark": {:hex, :earmark, "1.1.0", "8c2bf85d725050a92042bc1edf362621004d43ca6241c756f39612084e95487f", [:mix], []}, + "ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]}, + "rustler": {:hex, :rustler, "0.9.0", "6fa87ac78f48f70aa8ecfb6e16b8af41c398989d33de41d292b5581d6a2eeb5a", [:mix], []}} diff --git a/native/meeseeks_html5ever_nif/.gitignore b/native/meeseeks_html5ever_nif/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/native/meeseeks_html5ever_nif/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/native/meeseeks_html5ever_nif/Cargo.toml b/native/meeseeks_html5ever_nif/Cargo.toml 
new file mode 100644 index 0000000..421575c --- /dev/null +++ b/native/meeseeks_html5ever_nif/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "meeseeks_html5ever_nif" +version = "0.1.0" +authors = ["Meeseeks "] + +[lib] +name = "meeseeks_html5ever_nif" +path = "src/lib.rs" +crate-type = ["dylib"] + +[dependencies] +rustler = "^0.14" +rustler_codegen = "^0.14" + +html5ever = "*" +tendril = "*" +lazy_static = "*" +scoped-pool = "*" diff --git a/native/meeseeks_html5ever_nif/src/flat_dom.rs b/native/meeseeks_html5ever_nif/src/flat_dom.rs new file mode 100644 index 0000000..0e44801 --- /dev/null +++ b/native/meeseeks_html5ever_nif/src/flat_dom.rs @@ -0,0 +1,186 @@ +use ::html5ever; +use html5ever::{ QualName, Attribute }; +use html5ever::tree_builder::interface::{ TreeSink, QuirksMode, NodeOrText }; + +use tendril::{ StrTendril, TendrilSink }; + +use std::borrow::Cow; + +#[derive(Copy, Clone, PartialEq, Debug)] +pub struct ElementHandle(usize); + +#[derive(Debug)] +struct Element { + id: usize, + name: Option, + children: Vec, + parent: Option, + last_string: bool, +} +impl Element { + fn new(id: usize) -> Self { + Element { + id: id, + name: None, + children: Vec::with_capacity(10), + parent: None, + last_string: false, + } + } + + fn handle(&self) -> ElementHandle { + ElementHandle(self.id) + } +} + +#[derive(Debug)] +enum ElementType { + Element(Element), + Text(StrTendril), +} +impl ElementType { + fn elem(&self) -> &Element { + match self { + &ElementType::Element(ref elem) => elem, + &ElementType::Text(_) => unreachable!(), + } + } + fn elem_mut(&mut self) -> &mut Element { + match self { + &mut ElementType::Element(ref mut elem) => elem, + &mut ElementType::Text(_) => unreachable!(), + } + } + fn text_mut(&mut self) -> &mut StrTendril { + match self { + &mut ElementType::Element(_) => unreachable!(), + &mut ElementType::Text(ref mut st) => st, + } + } +} + +#[derive(Debug)] +pub struct FlatSink { + elements: Vec, +} + +impl FlatSink { + + pub fn new() -> FlatSink 
{ + let mut sink = FlatSink { + elements: Vec::with_capacity(200), + }; + + // Element 0 is always root + sink.elements.push(ElementType::Element(Element::new(0))); + + sink + } + + fn elem(&self, elem: ElementHandle) -> &ElementType { + &self.elements[elem.0] + } + fn elem_mut(&mut self, elem: ElementHandle) -> &mut ElementType { + &mut self.elements[elem.0] + } + + fn new_elem(&mut self) -> &mut Element { + let idx = self.elements.len(); + self.elements.push(ElementType::Element(Element::new(idx))); + self.elements[idx].elem_mut() + } + fn new_text(&mut self, text: StrTendril) -> ElementHandle { + let idx = self.elements.len(); + self.elements.push(ElementType::Text(text)); + ElementHandle(idx) + } + + fn append_node(&mut self, parent: ElementHandle, child: ElementHandle) { + self.elem_mut(child).elem_mut().parent = Some(parent); + let elem = self.elem_mut(parent).elem_mut(); + elem.children.push(child); + elem.last_string = false; + } + + fn append_text(&mut self, parent: ElementHandle, child: StrTendril) { + if self.elem(parent).elem().last_string { + match self.elem(parent).elem().children.last() { + Some(&handle) => self.elem_mut(handle).text_mut().push_tendril(&child), + _ => unreachable!(), + } + } else { + let st = self.new_text(child); + let elem = self.elem_mut(parent).elem_mut(); + elem.children.push(st); + elem.last_string = true; + } + } + +} + +impl TreeSink for FlatSink { + type Output = u32; + type Handle = ElementHandle; + + fn finish(self) -> Self::Output { + println!("{:?}", self); + 0 + } + + // TODO: Log this or something + fn parse_error(&mut self, msg: Cow<'static, str>) {} + fn set_quirks_mode(&mut self, mode: QuirksMode) {} + + fn get_document(&mut self) -> Self::Handle { ElementHandle(0) } + fn get_template_contents(&mut self, target: Self::Handle) -> Self::Handle { + panic!("Templates not supported"); + } + + fn same_node(&self, x: Self::Handle, y: Self::Handle) -> bool { x == y } + fn elem_name(&self, target: Self::Handle) -> QualName { 
+ self.elem(target).elem().name.as_ref().map(|i| i.clone()).unwrap() + } + + fn create_element(&mut self, name: QualName, attrs: Vec) -> Self::Handle { + let elem = self.new_elem(); + elem.name = Some(name); + elem.handle() + } + + fn create_comment(&mut self, _text: StrTendril) -> Self::Handle { + let elem = self.new_elem(); + elem.handle() + } + + fn append(&mut self, parent: Self::Handle, child: NodeOrText) { + match child { + NodeOrText::AppendNode(node) => self.append_node(parent, node), + NodeOrText::AppendText(text) => self.append_text(parent, text), + }; + } + + fn append_before_sibling(&mut self, sibling: Self::Handle, new_node: NodeOrText) -> Result<(), NodeOrText> { + panic!("unsupported"); + } + + fn append_doctype_to_document(&mut self, name: StrTendril, public_id: StrTendril, system_id: StrTendril) { + println!("append_doctype_to_document"); + } + + fn add_attrs_if_missing(&mut self, target: Self::Handle, attrs: Vec) { + panic!("unsupported"); + } + + fn remove_from_parent(&mut self, target: Self::Handle) { + panic!("unsupported"); + } + + fn reparent_children(&mut self, node: Self::Handle, new_parent: Self::Handle) { + panic!("unsupported"); + } + + fn mark_script_already_started(&mut self, elem: Self::Handle) { + panic!("unsupported"); + } + +} diff --git a/native/meeseeks_html5ever_nif/src/lib.rs b/native/meeseeks_html5ever_nif/src/lib.rs new file mode 100644 index 0000000..8141acb --- /dev/null +++ b/native/meeseeks_html5ever_nif/src/lib.rs @@ -0,0 +1,261 @@ +#[macro_use] +extern crate rustler; +#[macro_use] +extern crate rustler_codegen; +#[macro_use] +extern crate lazy_static; +extern crate html5ever; +extern crate tendril; +extern crate scoped_pool; + +use std::panic; + +use rustler::{ + NifEnv, + NifTerm, + NifResult, + NifError, + NifEncoder, + NifDecoder, +}; +use rustler::types::binary::NifBinary; +use rustler::env::OwnedEnv; + +use html5ever::{ QualName }; +use html5ever::rcdom::{ RcDom, Handle, NodeEnum }; +use 
html5ever::driver::ParseOpts; +use html5ever::tokenizer::TokenizerOpts; +use html5ever::tree_builder::TreeBuilderOpts; +use html5ever::tree_builder::interface::QuirksMode; +use tendril::{ TendrilSink, StrTendril }; + +mod atoms { + rustler_atoms! { + atom html5ever_nif_result; + + atom ok; + atom error; + atom nil; + atom nif_panic; + + atom doctype; + atom comment; + + atom error_level; + atom discard_bom; + atom scripting_enabled; + atom iframe_srcdoc; + atom drop_doctype; + + atom none; + atom some; + atom all; + } +} + +#[derive(PartialEq, Eq)] +enum ErrorLevel { + None, + Some, + All, +} +impl<'a> NifDecoder<'a> for ErrorLevel { + fn decode(term: NifTerm<'a>) -> NifResult { + if atoms::none() == term { Ok(ErrorLevel::None) } + else if atoms::some() == term { Ok(ErrorLevel::Some) } + else if atoms::all() == term { Ok(ErrorLevel::All) } + else { Err(NifError::BadArg) } + } +} + +fn term_to_configs(term: NifTerm) -> NifResult { + if atoms::nil() == term { + Ok(ParseOpts::default()) + } else { + let env = term.get_env(); + + let errors: ErrorLevel = + term.map_get(atoms::error_level().to_term(env))?.decode()?; + + let discard_bom: bool = + term.map_get(atoms::discard_bom().to_term(env))?.decode()?; + let scripting_enabled: bool = + term.map_get(atoms::scripting_enabled().to_term(env))?.decode()?; + let iframe_srcdoc: bool = + term.map_get(atoms::iframe_srcdoc().to_term(env))?.decode()?; + let drop_doctype: bool = + term.map_get(atoms::drop_doctype().to_term(env))?.decode()?; + + Ok(ParseOpts { + tokenizer: TokenizerOpts { + exact_errors: errors == ErrorLevel::All, + discard_bom: discard_bom, + profile: false, + initial_state: None, + last_start_tag_name: None, + }, + tree_builder: TreeBuilderOpts { + exact_errors: errors == ErrorLevel::All, + scripting_enabled: scripting_enabled, + iframe_srcdoc: iframe_srcdoc, + drop_doctype: drop_doctype, + ignore_missing_rules: false, + quirks_mode: QuirksMode::NoQuirks, + }, + }) + } +} + +// Zero-cost wrapper types which 
make it possible to implement +// NifEncoder for these externally defined types. +// Unsure if this is a great way of doing it, but it's the way +// that produced the cleanest and least noisy code. +struct QNW<'a>(&'a QualName); +struct STW<'a>(&'a StrTendril); + +impl<'b> NifEncoder for QNW<'b> { + fn encode<'a>(&self, env: NifEnv<'a>) -> NifTerm<'a> { + let data: &str = &*self.0.local; + data.encode(env) + } +} +impl<'b> NifEncoder for STW<'b> { + fn encode<'a>(&self, env: NifEnv<'a>) -> NifTerm<'a> { + let data: &str = &*self.0; + data.encode(env) + } +} + +/// Takes a Handle from an RcDom and encodes it into a NifTerm. +/// This follows the mochiweb encoding scheme with two exceptions: +/// * A `{:doctype, name, pubid, sysid}` node. +/// * Always returns a list as its root node. +fn handle_to_term<'a>(env: NifEnv<'a>, handle: &Handle) -> NifTerm<'a> { + let node = handle.borrow(); + + // Closure so that we don't encode this when we don't need to return + // it to the user. + let children = || { + // Encodes a Vec<Handle> to a Vec<NifTerm> + let res: Vec> = + node.children.iter().map(|h| handle_to_term(env, h)).collect(); + // Encodes to erlang list term. + res.encode(env) + }; + + match node.node { + // Root document node. As far as I know, this is only located in the + // root of the DOM. + NodeEnum::Document => + children(), + + NodeEnum::Doctype(ref name, ref pubid, ref sysid) => + (atoms::doctype(), STW(name), STW(pubid), STW(sysid)).encode(env), + + NodeEnum::Text(ref text) => + STW(text).encode(env), + + NodeEnum::Comment(ref text) => + (atoms::comment(), STW(text)).encode(env), + + NodeEnum::Element(ref name, ref _elem_type, ref attributes) => { + let attribute_terms: Vec> = + attributes.iter() + .map(|a| (QNW(&a.name), STW(&a.value)).encode(env)) + .collect(); + + (QNW(name), attribute_terms, children()).encode(env) + }, + } +} + +// Thread pool for `parse_async`. +// TODO: How do we decide on pool size? +lazy_static! 
{ + static ref POOL: scoped_pool::Pool = scoped_pool::Pool::new(4); +} + +fn parse_async<'a>(env: NifEnv<'a>, args: &[NifTerm<'a>]) -> NifResult> { + let mut owned_env = OwnedEnv::new(); + + // Copies the term into the inner env. Since this term is normally a large + // binary term, copying it over should be cheap, since the binary will be + // refcounted within the BEAM. + let input_term = owned_env.save(args[0]); + + let return_pid = env.pid(); + + //let config = term_to_configs(args[1]); + + POOL.spawn(move || { + owned_env.send_and_clear(&return_pid, |inner_env| { + // This should not really be done in user code. We (Rustler project) + // need to find a better abstraction that eliminates this. + match panic::catch_unwind(|| { + let binary: NifBinary = match input_term.load(inner_env).decode() { + Ok(inner) => inner, + Err(_) => panic!("argument is not a binary"), + }; + + let sink = RcDom::default(); + + // TODO: Use Parser.from_bytes instead? + let parser = html5ever::parse_document(sink, Default::default()); + let result = parser.one( + std::str::from_utf8(binary.as_slice()).unwrap()); + + let result_term = handle_to_term(inner_env, &result.document); + (atoms::html5ever_nif_result(), atoms::ok(), result_term) + .encode(inner_env) + }) { + Ok(term) => term, + Err(err) => { + // Try to extract a panic reason and return that. If this + // fails, fail generically. + let reason = + if let Some(s) = err.downcast_ref::() { + s.encode(inner_env) + } else if let Some(&s) = err.downcast_ref::<&'static str>() { + s.encode(inner_env) + } else { + atoms::nif_panic().encode(inner_env) + }; + (atoms::html5ever_nif_result(), atoms::error(), reason) + .encode(inner_env) + }, + } + }); + }); + + Ok(atoms::ok().encode(env)) +} + +fn parse_sync<'a>(env: NifEnv<'a>, args: &[NifTerm<'a>]) -> NifResult> { + let binary: NifBinary = args[0].decode()?; + let sink = RcDom::default(); + + // TODO: Use Parser.from_bytes instead? 
+ let parser = html5ever::parse_document(sink, Default::default()); + let result = parser.one( + std::str::from_utf8(binary.as_slice()).unwrap()); + + //std::thread::sleep(std::time::Duration::from_millis(10)); + + let result_term = handle_to_term(env, &result.document); + + Ok((atoms::html5ever_nif_result(), atoms::ok(), result_term) + .encode(env)) + +} + +rustler_export_nifs!( + "Elixir.MeeseeksHtml5ever.Native", + [("parse_async", 1, parse_async), + ("parse_sync", 1, parse_sync)], + Some(on_load) +); + + +fn on_load<'a>(_env: NifEnv<'a>, _load_info: NifTerm<'a>) -> bool { + true +} diff --git a/test/meeseeks_html5ever_test.exs b/test/meeseeks_html5ever_test.exs new file mode 100644 index 0000000..4ddd031 --- /dev/null +++ b/test/meeseeks_html5ever_test.exs @@ -0,0 +1,16 @@ +defmodule MeeseeksHtml5everTest do + use ExUnit.Case + doctest MeeseeksHtml5ever + + test "parse basic html" do + html = "" + ret = {:ok, [{"html", [], [{"head", [], []}, {"body", [], []}]}]} + assert MeeseeksHtml5ever.parse(html) == ret + end + + test "unbalanced worst case" do + html = String.duplicate("
", 100) + assert match?({:ok, _}, MeeseeksHtml5ever.parse(html)) + end + +end diff --git a/test/test_helper.exs b/test/test_helper.exs new file mode 100644 index 0000000..869559e --- /dev/null +++ b/test/test_helper.exs @@ -0,0 +1 @@ +ExUnit.start()