From 768a5fcd01da5eaa398a415676a4c82f8052eae3 Mon Sep 17 00:00:00 2001 From: Nordine Bittich Date: Sat, 27 Jan 2024 11:55:37 +0100 Subject: [PATCH] more tests, add configurable well known prefix --- Cargo.lock | 29 +---- Cargo.toml | 2 +- README.md | 59 +++++++-- index.html | 92 +++++++++++--- lib-rdfa/examples/earl_html5/example0035.html | 15 +++ lib-rdfa/examples/earl_html5/example0035.ttl | 1 + lib-rdfa/examples/earl_html5/example0037.html | 18 +++ lib-rdfa/examples/earl_html5/example0037.ttl | 5 + lib-rdfa/examples/earl_html5/example0039.html | 15 +++ lib-rdfa/examples/earl_html5/example0039.ttl | 1 + lib-rdfa/src/lib.rs | 13 +- lib-rdfa/src/structs.rs | 116 +++++++++++------- lib-rdfa/src/tests/earl_html5.rs | 3 + lib-rdfa/src/tests/mod.rs | 6 +- rdfa-wasm/src/lib.rs | 12 +- 15 files changed, 277 insertions(+), 110 deletions(-) create mode 100644 lib-rdfa/examples/earl_html5/example0035.html create mode 100644 lib-rdfa/examples/earl_html5/example0035.ttl create mode 100644 lib-rdfa/examples/earl_html5/example0037.html create mode 100644 lib-rdfa/examples/earl_html5/example0037.ttl create mode 100644 lib-rdfa/examples/earl_html5/example0039.html create mode 100644 lib-rdfa/examples/earl_html5/example0039.ttl diff --git a/Cargo.lock b/Cargo.lock index e54565b..8b7d6fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -388,7 +388,7 @@ version = "0.1.2" dependencies = [ "ego-tree", "env_logger", - "itertools 0.12.0", + "itertools", "lazy_static", "log", "regex", @@ -400,22 +400,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "graph-rdfa-processor" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a40adbf551dc0d0f9f246290b93b00a4745e08c3cc4f42025e41cd8d923d5187" -dependencies = [ - "ego-tree", - "itertools 0.11.0", - "lazy_static", - "log", - "regex", - "scraper", - "url", - "uuid", -] - [[package]] name = "hashbrown" version = "0.14.3" @@ -475,15 +459,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name 
= "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.12.0" @@ -785,7 +760,7 @@ name = "rdfa-wasm" version = "0.1.2" dependencies = [ "getrandom", - "graph-rdfa-processor 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "graph-rdfa-processor", "wasm-bindgen", ] diff --git a/Cargo.toml b/Cargo.toml index 8a72bf4..2958e71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ test-case = "3.3.1" env_logger = "0.11.1" tortank = "0.20.0" wasm-bindgen = "0.2.90" -graph-rdfa-processor = "0.1.2" +graph-rdfa-processor = { path = "./lib-rdfa" } [profile.release] opt-level = 'z' # Optimize for size. diff --git a/README.md b/README.md index 6e4143a..f377b31 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,58 @@ -# graph-rdfa-processor +# RDFa processor library (WIP) -### RDFa processor library (WIP) +Rust and wasm library to extract rdf statements (in n-triples format) from an html file +based on rdfa annotations. -- covers [RDFa 1.1 Primer - Third Edition](https://www.w3.org/TR/rdfa-primer/). -- used [RDFa/Play](https://rdfa.info/play/) for comparing. -- WIP: +## Examples + +```rust +use graph_rdfa_processor::RdfaGraph; + let html = r#" +
+

Dan Brickley

+
+ "#; + let base = "http://example.com"; + let well_known_prefix = Some("http://example.org/.well_known"); + + RdfaGraph::parse_str(html, base, well_known_prefix).unwrap() + +``` + +```js + + +``` + +- covers: + + - [RDFa 1.1 Primer - Third Edition](https://www.w3.org/TR/rdfa-primer/) - [RDFa Core](https://www.w3.org/TR/rdfa-core/) - [Earl-Reports](https://rdfa.info/earl-reports/#RDFa-rdfa1.1-tests-for-html5) +- used [RDFa/Play](https://rdfa.info/play/) for comparing. - [Demo](https://nbittich.github.io/graph-rdfa-processor/) - ### WIP - - The plan is to make as much tests as possible, and fix the bugs along the way. +### WIP - Once enough tests have been implemented, a full refactoring will be needed before - doing anything else. +The plan is to make as many tests as possible, and fix the bugs along the way. - ### KNOWN ISSUES +### KNOWN ISSUES - - White space not preserved. See test 0329. +- White space not preserved. See test 0329. diff --git a/index.html b/index.html index 8a93622..528f1fd 100644 --- a/index.html +++ b/index.html @@ -17,28 +17,64 @@ form.classList.remove("d-none"); const text_area = document.querySelector("#html"); - text_area.value = ` - - - Test 0083 - -
-

Ivan Herman

-

mailto:ivan@w3.org

-

Dan Brickley

-
- - -`; + text_area.value = ` + + + + Test 0083 + + +
+

Ivan Herman

+

mailto:ivan@w3.org

+

Dan Brickley

+
+ + + `; form.addEventListener("submit", (e) => { e.preventDefault(); const data = new FormData(e.target); - let res = html_to_rdfa(data.get("html"), data.get("base")); + let res = html_to_rdfa( + data.get("html") || "", + data.get("base") || "", + data.get("wellKnownPrefix") || "", + ); const out = document.querySelector("pre"); out.innerText = res; }); + const issueLink = document.querySelector("#issueLink"); + issueLink.onclick = (e) => { + e.preventDefault(); + const a = document.createElement("a"); + let params = new URLSearchParams(); + const data = new FormData(form); + params.append("title", "RDFa processing bug"); + + params.append( + "body", + ` +### Base: + + \`${data.get("base") || ""}\` + +### Well Known Prefix: + + \`${data.get("base") || ""}\` + +### Html: + + \`\`\`html + ${data.get("html") || ""} + \`\`\` + + `, + ); + a.href = `https://github.com/nbittich/graph-rdfa-processor/issues/new?${params.toString()}`; + a.target = "_blank"; + a.click(); + }; } run(); @@ -48,11 +84,11 @@

RDFa Playground

This is a poc of an rdfa library written in rust. If you find any bug, - please open an issue. + please open an issue.


-
-
+
+
@@ -63,6 +99,20 @@

RDFa Playground

value="http://example.com" />
+
+
+ +
+
+ +
+ Well known prefix for + Skolemisation +
+
+
+
@@ -72,10 +122,12 @@

RDFa Playground

aria-describedby="Html content">
- +
+ +
-
+

     
diff --git a/lib-rdfa/examples/earl_html5/example0035.html b/lib-rdfa/examples/earl_html5/example0035.html new file mode 100644 index 0000000..553d5f4 --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0035.html @@ -0,0 +1,15 @@ + + + + Test 0035 + + +
+ A photo depicting Michael +
+ + \ No newline at end of file diff --git a/lib-rdfa/examples/earl_html5/example0035.ttl b/lib-rdfa/examples/earl_html5/example0035.ttl new file mode 100644 index 0000000..505a166 --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0035.ttl @@ -0,0 +1 @@ + . diff --git a/lib-rdfa/examples/earl_html5/example0037.html b/lib-rdfa/examples/earl_html5/example0037.html new file mode 100644 index 0000000..4f732cf --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0037.html @@ -0,0 +1,18 @@ + + + + + + Test 0037 + + +
+ A photo depicting Michael +
+ + \ No newline at end of file diff --git a/lib-rdfa/examples/earl_html5/example0037.ttl b/lib-rdfa/examples/earl_html5/example0037.ttl new file mode 100644 index 0000000..e7e4a0e --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0037.ttl @@ -0,0 +1,5 @@ + + + . + + diff --git a/lib-rdfa/examples/earl_html5/example0039.html b/lib-rdfa/examples/earl_html5/example0039.html new file mode 100644 index 0000000..9138b79 --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0039.html @@ -0,0 +1,15 @@ + + + + Test 0039 + + +
+ A photo depicting Michael +
+ + \ No newline at end of file diff --git a/lib-rdfa/examples/earl_html5/example0039.ttl b/lib-rdfa/examples/earl_html5/example0039.ttl new file mode 100644 index 0000000..25a4624 --- /dev/null +++ b/lib-rdfa/examples/earl_html5/example0039.ttl @@ -0,0 +1 @@ + . diff --git a/lib-rdfa/src/lib.rs b/lib-rdfa/src/lib.rs index a93dbe4..645f6d1 100644 --- a/lib-rdfa/src/lib.rs +++ b/lib-rdfa/src/lib.rs @@ -27,19 +27,28 @@ impl<'a> RdfaGraph<'a> { initial_context: Context<'a>, ) -> Result, Box> { let mut triples = vec![]; + let well_known_prefix = initial_context.well_known_prefix.clone(); traverse_element(input, None, initial_context, &mut triples, &mut vec![])?; triples = copy_pattern(triples)?; - Ok(RdfaGraph(triples.into_iter().collect())) + Ok(RdfaGraph { + statements: triples.into_iter().collect(), + well_known_prefix, + }) } // temporary thing - pub fn parse_str(html: &'a str, base: &'a str) -> Result> { + pub fn parse_str( + html: &'a str, + base: &'a str, + well_known_prefix: Option<&'a str>, + ) -> Result> { let document = scraper::Html::parse_document(html); let root = document.root_element(); let root_ctx = Context { base, + well_known_prefix, ..Default::default() }; RdfaGraph::parse(&root, root_ctx).map(|g| g.to_string()) diff --git a/lib-rdfa/src/structs.rs b/lib-rdfa/src/structs.rs index 14b5b68..61528f8 100644 --- a/lib-rdfa/src/structs.rs +++ b/lib-rdfa/src/structs.rs @@ -19,11 +19,15 @@ macro_rules! 
iri { } #[derive(Debug)] -pub struct RdfaGraph<'a>(pub HashSet>); +pub struct RdfaGraph<'a> { + pub well_known_prefix: Option<&'a str>, + pub statements: HashSet>, +} #[derive(Debug, Default, Clone)] pub struct Context<'a> { pub base: &'a str, + pub well_known_prefix: Option<&'a str>, pub vocab: Option<&'a str>, pub lang: Option<&'a str>, pub in_rel: Option>>, @@ -63,6 +67,22 @@ pub struct Statement<'a> { pub object: Node<'a>, } +impl Statement<'_> { + fn as_ntriple_string(&self, well_known_prefix: Option<&str>) -> String { + let Statement { + subject, + predicate, + object, + } = self; + format!( + r#"{} {} {}."#, + subject.as_ntriple_string(well_known_prefix), + predicate.as_ntriple_string(well_known_prefix), + object.as_ntriple_string(well_known_prefix) + ) + } +} + impl Node<'_> { pub fn is_empty(&self) -> bool { match self { @@ -78,31 +98,10 @@ impl Node<'_> { Node::RefBNode((s, _)) => s.is_empty(), } } -} - -impl PartialEq for Node<'_> { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::Iri(l0), Self::Iri(r0)) => l0 == r0, - (Self::Iri(l0), Self::TermIri(r0)) => l0 == r0, - (Self::TermIri(l0), Self::TermIri(r0)) => l0 == r0, - (Self::TermIri(l0), Self::Iri(r0)) => l0 == r0, - (Self::Literal(l0), Self::Literal(r0)) => l0 == r0, - (Self::Ref(l0), Self::Ref(r0)) => l0 == r0, - (Self::Ref(l0), rhs) => l0.as_ref() == rhs, - (lhs, Self::Ref(r0)) => lhs == r0.as_ref(), - (Self::BNode(l0), Self::BNode(r0)) => l0 == r0, - (Self::RefBNode(l0), Self::RefBNode(r0)) => l0 == r0, - _ => false, - } - } -} - -impl Display for Node<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + fn as_ntriple_string(&self, well_known_prefix: Option<&str>) -> String { match self { - Node::Iri(iri) | Node::TermIri(iri) => f.write_str(&format!("<{}>", iri)), - Node::Ref(iri) => f.write_str(&format!("{}", iri)), + Node::Iri(iri) | Node::TermIri(iri) => format!("<{}>", iri), + Node::Ref(iri) => format!("{}", iri.as_ntriple_string(well_known_prefix)), 
Node::Literal(Literal { datatype, lang, @@ -122,51 +121,82 @@ impl Display for Node<'_> { .as_ref() .filter(|dt| dt.as_ref() != &*NODE_RDF_XSD_STRING) { - s.push_str(&format!(r#"^^{datatype}"#)); + s.push_str(&format!( + r#"^^{}"#, + datatype.as_ntriple_string(well_known_prefix) + )); } else if let Some(lang) = lang { s.push_str(&format!(r#"@{lang}"#)); } - f.write_str(&s) + s } Node::BNode(id) => { // todo maybe this should use the base? - f.write_str(&format!("<{}{}>", DEFAULT_WELL_KNOWN_PREFIX, id)) + format!( + "<{}{}>", + well_known_prefix.unwrap_or(DEFAULT_WELL_KNOWN_PREFIX), + id + ) } Node::RefBNode((id, uuid)) => { if let Ok(v) = id.parse::() { if v <= BNODE_ID_GENERATOR.load(std::sync::atomic::Ordering::SeqCst) { - f.write_str(&format!("<{}{}>", DEFAULT_WELL_KNOWN_PREFIX, uuid)) + format!( + "<{}{}>", + well_known_prefix.unwrap_or(DEFAULT_WELL_KNOWN_PREFIX), + uuid + ) } else { - f.write_str(&format!("<{}{}>", DEFAULT_WELL_KNOWN_PREFIX, id)) + format!( + "<{}{}>", + well_known_prefix.unwrap_or(DEFAULT_WELL_KNOWN_PREFIX), + id + ) } } else if id.is_empty() { - f.write_str(&format!("<{}{}>", DEFAULT_WELL_KNOWN_PREFIX, uuid)) + format!( + "<{}{}>", + well_known_prefix.unwrap_or(DEFAULT_WELL_KNOWN_PREFIX), + uuid + ) } else { - f.write_str(&format!("<{}{}>", DEFAULT_WELL_KNOWN_PREFIX, id)) + format!( + "<{}{}>", + well_known_prefix.unwrap_or(DEFAULT_WELL_KNOWN_PREFIX), + id + ) } } } } } -impl Display for Statement<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let Statement { - subject, - predicate, - object, - } = self; - f.write_str(&format!(r#"{subject} {predicate} {object}."#)) +impl PartialEq for Node<'_> { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Iri(l0), Self::Iri(r0)) => l0 == r0, + (Self::Iri(l0), Self::TermIri(r0)) => l0 == r0, + (Self::TermIri(l0), Self::TermIri(r0)) => l0 == r0, + (Self::TermIri(l0), Self::Iri(r0)) => l0 == r0, + (Self::Literal(l0), Self::Literal(r0)) => l0 == r0, + 
(Self::Ref(l0), Self::Ref(r0)) => l0 == r0, + (Self::Ref(l0), rhs) => l0.as_ref() == rhs, + (lhs, Self::Ref(r0)) => lhs == r0.as_ref(), + (Self::BNode(l0), Self::BNode(r0)) => l0 == r0, + (Self::RefBNode(l0), Self::RefBNode(r0)) => l0 == r0, + _ => false, + } } } + impl Display for RdfaGraph<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str( &self - .0 + .statements .iter() - .map(Statement::to_string) - .collect::>() + .map(|s| s.as_ntriple_string(self.well_known_prefix)) + .collect::>() .join("\n"), ) } diff --git a/lib-rdfa/src/tests/earl_html5.rs b/lib-rdfa/src/tests/earl_html5.rs index 330fab4..e6b103b 100644 --- a/lib-rdfa/src/tests/earl_html5.rs +++ b/lib-rdfa/src/tests/earl_html5.rs @@ -28,8 +28,11 @@ const INPUT_OUTPUT_DIR: &str = "examples/earl_html5"; #[test_case("example0032" ; "@resource overrides @href : earl_reports_html5_0032 ")] #[test_case("example0033" ; "simple chaining test with bNode : earl_reports_html5_0033 ")] #[test_case("example0034" ; "simple img[@src] test : earl_reports_html5_0034 ")] +#[test_case("example0035" ; "@src/@href test : earl_reports_html5_0035 ")] #[test_case("example0036" ; "@src/@resource test : earl_reports_html5_0036 ")] +#[test_case("example0037" ; "@src/@resource test : earl_reports_html5_0037 ")] #[test_case("example0038" ; "@rev - img[@src] test : earl_reports_html5_0038 ")] +#[test_case("example0039" ; "@rev - img[@src] test : earl_reports_html5_0039 ")] #[test_case("example0048" ; "@typeof with @about and @rel present, no @resource : earl_reports_html5_0048 ")] #[test_case("example0049" ; "@typeof with @about, no @rel or @resource : earl_reports_html5_0049 ")] #[test_case("example0050" ; "@typeof without anything else : earl_reports_html5_0050 ")] diff --git a/lib-rdfa/src/tests/mod.rs b/lib-rdfa/src/tests/mod.rs index 39494e5..09ecd63 100644 --- a/lib-rdfa/src/tests/mod.rs +++ b/lib-rdfa/src/tests/mod.rs @@ -1,4 +1,4 @@ -use std::path::PathBuf; +use std::{ops::Add, path::PathBuf}; use 
scraper::Html; use tortank::turtle::turtle_doc::TurtleDoc; @@ -60,8 +60,8 @@ fn cmp_files(test_name: &str, input_output_dir: &str, base: &str) { Some(constants::DEFAULT_WELL_KNOWN_PREFIX.to_string()), )) .unwrap(); - let diff = ttl.difference(&graph).unwrap(); - // diff = diff.add(graph.difference(&ttl).unwrap()); + let mut diff = ttl.difference(&graph).unwrap(); + diff = diff.add(graph.difference(&ttl).unwrap()); if !diff.is_empty() { println!("============ Difference ============"); println!("{diff}"); diff --git a/rdfa-wasm/src/lib.rs b/rdfa-wasm/src/lib.rs index 3be6380..de46ef8 100644 --- a/rdfa-wasm/src/lib.rs +++ b/rdfa-wasm/src/lib.rs @@ -2,6 +2,14 @@ use graph_rdfa_processor::RdfaGraph; use wasm_bindgen::prelude::*; #[wasm_bindgen] -pub fn html_to_rdfa(html: &str, base: &str) -> String { - RdfaGraph::parse_str(html, base).unwrap() +pub fn html_to_rdfa(html: &str, base: &str, well_known_prefix: &str) -> String { + let wkp = { + let wkp = well_known_prefix.trim(); + if wkp.is_empty() { + None + } else { + Some(wkp) + } + }; + RdfaGraph::parse_str(html, base, wkp).unwrap() }