Remove images and links
See #11 for next steps

Signed-off-by: Evan Lloyd New-Schmidt <evan@new-schmidt.com>
newsch committed Jul 10, 2023
1 parent 9036e34 commit 8ec696c
Showing 6 changed files with 103 additions and 12 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
+ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
 once_cell = "1.18.0"
4 changes: 4 additions & 0 deletions README.md
@@ -35,6 +35,10 @@ As an example of usage with the map generator:
 # Transform intermediate files from generator.
 cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
 tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set log level to debug
+export RUST_LOG=om_wikiparser=debug
 # Begin extraction.
 for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
 do
5 changes: 5 additions & 0 deletions src/bin/simplify_html.rs
@@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
 use om_wikiparser::html::simplify;
 
 fn main() -> anyhow::Result<()> {
+    env_logger::Builder::new()
+        .filter_level(log::LevelFilter::Info)
+        .parse_default_env()
+        .try_init()?;
+
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
 
102 changes: 90 additions & 12 deletions src/html.rs
@@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};
 
+use ego_tree::NodeId;
 use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
 use serde::Deserialize;
@@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
             }
         }
 
-        for id in to_remove.drain(..) {
-            if let Some(mut node) = document.tree.get_mut(id) {
-                node.detach();
-            }
-        }
+        remove_ids(&mut document, to_remove.drain(..));
     } else {
         warn!("No sections to remove configured for lang {lang:?}");
     }
 
     // Remove elements with no text that isn't whitespace.
 
-    for element in document
+    for el in document
         .root_element()
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if element.text().all(|t| t.trim().is_empty()) {
-            to_remove.push(element.id());
+        if is_image(&el) || is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
         }
     }
+    remove_ids(&mut document, to_remove.drain(..));
 
-    for id in to_remove.drain(..) {
-        if let Some(mut node) = document.tree.get_mut(id) {
-            node.detach();
-        }
-    }
+    remove_links(&mut document);
 
     document.html()
 }
+
+fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
+    for id in ids {
+        if let Some(mut node) = document.tree.get_mut(id) {
+            node.detach();
+        }
+    }
+}
+
+fn is_empty_or_whitespace(el: &ElementRef) -> bool {
+    el.text().flat_map(str::chars).all(char::is_whitespace)
+}
+
+fn is_image(el: &ElementRef) -> bool {
+    ["img", "picture"].contains(&el.value().name())
+}
+
+/// Remove all links, preserving any inner elements/text.
+fn remove_links(document: &mut Html) {
+    let links: Vec<_> = document
+        .select(&Selector::parse("a").unwrap())
+        .map(|el| el.id())
+        .collect();
+
+    for id in links {
+        let Some(mut node) = document.tree.get_mut(id) else { continue };
+        if node.parent().is_none() {
+            continue;
+        }
+
+        // reparent to same location as node
+        while let Some(mut child) = node.first_child() {
+            let child_id = child.id();
+            child.detach();
+            node.insert_id_before(child_id);
+        }
+
+        node.detach();
+    }
+}

#[cfg(test)]
@@ -89,4 +121,50 @@ mod test {
     fn static_config_parses() {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
+
+    #[test]
+    fn remove_links() {
+        let html = r#"
+        <p> Some text that includes
+            <a href="Some_Page"><span id="inner-content">several</span></a>
+            <a id="second-link" href="./Another_Page">relative links</a>
+            and
+            <a href="https://example.com/page">an absolute link</a>
+            .
+        </p>
+        "#;
+
+        let anchors = Selector::parse("a").unwrap();
+        let inner_element = Selector::parse("#inner-content").unwrap();
+        let second_link = Selector::parse("#second-link").unwrap();
+
+        let mut document = Html::parse_fragment(html);
+        let links: Vec<_> = document
+            .select(&anchors)
+            .filter_map(|el| el.value().attr("href"))
+            .collect();
+
+        eprintln!("{}", document.html());
+
+        assert_eq!(
+            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
+            links,
+            "Links in original html are not expected."
+        );
+
+        // Detach one of the links from the root tree (as if previously deleted) to ensure it handles orphan nodes nicely.
+        let link = document.select(&second_link).next().unwrap().id();
+        document.tree.get_mut(link).unwrap().detach();
+
+        super::remove_links(&mut document);
+
+        let links: Vec<_> = document.select(&anchors).collect();
+
+        assert!(links.is_empty(), "All links should be removed.");
+
+        assert!(
+            document.select(&inner_element).next().is_some(),
+            "Link inner elements should be preserved."
+        );
+    }
 }
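
For context on the new `remove_links`: each child of an anchor is detached and re-inserted as a sibling directly before the anchor with `insert_id_before`, so detaching the anchor afterwards leaves its former contents in place. Below is a minimal standalone sketch of that ego_tree idiom, not part of the commit; it uses a toy string tree via the `tree!` macro instead of parsed HTML, and the node names are purely illustrative.

// Sketch only: the reparenting pattern used by `remove_links`, on a plain ego_tree.
use ego_tree::tree;

fn main() {
    // Tree shape: root -> a -> [b, c]
    let mut t = tree!("root" => { "a" => { "b", "c" } });

    let a_id = t.root().first_child().unwrap().id();
    let mut a = t.get_mut(a_id).unwrap();

    // Hoist every child of `a` so it sits directly before `a` under `root`.
    while let Some(mut child) = a.first_child() {
        let child_id = child.id();
        child.detach();
        a.insert_id_before(child_id);
    }
    // `a` is now empty and can be dropped without losing its children.
    a.detach();

    let children: Vec<_> = t.root().children().map(|n| *n.value()).collect();
    assert_eq!(children, ["b", "c"]);
}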
2 changes: 2 additions & 0 deletions src/main.rs
@@ -138,6 +138,8 @@ fn write(
 }
 
 fn main() -> anyhow::Result<()> {
+    // Use info level by default, load overrides from `RUST_LOG` env variable.
+    // See https://docs.rs/env_logger/latest/env_logger/index.html#example
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
         .parse_default_env()
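
Taken together, the `src/html.rs` changes mean `simplify` now drops image elements and unwraps links down to their text. A rough usage sketch against the library's public `simplify(html, lang)` function follows; the input fragment, the "en" language code, and the assertions are illustrative assumptions, not taken from the commit.

// Sketch only: exercising `om_wikiparser::html::simplify` after this change.
use om_wikiparser::html::simplify;

fn main() {
    let input = r#"<p>See <a href="./Foo">Foo</a> <img src="foo.jpg"></p>"#;
    // "en" is just an example language code; unknown codes only skip the
    // section-removal step and log a warning.
    let output = simplify(input, "en");

    // The link text survives, but no <a> or <img> tags should remain.
    assert!(output.contains("Foo"));
    assert!(!output.contains("<a "));
    assert!(!output.contains("<img"));
}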
