adding rayon web crawler example
noahgift committed Jul 23, 2023
1 parent 715073e commit 3a6add1
Showing 3 changed files with 104 additions and 0 deletions.
10 changes: 10 additions & 0 deletions webcrawl-wikipedia-rayon/Cargo.toml
@@ -0,0 +1,10 @@
[package]
name = "webcrawl-wikipedia-rayon"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
wikipedia = "0.3.4"
rayon = "1.7.0"
13 changes: 13 additions & 0 deletions webcrawl-wikipedia-rayon/Makefile
@@ -0,0 +1,13 @@
format:
	cargo fmt --quiet

lint:
	cargo clippy --quiet

test:
	cargo test --quiet

run:
	cargo run

all: format lint test run
81 changes: 81 additions & 0 deletions webcrawl-wikipedia-rayon/src/main.rs
@@ -0,0 +1,81 @@
/*
 * The Page struct is generic over a type that implements the HttpClient trait.
 * Constructing it with just a String caused a compiler error.
 *
 * The fix was to use the Client struct from the wikipedia crate,
 * which implements HttpClient, when constructing Page.
 *
 * Key points:
 * - Page requires a type implementing HttpClient
 * - Pass in Client instead of String
 * - Client implements HttpClient, so the code compiles
 *
 * With the right type parameter, the example builds and runs.
 */

use wikipedia::http::default::Client;
use wikipedia::Page;
use wikipedia::Wikipedia;

struct ProcessedPage {
    title: String,
    data: String,
}

const PAGES: [&str; 9] = [
    "Giannis Antetokounmpo",
    "James Harden",
    "Russell Westbrook",
    "Stephen Curry",
    "Kevin Durant",
    "LeBron James",
    "Kobe Bryant",
    "Michael Jordan",
    "Shaquille O'Neal",
];

fn process_page(page: &Page<Client>) -> ProcessedPage {
    // Fetching the title and content can fail (e.g. on network errors);
    // unwrap() keeps the example short.
    let title = page.get_title().unwrap();
    let content = page.get_content().unwrap();
    ProcessedPage {
        title,
        data: content,
    }
}

// Fetches and processes each page, timing per-page work and the total run.
// Note: this version fetches and processes sequentially; rayon is only used
// below to report the configured thread count.
fn main() {
    // Start the overall timer.
    let start = std::time::Instant::now();
    let wikipedia = Wikipedia::<Client>::default();
    let pages: Vec<_> = PAGES
        .iter()
        .map(|&p| wikipedia.page_from_title(p.to_string()))
        .collect();
    let processed_pages: Vec<_> = pages.iter().map(process_page).collect();
    for page in processed_pages {
        // Time the reporting work for each page (fetching already happened
        // in process_page above).
        let start_page = std::time::Instant::now();
        println!("Title: {}", page.title);
        // Grab the first sentence of the page.
        let first_sentence = page.data.split('.').next().unwrap();
        println!("First sentence: {}", first_sentence);
        // Count the number of words in the page.
        let word_count = page.data.split_whitespace().count();
        println!("Word count: {}", word_count);
        // Print how long this page took.
        println!("Page time: {:?}", start_page.elapsed());
    }
    // Summary statistics: total time, average time per page, page count,
    // and the number of rayon worker threads available.
    println!("Total time: {:?}", start.elapsed());
    println!(
        "Average time per page: {:?}",
        start.elapsed() / PAGES.len() as u32
    );
    println!("Total number of pages: {}", PAGES.len());
    println!("Number of threads: {}", rayon::current_num_threads());
}
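
The crawler above fetches and processes pages with ordinary sequential iterators; rayon only appears in the thread-count report at the end. Below is a minimal sketch (not part of this commit) of how the same work could be spread across rayon's thread pool. The helper name `fetch_all_parallel` is made up for illustration, and each task builds its own Wikipedia client to sidestep the question of whether a shared client is thread-safe, which has not been verified here.

use rayon::prelude::*;

// Sketch only: fetch and process every title in parallel on rayon's pool.
fn fetch_all_parallel(titles: &[&str]) -> Vec<ProcessedPage> {
    titles
        .par_iter()
        .map(|&title| {
            // One client per task, so nothing shared across threads.
            let wikipedia = Wikipedia::<Client>::default();
            let page = wikipedia.page_from_title(title.to_string());
            process_page(&page)
        })
        .collect()
}

With this helper, `let processed_pages = fetch_all_parallel(&PAGES);` would replace the two sequential collect calls in main, and the per-page timer in the loop would then measure only the printing work, as it already does.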
