adding rayon web crawler example
noahgift committed Jul 23, 2023
1 parent 715073e commit 3a6add1
Showing 3 changed files with 104 additions and 0 deletions.
10 changes: 10 additions & 0 deletions webcrawl-wikipedia-rayon/Cargo.toml
@@ -0,0 +1,10 @@
[package]
name = "webcrawl-wikipedia-rayon"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
wikipedia = "0.3.4"
rayon = "1.7.0"
13 changes: 13 additions & 0 deletions webcrawl-wikipedia-rayon/Makefile
@@ -0,0 +1,13 @@
format:
	cargo fmt --quiet

lint:
	cargo clippy --quiet

test:
	cargo test --quiet

run:
	cargo run

all: format lint test run
81 changes: 81 additions & 0 deletions webcrawl-wikipedia-rayon/src/main.rs
@@ -0,0 +1,81 @@
/*
 * The Page struct is generic over a type that implements the HttpClient trait.
 * Constructing it with just a String caused a compiler error.
 *
 * The fix was to use the Client struct from the wikipedia crate,
 * which implements HttpClient, when constructing Page.
 *
 * Key points:
 * - Page requires a type implementing HttpClient
 * - Pass in Client instead of String
 * - Client implements HttpClient, so the code compiles
 *
 * With the right type parameter, the example builds and runs.
 */

use wikipedia::http::default::Client;
use wikipedia::Page;
use wikipedia::Wikipedia;

struct ProcessedPage {
    title: String,
    data: String,
}

const PAGES: [&str; 9] = [
    "Giannis Antetokounmpo",
    "James Harden",
    "Russell Westbrook",
    "Stephen Curry",
    "Kevin Durant",
    "LeBron James",
    "Kobe Bryant",
    "Michael Jordan",
    "Shaquille O'Neal",
];

fn process_page(page: &Page<Client>) -> ProcessedPage {
    // Fetching the title and content can fail (e.g. on network errors);
    // unwrap() keeps the example short.
    let title = page.get_title().unwrap();
    let content = page.get_content().unwrap();
    ProcessedPage {
        title,
        data: content,
    }
}

// Fetches and processes each page, timing per-page work and the total run.
// Note: this version fetches and processes sequentially; rayon is only used
// below to report the configured thread count.
fn main() {
    // Start the overall timer.
    let start = std::time::Instant::now();
    let wikipedia = Wikipedia::<Client>::default();
    let pages: Vec<_> = PAGES
        .iter()
        .map(|&p| wikipedia.page_from_title(p.to_string()))
        .collect();
    let processed_pages: Vec<_> = pages.iter().map(process_page).collect();
    for page in processed_pages {
        // Time the reporting work for each page (fetching already happened
        // in process_page above).
        let start_page = std::time::Instant::now();
        println!("Title: {}", page.title);
        // Grab the first sentence of the page.
        let first_sentence = page.data.split('.').next().unwrap();
        println!("First sentence: {}", first_sentence);
        // Count the number of words in the page.
        let word_count = page.data.split_whitespace().count();
        println!("Word count: {}", word_count);
        // Print how long this page took.
        println!("Page time: {:?}", start_page.elapsed());
    }
    // Summary statistics: total time, average time per page, page count,
    // and the number of rayon worker threads available.
    println!("Total time: {:?}", start.elapsed());
    println!(
        "Average time per page: {:?}",
        start.elapsed() / PAGES.len() as u32
    );
    println!("Total number of pages: {}", PAGES.len());
    println!("Number of threads: {}", rayon::current_num_threads());
}
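
The crawler above fetches and processes pages with ordinary sequential iterators; rayon only appears in the thread-count report at the end. Below is a minimal sketch (not part of this commit) of how the same work could be spread across rayon's thread pool. The helper name `fetch_all_parallel` is made up for illustration, and each task builds its own Wikipedia client to sidestep the question of whether a shared client is thread-safe, which has not been verified here.

use rayon::prelude::*;

// Sketch only: fetch and process every title in parallel on rayon's pool.
fn fetch_all_parallel(titles: &[&str]) -> Vec<ProcessedPage> {
    titles
        .par_iter()
        .map(|&title| {
            // One client per task, so nothing shared across threads.
            let wikipedia = Wikipedia::<Client>::default();
            let page = wikipedia.page_from_title(title.to_string());
            process_page(&page)
        })
        .collect()
}

With this helper, `let processed_pages = fetch_all_parallel(&PAGES);` would replace the two sequential collect calls in main, and the per-page timer in the loop would then measure only the printing work, as it already does.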
