
Set crawler multi-threaded

madeindjs committed Feb 8, 2018
1 parent ff81169 commit 5f20d73651530a83b4a7a68fbb588c458e098fbf
Showing with 40 additions and 30 deletions.
  1. +2 −2 Cargo.toml
  2. +2 −2 README.md
  3. +11 −16 src/page.rs
  4. +25 −10 src/website.rs
Cargo.toml
@@ -1,8 +1,8 @@
 [package]
 name = "spider"
-version = "1.0.3"
+version = "1.1.0"
 authors = ["madeindjs <contact@rousseau-alexandre.fr>"]
-description = "Web spider framework that can spider a domain and collect pages it visits."
+description = "Multithreaded Web spider crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
 readme = "README.md"
 keywords = ["crawler", "spider"]
README.md
@@ -2,7 +2,7 @@
 ![crate version](https://img.shields.io/crates/v/spider.svg)
-Web spider framework that can spider a domain and collect pages it visits.
+Multithreaded Web spider crawler written in Rust.
 ## Depensencies
@@ -39,7 +39,7 @@ fn main() {
 ## TODO
-- [ ] multi-threaded system
+- [x] multi-threaded system
 - [ ] respect _robot.txt_ file
 - [ ] add configuratioon object for polite delay, etc..
 - [ ] parse command line arguments
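The `fn main() {` context in the second hunk points at the README's usage example, which the diff does not show in full. For orientation, a minimal sketch of driving the crawler, assuming a `Website::new(url)` constructor, a `spider::website` module path, and a `get_pages()` accessor (only `crawl()` and `page.get_url()` are confirmed by this diff, so treat the other names as illustrative):

```rust
extern crate spider;

use spider::website::Website;

fn main() {
    // Point the crawler at a domain; crawl() walks it wave by wave,
    // now spawning one thread per discovered link.
    let mut website = Website::new("https://example.org");
    website.crawl();

    // Hypothetical accessor: iterate over the pages collected during the crawl.
    for page in website.get_pages() {
        println!("- {}", page.get_url());
    }
}
```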
src/page.rs
@@ -10,17 +10,20 @@ pub struct Page {
     /// URL of this page
     url: String,
     /// HTML parsed with [scraper](https://crates.io/crates/scraper) lib
-    html: Html,
+    html: String,
 }
 impl Page {
     /// Instanciate a new page a start to scrape it.
     pub fn new(url: &str) -> Self {
-        let html = Self::visit(url);
+        // TODO: handle uwrap here
+        let mut res = reqwest::get(url).unwrap();
+        let mut body = String::new();
+        res.read_to_string(&mut body).unwrap();
         Self {
             url: url.to_string(),
-            html: html,
+            html: body,
         }
     }
@@ -29,29 +32,21 @@ impl Page {
         self.url.clone()
     }
-    /// HTML getter
+    /// HTML parser
     pub fn get_html(&self) -> Html {
-        self.html.clone()
+        Html::parse_document(&self.html)
     }
-    /// Launch an HTTP GET query to te given URL & parse body response content
-    fn visit(url: &str) -> Html {
-        // TODO: handle uwrap here
-        let mut res = reqwest::get(url).unwrap();
-        let mut body = String::new();
-        res.read_to_string(&mut body).unwrap();
-        Html::parse_document(&body)
+    pub fn get_plain_html(&self) -> String {
+        self.html.clone()
     }
     /// Find all href links and return them
     pub fn links(&self, domain: &str) -> Vec<String> {
         let mut urls: Vec<String> = Vec::new();
         let selector = Selector::parse("a").unwrap();
-        for element in self.html.select(&selector) {
+        for element in self.get_html().select(&selector) {
             match element.value().attr("href") {
                 Some(href) => {
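The net effect of this hunk is that `Page` keeps the raw response body as a `String` and only builds the scraper DOM when `get_html()` is called. A plausible motivation, not spelled out in the commit, is thread-friendliness: a plain `String` is `Send` and can be handed back from a worker thread through `join()`, while scraper's parsed `Html` is not. A minimal sketch of the same fetch-then-parse-on-demand pattern, assuming the blocking `reqwest::get` API this commit itself uses; the helpers `fetch` and `hrefs` are illustrative names, not crate API:

```rust
extern crate reqwest;
extern crate scraper;

use std::io::Read;
use scraper::{Html, Selector};

/// Fetch a page body as plain text, mirroring the new `Page::new`.
fn fetch(url: &str) -> String {
    // Blocking GET; unwraps kept to match the commit's own TODO style.
    let mut res = reqwest::get(url).unwrap();
    let mut body = String::new();
    res.read_to_string(&mut body).unwrap();
    body
}

/// Parse the stored body on demand and collect href attributes,
/// mirroring get_html() + links().
fn hrefs(body: &str) -> Vec<String> {
    let document = Html::parse_document(body);
    let selector = Selector::parse("a").unwrap();
    document
        .select(&selector)
        .filter_map(|element| element.value().attr("href"))
        .map(str::to_string)
        .collect()
}
```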
src/website.rs
@@ -1,4 +1,6 @@
 use page::Page;
+use std::thread;
+use std::thread::JoinHandle;
 /// Represent a website to scrawl. To start crawling, instanciate a new `struct` using
 /// <pre>
@@ -46,25 +48,38 @@ impl Website {
     pub fn crawl(&mut self) {
         // scrawl while links exists
         while self.links.len() > 0 {
+            let mut workers: Vec<JoinHandle<Page>> = Vec::new();
             let mut new_links: Vec<String> = Vec::new();
             for link in &self.links {
+                // extends visibility
+                let thread_link: String = link.to_string();
                 // verify that URL was not already scrawled
                 if self.links_visited.contains(link) {
                     continue;
                 }
-                // scrape page & found links
-                let page = Page::new(link);
-                for link_founded in page.links(&self.domain) {
-                    // add only links not already vistited
-                    if !self.links_visited.contains(&link_founded) {
-                        new_links.push(link_founded);
-                    }
-                }
-                // add page to scrawled pages
-                self.pages.push(page);
-                self.links_visited.push(link.to_string());
+                workers.push(thread::spawn(move || Page::new(&thread_link)));
+            }
+            for worker in workers {
+                match worker.join() {
+                    Ok(page) => {
+                        // get links founded on
+                        for link_founded in page.links(&self.domain) {
+                            // add only links not already vistited
+                            if !self.links_visited.contains(&link_founded) {
+                                new_links.push(link_founded);
+                            }
+                        }
+                        // add page to scrawled pages
+                        self.links_visited.push(page.get_url());
+                        self.pages.push(page);
+                    }
+                    Err(_) => (),
+                }
             }
             self.links = new_links.clone();
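Taken together, `crawl()` now works in waves: it spawns one OS thread per not-yet-visited link in the current frontier, then joins every worker before building the next frontier. A stripped-down sketch of that fan-out/fan-in shape, using placeholder names (`crawl_wave` and the `format!` body stand in for the crate's real types):

```rust
use std::thread;
use std::thread::JoinHandle;

/// Spawn one worker per URL in the current frontier, then join them all.
/// The String results stand in for the crate's Page values.
fn crawl_wave(frontier: Vec<String>) -> Vec<String> {
    let workers: Vec<JoinHandle<String>> = frontier
        .into_iter()
        .map(|url| {
            thread::spawn(move || {
                // Placeholder for Page::new(&url): any Send value can cross join().
                format!("fetched {}", url)
            })
        })
        .collect();

    // Fan-in: a panicking worker is simply skipped, like the Err(_) => () arm above.
    workers
        .into_iter()
        .filter_map(|worker| worker.join().ok())
        .collect()
}

fn main() {
    let results = crawl_wave(vec![
        "https://example.org/a".to_string(),
        "https://example.org/b".to_string(),
    ]);
    println!("{:?}", results);
}
```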
