perf(latency): add connection re-use across request #25

Merged
merged 1 commit into from Apr 8, 2022
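For context: before this change, every call to `fetch_page_html` built its own `reqwest::Client`, so keep-alive connections could not be reused between page fetches. The PR builds one client up front and threads it through each fetch instead. A minimal before/after sketch of the idea, with illustrative function names rather than the crate's API:

```rust
use reqwest::{Client, Error};

// Before: a fresh client (and a fresh connection pool) for every request.
async fn fetch_once(url: &str, user_agent: &str) -> Result<String, Error> {
    let client = Client::builder().user_agent(user_agent).build()?;
    client.get(url).send().await?.text().await
}

// After: the caller builds one Client and reuses it, so keep-alive
// connections to the same host can be shared across requests.
async fn fetch_reused(url: &str, client: &Client) -> Result<String, Error> {
    client.get(url).send().await?.text().await
}
```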
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@
- feat(cli): add cli separation binary [#17](https://github.com/madeindjs/spider/pull/17/commits/b41e25fc507c6cd3ef251d2e25c97b936865e1a9)
- feat(robots): add robots crawl delay respect and ua assign [#24](https://github.com/madeindjs/spider/pull/24)
- feat(async): add async page body gathering
- perf(latency): add connection re-use across request [#25](https://github.com/madeindjs/spider/pull/25)

## v1.4.0

4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -15,10 +15,10 @@ edition = "2018"
maintenance = { status = "as-is" }

[dependencies]
reqwest = { version = "0.11" }
reqwest = { version = "0.11.10" }
scraper = "0.12"
robotparser-fork = "0.10.5"
url = "2.2"
rayon = "1.1"
rayon = "1.5"
num_cpus = "1.13.0"
tokio = { version = "^1.17.0", features = ["rt-multi-thread", "net", "macros"] }
2 changes: 2 additions & 0 deletions spider/src/lib.rs
@@ -12,3 +12,5 @@ pub mod configuration;
pub mod page;
/// A website to crawl
pub mod website;
/// Application utils
pub mod utils;
48 changes: 19 additions & 29 deletions spider/src/page.rs
@@ -1,7 +1,5 @@
use reqwest;
use scraper::{Html, Selector};
use url::Url;
use reqwest::Error;

/// Represents a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
///
@@ -14,33 +12,10 @@ pub struct Page {
html: String,
}

// TODO: RE-EXPORTING RUNTIME FROM RAYON instead install matching
#[tokio::main]
pub async fn fetch_page_html(url: &str, user_agent: &str) -> Result<String, Error> {
let client = reqwest::Client::builder()
.user_agent(user_agent)
.build()
.unwrap();

let mut body = String::new();

let res = client
.get(url)
.send()
.await;

match res {
Ok(result) => body = result.text().await?,
Err(e) => eprintln!("[error] {}: {}", url, e),
}

Ok(body)
}

impl Page {
/// Instantiate a new page and start to scrape it.
pub fn new(url: &str, user_agent: &str) -> Self {
Page::build(url, &fetch_page_html(url, user_agent).unwrap())
pub fn new(url: &str, html: &str) -> Self {
Page::build(url, html)
}

/// Instantiate a new page without scraping it (used for testing purposes)
@@ -96,7 +71,15 @@ impl Page {

#[test]
fn parse_links() {
let page: Page = Page::new("https://choosealicense.com/", "spider/1.1.2");
use crate::utils::{Client, fetch_page_html};
let client = Client::builder()
.user_agent("spider/1.1.2")
.build()
.unwrap();

let link_result = "https://choosealicense.com/";
let html = fetch_page_html(&link_result, &client).unwrap();
let page: Page = Page::new(&link_result, &html);

assert!(
page.links("https://choosealicense.com")
@@ -109,7 +92,14 @@ fn parse_links() {

#[test]
fn test_abs_path() {
let page: Page = Page::new("https://choosealicense.com/", "spider/1.1.2");
use crate::utils::{Client, fetch_page_html};
let client = Client::builder()
.user_agent("spider/1.1.2")
.build()
.unwrap();
let link_result = "https://choosealicense.com/";
let html = fetch_page_html(&link_result, &client).unwrap();
let page: Page = Page::new(&link_result, &html);

assert_eq!(
page.abs_path("/page"),
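The net effect on the `Page` API: `Page::new` no longer takes a user agent and fetches the page itself; it receives HTML that was fetched elsewhere. A rough sketch of the new shape, assuming `links` returns the page's absolute URLs as the tests suggest (the URL and HTML below are placeholders):

```rust
use spider::page::Page;

fn main() {
    // The HTML is fetched separately (see utils.rs below) and handed to Page::new.
    let html = r#"<html><body><a href="/about">about</a></body></html>"#;
    let page = Page::new("https://example.com/", html);

    // Links are resolved against the given domain.
    for link in page.links("https://example.com") {
        println!("{}", link);
    }
}
```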
13 changes: 13 additions & 0 deletions spider/src/utils.rs
@@ -0,0 +1,13 @@
pub use reqwest::{Client, Error};

#[tokio::main]
pub async fn fetch_page_html(url: &str, client: &Client) -> Result<String, Error> {
    let body = client
        .get(url)
        .send()
        .await?
        .text()
        .await?;

    Ok(body)
}
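A quick usage sketch of the new helper: build the `Client` once, as `Website::crawl` does below, and pass it to every fetch. Because `fetch_page_html` carries its own `#[tokio::main]` runtime, it can be called from synchronous code; the URL and user agent here are placeholders.

```rust
use spider::utils::{fetch_page_html, Client};

fn main() {
    // One client for the whole run, handed to every fetch.
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .expect("Failed building client.");

    let url = "https://choosealicense.com/";
    match fetch_page_html(url, &client) {
        Ok(html) => println!("{}: {} bytes fetched", url, html.len()),
        Err(e) => eprintln!("[error] {}: {}", url, e),
    }
}
```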
38 changes: 28 additions & 10 deletions spider/src/website.rs
@@ -5,6 +5,7 @@ use robotparser_fork::RobotFileParser;

use std::collections::HashSet;
use std::{sync, thread, time::Duration};
use crate::utils::{fetch_page_html, Client};

/// Represents a website to crawl. To start crawling, instantiate a new `struct` using
/// <pre>
@@ -33,8 +34,10 @@ pub struct Website<'a> {
pub on_link_find_callback: fn(String) -> String,
/// Robot.txt parser holder
robot_file_parser: RobotFileParser<'a>,
// Configured the robots parser
configured_robots_parser: bool
// configured the robots parser
configured_robots_parser: bool,
// fetch client
client: Client,
}

impl<'a> Website<'a> {
@@ -51,7 +54,8 @@ impl<'a> Website<'a> {
links_visited: HashSet::new(),
pages: Vec::new(),
robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish
on_link_find_callback: |s| s
on_link_find_callback: |s| s,
client: Client::new()
}
}

@@ -85,6 +89,11 @@ impl<'a> Website<'a> {
.num_threads(self.configuration.concurrency)
.build()
.expect("Failed building thread pool.");
self.client = Client::builder()
.user_agent(user_agent)
.pool_max_idle_per_host(0)
.build()
.expect("Failed building client.");

// crawl while links exists
while !self.links.is_empty() {
@@ -103,18 +112,17 @@
}

let tx = tx.clone();
let cx = self.client.clone();

pool.spawn(move || {
let link_result = on_link_find_callback(thread_link);
tx.send(Page::new(&link_result, &user_agent)).unwrap();
let html = fetch_page_html(&link_result, &cx).unwrap_or("".to_string());
tx.send(Page::new(&link_result, &html)).unwrap();
thread::sleep(delay);
});
});

drop(tx);
drop(&self.robot_file_parser);
drop(&self.on_link_find_callback);
drop(&self.links);

rx.into_iter().for_each(|page| {
let url = page.get_url();
@@ -173,6 +181,17 @@ fn crawl() {
);
}

#[test]
fn crawl_invalid() {
let url = "https://w.com";
let mut website: Website = Website::new(url);
website.crawl();
let mut uniq = HashSet::new();
uniq.insert(format!("{}/", url.to_string())); // TODO: remove trailing slash mutate

assert_eq!(website.links_visited, uniq); // only the target url should exist
}

#[test]
fn crawl_link_callback() {
let mut website: Website = Website::new("https://choosealicense.com");
@@ -181,7 +200,6 @@ fn crawl_link_callback() {
s
};
website.crawl();

assert!(
website
.links_visited
@@ -236,12 +254,12 @@ fn test_link_duplicates() {
T: IntoIterator,
T::Item: Eq + std::hash::Hash,
{
let mut uniq = std::collections::HashSet::new();
let mut uniq = HashSet::new();
iter.into_iter().all(move |x| uniq.insert(x))
}

let mut website: Website = Website::new("http://0.0.0.0:8000");
website.crawl();

assert!(has_unique_elements(&website.links_visited));
}
}
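One design note on the `let cx = self.client.clone();` inside `crawl`: `reqwest::Client` wraps its connection pool in an `Arc`, so cloning it for each spawned worker is cheap and every clone shares the same underlying client. A small sketch of that pattern under those assumptions (the URLs are placeholders):

```rust
use reqwest::Client;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Built once; the clones below all share this client's pool.
    let client = Client::builder().user_agent("spider/1.1.2").build()?;

    let mut handles = Vec::new();
    for url in vec!["https://example.com/a", "https://example.com/b"] {
        let client = client.clone(); // cheap: Client is an Arc-backed handle
        handles.push(tokio::spawn(async move {
            let body = client.get(url).send().await?.text().await?;
            Ok::<usize, reqwest::Error>(body.len())
        }));
    }

    for handle in handles {
        match handle.await.expect("task panicked") {
            Ok(len) => println!("fetched {} bytes", len),
            Err(e) => eprintln!("[error] {}", e),
        }
    }
    Ok(())
}
```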