perf(latency): add connection re-use across request #25

Merged
merged 1 commit into from Apr 8, 2022
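For context: before this change, every call to `fetch_page_html` built its own `reqwest::Client`, so keep-alive connections could not be reused between page fetches. The PR builds one client up front and threads it through each fetch instead. A minimal before/after sketch of the idea, with illustrative function names rather than the crate's API:

```rust
use reqwest::{Client, Error};

// Before: a fresh client (and a fresh connection pool) for every request.
async fn fetch_once(url: &str, user_agent: &str) -> Result<String, Error> {
    let client = Client::builder().user_agent(user_agent).build()?;
    client.get(url).send().await?.text().await
}

// After: the caller builds one Client and reuses it, so keep-alive
// connections to the same host can be shared across requests.
async fn fetch_reused(url: &str, client: &Client) -> Result<String, Error> {
    client.get(url).send().await?.text().await
}
```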
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@
- feat(cli): add cli separation binary [#17](https://github.com/madeindjs/spider/pull/17/commits/b41e25fc507c6cd3ef251d2e25c97b936865e1a9)
- feat(robots): add robots crawl delay respect and ua assign [#24](https://github.com/madeindjs/spider/pull/24)
- feat(async): add async page body gathering
- perf(latency): add connection re-use across request [#25](https://github.com/madeindjs/spider/pull/25)

## v1.4.0

4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -15,10 +15,10 @@ edition = "2018"
maintenance = { status = "as-is" }

[dependencies]
reqwest = { version = "0.11" }
reqwest = { version = "0.11.10" }
scraper = "0.12"
robotparser-fork = "0.10.5"
url = "2.2"
rayon = "1.1"
rayon = "1.5"
num_cpus = "1.13.0"
tokio = { version = "^1.17.0", features = ["rt-multi-thread", "net", "macros"] }
2 changes: 2 additions & 0 deletions spider/src/lib.rs
@@ -12,3 +12,5 @@ pub mod configuration;
pub mod page;
/// A website to crawl
pub mod website;
/// Application utils
pub mod utils;
48 changes: 19 additions & 29 deletions spider/src/page.rs
@@ -1,7 +1,5 @@
use reqwest;
use scraper::{Html, Selector};
use url::Url;
use reqwest::Error;

/// Represents a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
///
@@ -14,33 +12,10 @@ pub struct Page {
html: String,
}

// TODO: RE-EXPORTING RUNTIME FROM RAYON instead install matching
#[tokio::main]
pub async fn fetch_page_html(url: &str, user_agent: &str) -> Result<String, Error> {
let client = reqwest::Client::builder()
.user_agent(user_agent)
.build()
.unwrap();

let mut body = String::new();

let res = client
.get(url)
.send()
.await;

match res {
Ok(result) => body = result.text().await?,
Err(e) => eprintln!("[error] {}: {}", url, e),
}

Ok(body)
}

impl Page {
/// Instantiate a new page and start to scrape it.
pub fn new(url: &str, user_agent: &str) -> Self {
Page::build(url, &fetch_page_html(url, user_agent).unwrap())
pub fn new(url: &str, html: &str) -> Self {
Page::build(url, html)
}

/// Instantiate a new page without scraping it (used for testing purposes)
@@ -96,7 +71,15 @@ impl Page {

#[test]
fn parse_links() {
let page: Page = Page::new("https://choosealicense.com/", "spider/1.1.2");
use crate::utils::{Client, fetch_page_html};
let client = Client::builder()
.user_agent("spider/1.1.2")
.build()
.unwrap();

let link_result = "https://choosealicense.com/";
let html = fetch_page_html(&link_result, &client).unwrap();
let page: Page = Page::new(&link_result, &html);

assert!(
page.links("https://choosealicense.com")
@@ -109,7 +92,14 @@ fn parse_links() {

#[test]
fn test_abs_path() {
let page: Page = Page::new("https://choosealicense.com/", "spider/1.1.2");
use crate::utils::{Client, fetch_page_html};
let client = Client::builder()
.user_agent("spider/1.1.2")
.build()
.unwrap();
let link_result = "https://choosealicense.com/";
let html = fetch_page_html(&link_result, &client).unwrap();
let page: Page = Page::new(&link_result, &html);

assert_eq!(
page.abs_path("/page"),
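The net effect on the `Page` API: `Page::new` no longer takes a user agent and fetches the page itself; it receives HTML that was fetched elsewhere. A rough sketch of the new shape, assuming `links` returns the page's absolute URLs as the tests suggest (the URL and HTML below are placeholders):

```rust
use spider::page::Page;

fn main() {
    // The HTML is fetched separately (see utils.rs below) and handed to Page::new.
    let html = r#"<html><body><a href="/about">about</a></body></html>"#;
    let page = Page::new("https://example.com/", html);

    // Links are resolved against the given domain.
    for link in page.links("https://example.com") {
        println!("{}", link);
    }
}
```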
13 changes: 13 additions & 0 deletions spider/src/utils.rs
@@ -0,0 +1,13 @@
pub use reqwest::{Client, Error};

#[tokio::main]
pub async fn fetch_page_html(url: &str, client: &Client) -> Result<String, Error> {
    let body = client
        .get(url)
        .send()
        .await?
        .text()
        .await?;

    Ok(body)
}
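A quick usage sketch of the new helper: build the `Client` once, as `Website::crawl` does below, and pass it to every fetch. Because `fetch_page_html` carries its own `#[tokio::main]` runtime, it can be called from synchronous code; the URL and user agent here are placeholders.

```rust
use spider::utils::{fetch_page_html, Client};

fn main() {
    // One client for the whole run, handed to every fetch.
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .expect("Failed building client.");

    let url = "https://choosealicense.com/";
    match fetch_page_html(url, &client) {
        Ok(html) => println!("{}: {} bytes fetched", url, html.len()),
        Err(e) => eprintln!("[error] {}: {}", url, e),
    }
}
```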
38 changes: 28 additions & 10 deletions spider/src/website.rs
@@ -5,6 +5,7 @@ use robotparser_fork::RobotFileParser;

use std::collections::HashSet;
use std::{sync, thread, time::Duration};
use crate::utils::{fetch_page_html, Client};

/// Represents a website to crawl. To start crawling, instantiate a new `struct` using
/// <pre>
@@ -33,8 +34,10 @@ pub struct Website<'a> {
pub on_link_find_callback: fn(String) -> String,
/// Robot.txt parser holder
robot_file_parser: RobotFileParser<'a>,
// Configured the robots parser
configured_robots_parser: bool
// configured the robots parser
configured_robots_parser: bool,
// fetch client
client: Client,
}

impl<'a> Website<'a> {
@@ -51,7 +54,8 @@ impl<'a> Website<'a> {
links_visited: HashSet::new(),
pages: Vec::new(),
robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish
on_link_find_callback: |s| s
on_link_find_callback: |s| s,
client: Client::new()
}
}

@@ -85,6 +89,11 @@ impl<'a> Website<'a> {
.num_threads(self.configuration.concurrency)
.build()
.expect("Failed building thread pool.");
self.client = Client::builder()
.user_agent(user_agent)
.pool_max_idle_per_host(0)
.build()
.expect("Failed building client.");

// crawl while links exists
while !self.links.is_empty() {
@@ -103,18 +112,17 @@
}

let tx = tx.clone();
let cx = self.client.clone();

pool.spawn(move || {
let link_result = on_link_find_callback(thread_link);
tx.send(Page::new(&link_result, &user_agent)).unwrap();
let html = fetch_page_html(&link_result, &cx).unwrap_or("".to_string());
tx.send(Page::new(&link_result, &html)).unwrap();
thread::sleep(delay);
});
});

drop(tx);
drop(&self.robot_file_parser);
drop(&self.on_link_find_callback);
drop(&self.links);

rx.into_iter().for_each(|page| {
let url = page.get_url();
@@ -173,6 +181,17 @@ fn crawl() {
);
}

#[test]
fn crawl_invalid() {
let url = "https://w.com";
let mut website: Website = Website::new(url);
website.crawl();
let mut uniq = HashSet::new();
uniq.insert(format!("{}/", url.to_string())); // TODO: remove trailing slash mutate

assert_eq!(website.links_visited, uniq); // only the target url should exist
}

#[test]
fn crawl_link_callback() {
let mut website: Website = Website::new("https://choosealicense.com");
@@ -181,7 +200,6 @@ fn crawl_link_callback() {
s
};
website.crawl();

assert!(
website
.links_visited
@@ -236,12 +254,12 @@ fn test_link_duplicates() {
T: IntoIterator,
T::Item: Eq + std::hash::Hash,
{
let mut uniq = std::collections::HashSet::new();
let mut uniq = HashSet::new();
iter.into_iter().all(move |x| uniq.insert(x))
}

let mut website: Website = Website::new("http://0.0.0.0:8000");
website.crawl();

assert!(has_unique_elements(&website.links_visited));
}
}
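One design note on the `let cx = self.client.clone();` inside `crawl`: `reqwest::Client` wraps its connection pool in an `Arc`, so cloning it for each spawned worker is cheap and every clone shares the same underlying client. A small sketch of that pattern under those assumptions (the URLs are placeholders):

```rust
use reqwest::Client;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Built once; the clones below all share this client's pool.
    let client = Client::builder().user_agent("spider/1.1.2").build()?;

    let mut handles = Vec::new();
    for url in vec!["https://example.com/a", "https://example.com/b"] {
        let client = client.clone(); // cheap: Client is an Arc-backed handle
        handles.push(tokio::spawn(async move {
            let body = client.get(url).send().await?.text().await?;
            Ok::<usize, reqwest::Error>(body.len())
        }));
    }

    for handle in handles {
        match handle.await.expect("task panicked") {
            Ok(len) => println!("fetched {} bytes", len),
            Err(e) => eprintln!("[error] {}", e),
        }
    }
    Ok(())
}
```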