Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow excluding cache based on status code #1403

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,22 @@ Options:

[default: 1d]

--cache-exclude-status <CACHE_EXCLUDE_STATUS>
A list of status codes that will be excluded from the cache

The following range syntax is supported: [start]..[=]end|code. Some valid
examples are:

- 429
- 500..=599
- 500..

Use "lychee --cache-exclude-status '429, 500..502' <inputs>..." to provide a comma-separated
list of excluded status codes. This example will not cache results with a status code of 429, 500
and 501.

[default: 100..=103,200..=299]

--dump
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked

Expand Down
76 changes: 71 additions & 5 deletions lychee-bin/src/commands/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use reqwest::Url;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

use lychee_lib::{Client, ErrorKind, Request, Response};
use lychee_lib::{Client, ErrorKind, Request, Response, Uri};
use lychee_lib::{InputSource, Result};
use lychee_lib::{ResponseBody, Status};

Expand Down Expand Up @@ -45,6 +45,7 @@ where

let client = params.client;
let cache = params.cache;
let cache_exclude_status = params.cfg.cache_exclude_status.into_set();
let accept = params.cfg.accept.into_set();

let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info {
Expand All @@ -60,6 +61,7 @@ where
max_concurrency,
client,
cache,
cache_exclude_status,
accept,
));

Expand Down Expand Up @@ -216,14 +218,22 @@ async fn request_channel_task(
max_concurrency: usize,
client: Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
accept: HashSet<u16>,
) {
StreamExt::for_each_concurrent(
ReceiverStream::new(recv_req),
max_concurrency,
|request: Result<Request>| async {
let request = request.expect("cannot read request");
let response = handle(&client, cache.clone(), request, accept.clone()).await;
let response = handle(
&client,
cache.clone(),
cache_exclude_status.clone(),
request,
accept.clone(),
)
.await;

send_resp
.send(response)
Expand Down Expand Up @@ -257,6 +267,7 @@ async fn check_url(client: &Client, request: Request) -> Response {
async fn handle(
client: &Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
request: Request,
accept: HashSet<u16>,
) -> Response {
Expand Down Expand Up @@ -284,16 +295,29 @@ async fn handle(
// benefit.
// - Skip caching unsupported URLs as they might be supported in a
// future run.
// - Skip caching excluded links; they might not be excluded in the next run
// - Skip caching excluded links; they might not be excluded in the next run.
// - Skip caching links for which the status code has been explicitly excluded from the cache.
let status = response.status();
if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
if ignore_cache(&uri, status, &cache_exclude_status) {
return response;
}

cache.insert(uri, status.into());
response
}

/// Returns `true` when a response must not be written to the cache.
///
/// A response is kept out of the cache when any of the following holds:
/// - the URI is a file URI,
/// - the status is excluded, unsupported or unknown,
/// - the status carries a code that is listed in `cache_exclude_status`.
fn ignore_cache(uri: &Uri, status: &Status, cache_exclude_status: &HashSet<u16>) -> bool {
    // Conditions that do not depend on the user-supplied exclusion list.
    if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
        return true;
    }

    // A status without a code can never match the exclusion list.
    match status.code() {
        Some(code) => cache_exclude_status.contains(&code.as_u16()),
        None => false,
    }
}

fn show_progress(
output: &mut dyn Write,
progress_bar: &Option<ProgressBar>,
Expand Down Expand Up @@ -341,8 +365,9 @@ fn get_failed_urls(stats: &mut ResponseStats) -> Vec<(InputSource, Url)> {
#[cfg(test)]
mod tests {
use crate::{formatters::get_response_formatter, options};
use http::StatusCode;
use log::info;
use lychee_lib::{CacheStatus, ClientBuilder, InputSource, Uri};
use lychee_lib::{CacheStatus, ClientBuilder, ErrorKind, InputSource, Uri};

use super::*;

Expand Down Expand Up @@ -403,4 +428,45 @@ mod tests {
Status::Error(ErrorKind::InvalidURI(_))
));
}

/// Exercises every branch of `ignore_cache`: the cacheable case, file URIs,
/// unsupported and unknown statuses, and explicitly excluded status codes.
#[test]
fn test_ignore_cache() {
    // Element type (`u16`) is inferred from the `ignore_cache` signature.
    let no_exclusions = HashSet::new();
    let exclude_200 = HashSet::from([200]);

    let remote = Uri::try_from("https://[::1]").unwrap();
    let local_file = Uri::try_from("file:///home").unwrap();

    let ok = Status::Ok(StatusCode::OK);
    let unsupported = Status::Unsupported(ErrorKind::EmptyUrl);
    let unknown = Status::UnknownStatusCode(StatusCode::IM_A_TEAPOT);

    // Cache is not ignored
    assert!(!ignore_cache(&remote, &ok, &no_exclusions));

    // Cache is ignored for file URLs
    assert!(ignore_cache(&local_file, &ok, &no_exclusions));

    // Cache is ignored for unsupported status
    assert!(ignore_cache(&remote, &unsupported, &no_exclusions));

    // Cache is ignored for unknown status
    assert!(ignore_cache(&remote, &unknown, &no_exclusions));

    // Cache is ignored for excluded status codes
    assert!(ignore_cache(&remote, &ok, &exclude_200));
}
}
31 changes: 26 additions & 5 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use clap::builder::PossibleValuesParser;
use clap::{arg, builder::TypedValueParser, Parser};
use const_format::{concatcp, formatcp};
use lychee_lib::{
AcceptSelector, Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
Base, BasicAuthSelector, Input, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
};
use secrecy::{ExposeSecret, SecretString};
Expand Down Expand Up @@ -145,7 +145,7 @@ default_function! {
retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS;
method: String = DEFAULT_METHOD.to_string();
verbosity: Verbosity = Verbosity::default();
accept_selector: AcceptSelector = AcceptSelector::default();
accept_selector: StatusCodeSelector = StatusCodeSelector::default();
}

// Macro for merging configuration values
Expand Down Expand Up @@ -231,6 +231,26 @@ pub(crate) struct Config {
#[serde(with = "humantime_serde")]
pub(crate) max_cache_age: Duration,

/// A list of status codes that will be excluded from the cache
// NOTE(review): the long help below is presumably mirrored in README.md; keep both in sync.
#[arg(
    long,
    default_value_t,
    long_help = "A list of status codes that will be excluded from the cache

The following range syntax is supported: [start]..[=]end|code. Some valid
examples are:

- 429
- 500..=599
- 500..

Use \"lychee --cache-exclude-status '429, 500..502' <inputs>...\" to provide a comma-separated
list of excluded status codes. This example will not cache results with a status code of 429, 500
and 501."
)]
#[serde(default)]
pub(crate) cache_exclude_status: StatusCodeSelector,

/// Don't perform any link checking.
/// Instead, dump all the links extracted from inputs that would be checked
#[arg(long)]
Expand Down Expand Up @@ -394,7 +414,7 @@ separated list of accepted status codes. This example will accept 200, 201,
202, 203, 204, 429, and 500 as valid status codes."
)]
#[serde(default = "accept_selector")]
pub(crate) accept: AcceptSelector,
pub(crate) accept: StatusCodeSelector,

/// Enable the checking of fragments in links.
#[arg(long)]
Expand Down Expand Up @@ -498,6 +518,7 @@ impl Config {
max_retries: DEFAULT_MAX_RETRIES;
max_concurrency: DEFAULT_MAX_CONCURRENCY;
max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();
cache_exclude_status: StatusCodeSelector::new();
threads: None;
user_agent: DEFAULT_USER_AGENT;
insecure: false;
Expand Down Expand Up @@ -527,7 +548,7 @@ impl Config {
require_https: false;
cookie_jar: None;
include_fragments: false;
accept: AcceptSelector::default();
accept: StatusCodeSelector::default();
}

if self
Expand All @@ -553,7 +574,7 @@ mod tests {
#[test]
fn test_accept_status_codes() {
let toml = Config {
accept: AcceptSelector::from_str("200..=204, 429, 500").unwrap(),
accept: StatusCodeSelector::from_str("200..=204, 429, 500").unwrap(),
..Default::default()
};

Expand Down
59 changes: 59 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,65 @@ mod cli {
Ok(())
}

/// Verifies that `--cache-exclude-status` keeps the listed status codes out
/// of the cache file while other results are still cached.
#[tokio::test]
async fn test_lycheecache_exclude_custom_status_codes() -> Result<()> {
    let base_path = fixtures_path().join("cache");
    let cache_file = base_path.join(LYCHEE_CACHE_FILE);

    // Start from a clean slate; a missing cache file is not an error here.
    let _ = fs::remove_file(&cache_file);

    // One mock server per status code under test.
    let server_200 = mock_server!(StatusCode::OK);
    let server_204 = mock_server!(StatusCode::NO_CONTENT);
    let server_429 = mock_server!(StatusCode::TOO_MANY_REQUESTS);

    // Input document listing the three mock URLs, one per line.
    let dir = tempfile::tempdir()?;
    let input_path = dir.path().join("c.md");
    let mut input = File::create(&input_path)?;
    writeln!(input, "{}", server_200.uri().as_str())?;
    writeln!(input, "{}", server_204.uri().as_str())?;
    writeln!(input, "{}", server_429.uri().as_str())?;

    // Caching is enabled, but 204 and 429 are excluded from it.
    let mut cmd = main_command();
    let test_cmd = cmd
        .current_dir(&base_path)
        .arg(&input_path)
        .arg("--verbose")
        .arg("--no-progress")
        .arg("--cache")
        .arg("--cache-exclude-status")
        .arg("204,429");

    assert!(
        !cache_file.exists(),
        "cache file should not exist before this test"
    );

    // Log lines expected on stderr for each of the three links.
    let expected_200 = format!("[200] {}/\n", server_200.uri());
    let expected_204 = format!(
        "[204] {}/ | OK (204 No Content): No Content\n",
        server_204.uri()
    );
    let expected_429 = format!(
        "[429] {}/ | Failed: Network error: Too Many Requests\n",
        server_429.uri()
    );

    // No cache file exists yet, so this run is the one that creates it.
    test_cmd
        .assert()
        .stderr(contains(expected_200))
        .stderr(contains(expected_204))
        .stderr(contains(expected_429));

    // Only the 200 result may appear in the cache file.
    let data = fs::read_to_string(&cache_file)?;
    let cached_200 = format!("{}/,200", server_200.uri());
    let cached_204 = format!("{}/,204", server_204.uri());
    let cached_429 = format!("{}/,429", server_429.uri());
    assert!(data.contains(&cached_200));
    assert!(!data.contains(&cached_204));
    assert!(!data.contains(&cached_429));

    // Remove the cache file again so it does not leak into other runs.
    fs::remove_file(&cache_file)?;

    Ok(())
}

#[tokio::test]
async fn test_lycheecache_accept_custom_status_codes() -> Result<()> {
let base_path = fixtures_path().join("cache_accept_custom_status_codes");
Expand Down
4 changes: 2 additions & 2 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ pub use crate::{
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{
uri::valid::Uri, AcceptRange, AcceptRangeError, AcceptSelector, Base, BasicAuthCredentials,
uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials,
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent,
InputSource, Request, Response, ResponseBody, Result, Status,
InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeSelector,
},
};
2 changes: 0 additions & 2 deletions lychee-lib/src/types/accept/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
mod range;
mod selector;

pub use range::*;
pub use selector::*;
10 changes: 5 additions & 5 deletions lychee-lib/src/types/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use thiserror::Error;
use tokio::task::JoinError;

use super::InputContent;
use crate::types::AcceptSelectorError;
use crate::types::StatusCodeSelectorError;
use crate::{basic_auth::BasicAuthExtractorError, utils, Uri};

/// Kinds of status errors
Expand Down Expand Up @@ -142,9 +142,9 @@ pub enum ErrorKind {
#[error("Cannot load cookies")]
Cookies(String),

/// Accept selector parse error
#[error("Accept range error")]
AcceptSelectorError(#[from] AcceptSelectorError),
/// Status code selector parse error
#[error("Status code range error")]
StatusCodeSelectorError(#[from] StatusCodeSelectorError),
}

impl ErrorKind {
Expand Down Expand Up @@ -290,7 +290,7 @@ impl Hash for ErrorKind {
Self::TooManyRedirects(e) => e.to_string().hash(state),
Self::BasicAuthExtractorError(e) => e.to_string().hash(state),
Self::Cookies(e) => e.to_string().hash(state),
Self::AcceptSelectorError(e) => e.to_string().hash(state),
Self::StatusCodeSelectorError(e) => e.to_string().hash(state),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions lychee-lib/src/types/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub(crate) mod mail;
mod request;
mod response;
mod status;
mod status_code;
pub(crate) mod uri;

pub use accept::*;
Expand All @@ -25,6 +26,7 @@ pub use input::{Input, InputContent, InputSource};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use status_code::*;

/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;
3 changes: 3 additions & 0 deletions lychee-lib/src/types/status_code/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
mod selector;

pub use selector::*;
Loading
Loading