Add option to download inline links directly after their parent page when downloading recursively #2

Open · wants to merge 2 commits into master
10 changes: 10 additions & 0 deletions doc/wget.texi
@@ -1916,6 +1916,10 @@ case.
Turn on recursive retrieving. @xref{Recursive Download}, for more
details. The default maximum depth is 5.

@item --queue-type=@var{queuetype}
Specify the queue type (@pxref{Recursive Download}). Accepted values are @samp{fifo}
(the default) and @samp{browser}.

@item -l @var{depth}
@itemx --level=@var{depth}
Specify recursion maximum depth level @var{depth} (@pxref{Recursive
@@ -2296,6 +2300,12 @@ documents linked by them, and so on. In other words, Wget first
downloads the documents at depth 1, then those at depth 2, and so on
until the specified maximum depth.

The @dfn{queue type} is either @samp{fifo} (the default) or
@samp{browser}.  With @samp{fifo}, Wget downloads the first enqueued
links first.  With @samp{browser}, inline links are downloaded directly
after their parent page.  If the parent page contains temporary inline
links, this keeps them from expiring before they are downloaded; pages
sometimes use temporary links to prevent direct linking to files.

The maximum @dfn{depth} to which the retrieval may descend is specified
with the @samp{-l} option. The default maximum depth is five layers.
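
To illustrate the ordering difference described above, here is a minimal standalone C sketch. It is only an illustration under simplified assumptions: the queue, the URLs, and the is_inline flags are hypothetical, not wget's own data structures, but it mimics the append/prepend discipline the option selects.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Hypothetical queue element: just a URL and a next pointer.  */
struct element {
  const char *url;
  struct element *next;
};

struct queue {
  struct element *head, *tail;
};

static void
append (struct queue *q, const char *url)
{
  struct element *e = calloc (1, sizeof *e);
  e->url = url;
  if (q->tail)
    q->tail->next = e;
  q->tail = e;
  if (!q->head)
    q->head = q->tail;
}

static void
prepend (struct queue *q, const char *url)
{
  struct element *e = calloc (1, sizeof *e);
  e->url = url;
  e->next = q->head;
  q->head = e;
  if (!q->tail)
    q->tail = q->head;
}

int
main (void)
{
  bool browser_mode = true;   /* set to false to see plain FIFO order */
  struct queue q = { NULL, NULL };

  /* Links discovered while parsing a hypothetical parent page, in
     document order; inline resources (images, CSS) are marked true.  */
  struct { const char *url; bool is_inline; } found[] = {
    { "second.html", false },
    { "image.svg",   true  },
    { "style.css",   true  },
    { "third.html",  false },
  };

  for (size_t i = 0; i < sizeof found / sizeof found[0]; i++)
    {
      if (browser_mode && found[i].is_inline)
        prepend (&q, found[i].url);   /* inline: fetch right after parent */
      else
        append (&q, found[i].url);    /* page links: keep FIFO order */
    }

  /* browser: style.css, image.svg, second.html, third.html
     fifo:    second.html, image.svg, style.css, third.html  */
  for (struct element *e = q.head; e; e = e->next)
    printf ("%s\n", e->url);
  return 0;
}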

21 changes: 21 additions & 0 deletions src/init.c
@@ -104,6 +104,7 @@ CMD_DECLARE (cmd_spec_htmlify);
CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_prefer_family);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_queue_type);
CMD_DECLARE (cmd_spec_recursive);
CMD_DECLARE (cmd_spec_regex_type);
CMD_DECLARE (cmd_spec_restrict_file_names);
@@ -247,6 +248,7 @@ static const struct {
{ "proxypasswd", &opt.proxy_passwd, cmd_string }, /* deprecated */
{ "proxypassword", &opt.proxy_passwd, cmd_string },
{ "proxyuser", &opt.proxy_user, cmd_string },
{ "queuetype", &opt.queue_type, cmd_spec_queue_type },
{ "quiet", &opt.quiet, cmd_boolean },
{ "quota", &opt.quota, cmd_bytes_sum },
#ifdef HAVE_SSL
@@ -403,6 +405,8 @@ defaults (void)
opt.restrict_files_nonascii = false;
opt.restrict_files_case = restrict_no_case_restriction;

opt.queue_type = queue_type_fifo;

opt.regex_type = regex_type_posix;

opt.max_redirect = 20;
@@ -1441,6 +1445,23 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored _GL_UN
return true;
}

/* Validate --queue-type and set the choice. */

static bool
cmd_spec_queue_type (const char *com, const char *val, void *place_ignored _GL_UNUSED)
{
static const struct decode_item choices[] = {
{ "fifo", queue_type_fifo },
{ "browser", queue_type_browser },
};
int queue_type = queue_type_fifo;
int ok = decode_string (val, choices, countof (choices), &queue_type);
if (!ok)
fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
opt.queue_type = queue_type;
return ok;
}
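
cmd_spec_queue_type follows the same table-driven validation pattern as cmd_spec_regex_type below it. A minimal standalone sketch of that pattern, with simplified local stand-ins for decode_item and decode_string (not wget's real declarations), looks like this:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Simplified stand-in for wget's decode_item: a name and its code.  */
struct decode_item {
  const char *name;
  int code;
};

/* Simplified stand-in for wget's decode_string: look NAME up in ITEMS
   and store the matching code in *PLACE.  (Case-sensitive here for
   brevity; the real helper may differ.)  */
static bool
decode_string (const char *name, const struct decode_item *items,
               int count, int *place)
{
  for (int i = 0; i < count; i++)
    if (strcmp (name, items[i].name) == 0)
      {
        *place = items[i].code;
        return true;
      }
  return false;
}

enum { queue_type_fifo, queue_type_browser };

int
main (void)
{
  static const struct decode_item choices[] = {
    { "fifo", queue_type_fifo },
    { "browser", queue_type_browser },
  };
  int queue_type = queue_type_fifo;

  if (!decode_string ("browser", choices,
                      (int) (sizeof choices / sizeof choices[0]),
                      &queue_type))
    fprintf (stderr, "invalid queue type\n");

  printf ("queue_type = %d\n", queue_type);  /* 1 == queue_type_browser */
  return 0;
}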

/* Validate --regex-type and set the choice. */

static bool
3 changes: 3 additions & 0 deletions src/main.c
@@ -272,6 +272,7 @@ static struct cmdline_option option_data[] =
{ "proxy-passwd", 0, OPT_VALUE, "proxypassword", -1 }, /* deprecated */
{ "proxy-password", 0, OPT_VALUE, "proxypassword", -1 },
{ "proxy-user", 0, OPT_VALUE, "proxyuser", -1 },
{ "queue-type", 0, OPT_VALUE, "queuetype", -1 },
{ "quiet", 'q', OPT_BOOLEAN, "quiet", -1 },
{ "quota", 'Q', OPT_VALUE, "quota", -1 },
{ "random-file", 0, OPT_VALUE, "randomfile", -1 },
@@ -736,6 +737,8 @@ WARC options:\n"),
Recursive download:\n"),
N_("\
-r, --recursive specify recursive download\n"),
N_("\
--queue-type=TYPE queue type (fifo|browser)\n"),
N_("\
-l, --level=NUMBER maximum recursion depth (inf or 0 for infinite)\n"),
N_("\
4 changes: 4 additions & 0 deletions src/options.h
@@ -46,6 +46,10 @@ struct options
bool relative_only; /* Follow only relative links. */
bool no_parent; /* Restrict access to the parent
directory. */
enum {
queue_type_fifo,
queue_type_browser
} queue_type; /* Recursion queue type */
int reclevel; /* Maximum level of recursion */
bool dirstruct; /* Do we build the directory structure
as we go along? */
29 changes: 22 additions & 7 deletions src/recur.c
@@ -90,13 +90,17 @@ url_queue_delete (struct url_queue *queue)

/* Enqueue a URL in the queue. With the FIFO queue type, items will be
retrieved ("dequeued") from the queue in the order they were placed
into it. With the browser queue type, inline items are retrieved
directly after their parent page. */

static void
url_enqueue (struct url_queue *queue, struct iri *i,
const char *url, const char *referer, int depth,
bool html_allowed, bool css_allowed)
{
bool append = opt.queue_type == queue_type_fifo
|| (opt.queue_type == queue_type_browser && html_allowed);

struct queue_element *qel = xnew (struct queue_element);
qel->iri = i;
qel->url = url;
@@ -110,20 +114,31 @@ url_enqueue (struct url_queue *queue, struct iri *i,
if (queue->count > queue->maxcount)
queue->maxcount = queue->count;

DEBUGP (("Enqueuing %s at depth %d\n",
DEBUGP (("%s %s at depth %d\n", append ? "Appending" : "Prepending",
quotearg_n_style (0, escape_quoting_style, url), depth));
DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

if (i)
DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));
DEBUGP (("[IRI %s %s with %s\n", append ? "Appending" : "Prepending",
quote_n (0, url), i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));

if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
if (append)
{
if (queue->tail)
queue->tail->next = qel;
queue->tail = qel;
}
else
{
if (queue->head)
qel->next = queue->head;
queue->head = qel;
}

if (!queue->head)
queue->head = queue->tail;
if (!queue->tail)
queue->tail = queue->head;
}
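
The behavioural change hinges on the append flag computed at the top of url_enqueue above. The following standalone sketch mirrors that expression (hypothetical code, not part of the patch) and prints the resulting policy as a small truth table:

#include <stdio.h>
#include <stdbool.h>

enum queue_type { queue_type_fifo, queue_type_browser };

/* Mirror of the patch's decision: append under FIFO; under "browser",
   append only links that may yield HTML pages, and prepend inline
   resources (html_allowed == false) so they follow their parent.  */
static bool
should_append (enum queue_type type, bool html_allowed)
{
  return type == queue_type_fifo
         || (type == queue_type_browser && html_allowed);
}

int
main (void)
{
  const char *names[] = { "fifo", "browser" };
  for (int t = queue_type_fifo; t <= queue_type_browser; t++)
    for (int html = 0; html <= 1; html++)
      printf ("%-7s html_allowed=%d -> %s\n", names[t], html,
              should_append ((enum queue_type) t, html) ? "append" : "prepend");
  return 0;
}

Under fifo everything is appended; under browser only HTML-capable links go to the back of the queue, while inline resources are prepended so they are fetched right after their parent, which is the ordering the new test below expects.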

/* Take a URL out of the queue. Return true if this operation
1 change: 1 addition & 0 deletions testenv/Makefile.am
@@ -52,6 +52,7 @@ if HAVE_PYTHON3
Test-Post.py \
Test-504.py \
Test--spider-r.py \
Test--spider-r-browser.py \
Test-redirect-crash.py

# added test cases expected to fail here and under TESTS
2 changes: 1 addition & 1 deletion testenv/README
@@ -148,7 +148,7 @@ Various variables used consistently across all tests are:
* ExpectedDownloadedFiles: A list of files that are expected in the local
directory after Wget has finished executing. This does not include the files
already existing before Wget was launched and must be mentioned again.
* Request_List: An unordered list of Requests that each server must receive.
* Request_List: An ordered list of Requests that each server must receive.
This too is a list of lists and follows the same convention as others above.

Both, the HTTPTest and FTPTest modules have the same prototype:
109 changes: 109 additions & 0 deletions testenv/Test--spider-r-browser.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
from sys import exit
from test.http_test import HTTPTest
from misc.wget_file import WgetFile

"""
This test executes Wget in recursive spider mode with --queue-type=browser.
"""
TEST_NAME = "Recursive Spider Browser"
############# File Definitions ###############################################
mainpage = """
<html>
<head>
<title>Main Page</title>
</head>
<body>
<p>
Some text and a link to a <a href="http://127.0.0.1:{{port}}/secondpage.html">second page</a>.
Also, an image <img src="http://127.0.0.1:{{port}}/image.svg">
Also, a <a href="http://127.0.0.1:{{port}}/nonexistent">broken link</a>.
</p>
</body>
</html>
"""


secondpage = """
<html>
<head>
<title>Second Page</title>
</head>
<body>
<p>
Some text and a link to a <a href="http://127.0.0.1:{{port}}/thirdpage.html">third page</a>.
Also, a <a href="http://127.0.0.1:{{port}}/nonexistent">broken link</a>.
</p>
</body>
</html>
"""

thirdpage = """
<html>
<head>
<title>Third Page</title>
</head>
<body>
<p>
Some text and a link to a <a href="http://127.0.0.1:{{port}}/dummy.txt">text file</a>.
Also, another <a href="http://127.0.0.1:{{port}}/againnonexistent">broken link</a>.
</p>
</body>
</html>
"""

dummyfile = "Don't care."


index_html = WgetFile ("index.html", mainpage)
image_svg = WgetFile ("image.svg", dummyfile)
secondpage_html = WgetFile ("secondpage.html", secondpage)
thirdpage_html = WgetFile ("thirdpage.html", thirdpage)
dummy_txt = WgetFile ("dummy.txt", dummyfile)

Request_List = [
[
"HEAD /",
"GET /",
"GET /robots.txt",
"HEAD /image.svg",
"HEAD /secondpage.html",
"GET /secondpage.html",
"HEAD /nonexistent",
"HEAD /thirdpage.html",
"GET /thirdpage.html",
"HEAD /dummy.txt",
"HEAD /againnonexistent"
]
]

WGET_OPTIONS = "--spider -r --queue-type=browser"
WGET_URLS = [[""]]

Files = [[index_html, image_svg, secondpage_html, thirdpage_html, dummy_txt]]

ExpectedReturnCode = 8
ExpectedDownloadedFiles = []

################ Pre and Post Test Hooks #####################################
pre_test = {
"ServerFiles" : Files
}
test_options = {
"WgetCommands" : WGET_OPTIONS,
"Urls" : WGET_URLS
}
post_test = {
"ExpectedFiles" : ExpectedDownloadedFiles,
"ExpectedRetcode" : ExpectedReturnCode,
"FilesCrawled" : Request_List
}

err = HTTPTest (
name=TEST_NAME,
pre_hook=pre_test,
test_params=test_options,
post_hook=post_test
).begin ()

exit (err)
12 changes: 5 additions & 7 deletions testenv/conf/files_crawled.py
@@ -1,4 +1,4 @@
from misc.colour_terminal import print_red
from misc.colour_terminal import print_green, print_red
from conf import hook
from exc.test_failed import TestFailed

@@ -18,10 +18,8 @@ def __init__(self, request_headers):
self.request_headers = request_headers

def __call__(self, test_obj):
for headers, remaining in zip(map(set, self.request_headers),
test_obj.request_remaining()):
diff = headers.symmetric_difference(remaining)
if self.request_headers != test_obj.request_remaining():
print_green ('Expected: %s' % self.request_headers)
print_red ('Got: %s' % test_obj.request_remaining())

if diff:
print_red (str(diff))
raise TestFailed('Not all files were crawled correctly.')
raise TestFailed('Not all files were crawled correctly.')