Permalink
Browse files

Add --import-ignores for starting with a non-empty DIR/ignores file

  • Loading branch information...
ivan committed Dec 27, 2017
1 parent 6b6d578 commit 82de2f2b2bad56e69636783e93ec93392dc2d5f3
Showing with 14 additions and 6 deletions.
  1. +2 −0 .travis.yml
  2. +2 −0 README.md
  3. +1 −1 libgrabsite/__init__.py
  4. +9 −5 libgrabsite/main.py
View
@@ -16,6 +16,8 @@ script:
- ls -l www.google.com*
- gs-dump-urls www.google.com*/wpull.db done
- grab-site --1 --permanent-error-status-codes=404 https://www.google.com/
- echo '.*' > ignores
- grab-site --import-ignores ignores https://www.google.com/
- grab-site --1 --id my-id --no-dupespotter --no-video --concurrent 3 https://www.google.com/ https://www.google.com/intl/en/ads/
- grab-site --1 -i https://gist.githubusercontent.com/ivan/1d437237746cf067e134/raw/47d2253c0caf4a6b964dc657840ddce22eb9eac7/grab-site%2520test%2520urls.txt
- curl https://gist.githubusercontent.com/ivan/1d437237746cf067e134/raw/47d2253c0caf4a6b964dc657840ddce22eb9eac7/grab-site%2520test%2520urls.txt > local-url-list
View
@@ -292,6 +292,8 @@ Options can come before or after the URL.
Can be a range like X-Y to use a random delay between X and Y. Can be changed during
the crawl by editing the `DIR/delay` file.
* `--import-ignores`: Copy this file to `DIR/ignores` before the crawl begins.
* `--warc-max-size=BYTES`: Try to limit each WARC file to around `BYTES` bytes
before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
Note that the resulting WARC files may be drastically larger if there are very
View
@@ -1 +1 @@
__version__ = '1.5.3'
__version__ = '1.5.4'
View
@@ -71,6 +71,9 @@ def is_multicast(text):
@click.option('--ignore-sets', default="", metavar='LIST',
help='Alias for --igsets.')
@click.option('--import-ignores', default=None, metavar='FILE',
help='Copy this file to DIR/ignores before the crawl begins.')
@click.option('--igon/--igoff', default=False,
help=
'--igon (default: false) to print all URLs being ignored to the terminal '
@@ -169,10 +172,10 @@ def is_multicast(text):
@click.argument('start_url', nargs=-1, required=False)
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
ignore_sets, igon, video, level, page_requisites_level, max_content_length,
sitemaps, dupespotter, warc_max_size, ua, input_file, wpull_args, start_url,
id, dir, finished_warc_dir, permanent_error_status_codes, custom_hooks,
which_wpull_args_partial, which_wpull_command):
ignore_sets, import_ignores, igon, video, level, page_requisites_level,
max_content_length, sitemaps, dupespotter, warc_max_size, ua, input_file,
wpull_args, start_url, id, dir, finished_warc_dir, permanent_error_status_codes,
custom_hooks, which_wpull_args_partial, which_wpull_command):
if not (input_file or start_url):
print("Neither a START_URL or --input-file= was specified; see --help", file=sys.stderr)
sys.exit(1)
@@ -356,7 +359,8 @@ def get_base_wpull_args():
pass
with open("{}/ignores".format(working_dir), "w") as f:
pass
if import_ignores is not None:
f.write(open(import_ignores, "r").read())
with open("{}/delay".format(working_dir), "w") as f:
f.write(delay)

0 comments on commit 82de2f2

Please sign in to comment.