Permalink
Browse files

Implement --custom-hooks so that users can modify wpull_hook

  • Loading branch information...
ivan committed Feb 21, 2016
1 parent 7f426d2 commit c37b32bd1c95a39b7af92917c20d423c26b183af
Showing with 91 additions and 16 deletions.
  1. +5 −0 .editorconfig
  2. +9 −3 README.md
  3. +31 −0 extra_docs/custom_hooks_sample.py
  4. +1 −1 libgrabsite/__init__.py
  5. +15 −1 libgrabsite/main.py
  6. +30 −11 libgrabsite/wpull_hooks.py
View
@@ -0,0 +1,5 @@
root = true
[*]
indent_style = tab
indent_size = 4
View
@@ -227,9 +227,9 @@ Options can come before or after the URL.
the crawl by editing the `DIR/delay` file.
* `--warc-max-size=BYTES`: Try to limit each WARC file to around `BYTES` bytes
before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
Note that the resulting WARC files may be drastically larger if there are very
large responses.
before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
Note that the resulting WARC files may be drastically larger if there are very
large responses.
* `--level=N`: recurse `N` levels instead of `inf` levels.
@@ -251,6 +251,12 @@ Options can come before or after the URL.
Also useful: `--wpull-args=--no-skip-getaddrinfo` to respect `/etc/hosts` entries.
* `--custom-hooks=PY_SCRIPT`: Copy `PY_SCRIPT` to `DIR/custom_hooks.py`,
then exec `DIR/custom_hooks.py` on startup and every time it changes.
The script gets a `wpull_hook` global that can be used to change crawl behavior.
See [libgrabsite/wpull_hooks.py](https://github.com/ludios/grab-site/blob/master/libgrabsite/wpull_hooks.py)
and [custom_hooks_sample.py](https://github.com/ludios/grab-site/blob/master/extra_docs/custom_hooks_sample.py).
* `--help`: print help text.
### Warnings
@@ -0,0 +1,31 @@
"""
This is a sample script that can be passed to grab-site --custom-hooks=.
It drops http:// URLs before they can be queued, and it aborts responses
whose Content-Type header starts with 'audio/'.

grab-site executes this script with a `wpull_hook` global in scope.
"""
# Save the callback that is installed when this script runs, so that the
# accept_url wrapper defined below can delegate to it.
accept_url_grabsite = wpull_hook.callbacks.accept_url
def accept_url(url_info, record_info, verdict, reasons):
	"""Reject plain-http URLs; defer every other decision to grab-site.

	Returns False for any URL that starts with "http://", otherwise
	whatever the previously installed accept_url callback returns for
	the same arguments.
	"""
	target = url_info['url']
	if not target.startswith("http://"):
		# Not an insecure URL: let the saved grab-site callback decide.
		return accept_url_grabsite(url_info, record_info, verdict, reasons)
	print("Dropping insecure URL %s" % target)
	return False
def has_content_type_audio(response_info):
	"""Return True if the response's Content-Type starts with 'audio/'.

	response_info["fields"] is iterated as (name, value) pairs.  The
	header name is matched case-insensitively because HTTP field names
	are not case-sensitive; the first matching header decides the result.

	Returns False when no Content-Type header is present, when its value
	is None, or when a field entry is too short to index.
	"""
	try:
		content_type = next(
			(pair[1] for pair in response_info["fields"]
				if pair[0].lower() == "content-type"),
			None)
	except (IndexError, ValueError):
		# Malformed field entry: treat the response as non-audio.
		return False
	if content_type is None:
		return False
	return content_type.lower().startswith("audio/")
# Save the currently installed handle_pre_response callback so that the
# wrapper defined below can delegate to it.
handle_pre_response_grabsite = wpull_hook.callbacks.handle_pre_response
def handle_pre_response(url_info, url_record, response_info):
	"""Finish audio responses early; defer everything else to grab-site.

	Returns wpull_hook.actions.FINISH when the response has an audio
	Content-Type, otherwise whatever the previously installed
	handle_pre_response callback returns for the same arguments.
	"""
	target = url_info['url']
	if not has_content_type_audio(response_info):
		return handle_pre_response_grabsite(url_info, url_record, response_info)
	print("Dropping %s because it has audio mime type" % target)
	return wpull_hook.actions.FINISH
# Install the wrappers defined in this script as the active callbacks.
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.handle_pre_response = handle_pre_response
View
@@ -1 +1 @@
__version__ = '0.9.19'
__version__ = '0.10'
View
@@ -117,6 +117,13 @@ def print_version(ctx, param, value):
@click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
help='Move finished .warc.gz and .cdx files to this directory.')
@click.option('--custom-hooks', default=None, type=str, metavar='PY_SCRIPT',
help=
'Copy PY_SCRIPT to DIR/custom_hooks.py, then exec DIR/custom_hooks.py '
'on startup and every time it changes. The script gets a `wpull_hook` '
'global that can be used to change crawl behavior. '
'See libgrabsite/wpull_hooks.py and extra_docs/custom_hooks_sample.py.')
@click.option('--version', is_flag=True, callback=print_version,
expose_value=False, is_eager=True, help='Print version and exit.')
@@ -125,7 +132,7 @@ def print_version(ctx, param, value):
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
ignore_sets, igon, video, level, page_requisites_level, max_content_length,
sitemaps, dupespotter, warc_max_size, ua, input_file, wpull_args, start_url,
id, dir, finished_warc_dir):
id, dir, finished_warc_dir, custom_hooks):
if not (input_file or start_url):
print("Neither a START_URL or --input-file= was specified; see --help", file=sys.stderr)
sys.exit(1)
@@ -167,6 +174,13 @@ def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
temp_dir = os.path.join(working_dir, "temp")
os.makedirs(temp_dir)
DIR_custom_hooks = os.path.join(working_dir, "custom_hooks.py")
if custom_hooks:
shutil.copyfile(custom_hooks, DIR_custom_hooks)
else:
with open(DIR_custom_hooks, "wb") as _:
pass
def get_base_wpull_args():
return ["-U", ua,
"--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
View
@@ -183,6 +183,8 @@ def has_changed(self):
now_mtime = mtime_with_cache(self.fname)
changed = now_mtime != self.last_mtime
self.last_mtime = now_mtime
if changed:
print("Picked up the changes to %s" % self.fname)
return changed
@@ -191,6 +193,7 @@ def has_changed(self):
# One watcher per control file inside working_dir.
delay_watcher = FileChangedWatcher(os.path.join(working_dir, "delay"))
concurrency_watcher = FileChangedWatcher(os.path.join(working_dir, "concurrency"))
max_content_length_watcher = FileChangedWatcher(os.path.join(working_dir, "max_content_length"))
# Watches the custom_hooks.py file in working_dir.
custom_hooks_watcher = FileChangedWatcher(os.path.join(working_dir, "custom_hooks.py"))
ignoracle = Ignoracle()
@@ -514,22 +517,38 @@ def update_concurrency():
def wait_time(_):
	"""Return a random delay between the job's configured min and max.

	Before choosing the delay, this refreshes the delay settings, the
	concurrency level and the custom hooks, so edits to those files are
	picked up here.  The value drawn from
	job_data["delay_min"]..job_data["delay_max"] is divided by 1000
	(presumably milliseconds to seconds -- confirm against wpull).
	"""
	update_delay()
	# Piggyback on this callback to refresh the other live settings too.
	update_concurrency()
	update_custom_hooks()
	lower = job_data["delay_min"]
	upper = job_data["delay_max"]
	chosen = random.uniform(lower, upper)
	return chosen / 1000
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
@swallow_exception
def update_custom_hooks():
if not custom_hooks_watcher.has_changed():
return
wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.queued_url = queued_url
wpull_hook.callbacks.dequeued_url = dequeued_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
wpull_hook.callbacks.wait_time = wait_time
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
# Set these every time, because custom_hooks.py may be wrapping them,
# and when custom_hooks.py is reloaded, we want it to re-wrap the base functions
# instead of its already-wrapped functions.
wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.queued_url = queued_url
wpull_hook.callbacks.dequeued_url = dequeued_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
wpull_hook.callbacks.wait_time = wait_time
custom_hooks_filename = os.path.join(working_dir, "custom_hooks.py")
with open(custom_hooks_filename, 'rb') as in_file:
code = compile(in_file.read(), custom_hooks_filename, 'exec')
context = {'wpull_hook': wpull_hook}
exec(code, context, context)
update_custom_hooks()
really_swallow_exceptions = True

0 comments on commit c37b32b

Please sign in to comment.