Skip to content

Commit

Permalink
Implement --custom-hooks so that users can modify wpull_hook
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan committed Feb 21, 2016
1 parent 7f426d2 commit c37b32b
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 16 deletions.
5 changes: 5 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
root = true

[*]
indent_style = tab
indent_size = 4
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ Options can come before or after the URL.
the crawl by editing the `DIR/delay` file.

* `--warc-max-size=BYTES`: Try to limit each WARC file to around `BYTES` bytes
before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
Note that the resulting WARC files may be drastically larger if there are very
large responses.
before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
Note that the resulting WARC files may be drastically larger if there are very
large responses.

* `--level=N`: recurse `N` levels instead of `inf` levels.

Expand All @@ -251,6 +251,12 @@ Options can come before or after the URL.

Also useful: `--wpull-args=--no-skip-getaddrinfo` to respect `/etc/hosts` entries.

* `--custom-hooks=PY_SCRIPT`: Copy `PY_SCRIPT` to `DIR/custom_hooks.py`,
then exec `DIR/custom_hooks.py` on startup and every time it changes.
The script gets a `wpull_hook` global that can be used to change crawl behavior.
See [libgrabsite/wpull_hooks.py](https://github.com/ludios/grab-site/blob/master/libgrabsite/wpull_hooks.py)
and [custom_hooks_sample.py](https://github.com/ludios/grab-site/blob/master/extra_docs/custom_hooks_sample.py).

* `--help`: print help text.

### Warnings
Expand Down
31 changes: 31 additions & 0 deletions extra_docs/custom_hooks_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
This is a sample script that can be passed to grab-site --custom-hooks=.
It drops http:// URLs before they can be queued, and it aborts responses
that have a Content-Type: that starts with 'audio/'
"""

# Remember the accept_url callback that was installed before this script ran,
# so the wrapper below can defer to it.
accept_url_grabsite = wpull_hook.callbacks.accept_url
def accept_url(url_info, record_info, verdict, reasons):
	"""Reject URLs that start with "http://"; defer everything else.

	Returns False (after printing a notice) when url_info['url'] begins
	with "http://". Otherwise returns whatever the previously installed
	accept_url callback returns for the same arguments.
	"""
	candidate = url_info['url']
	if not candidate.startswith("http://"):
		return accept_url_grabsite(url_info, record_info, verdict, reasons)
	print("Dropping insecure URL %s" % candidate)
	return False

def has_content_type_audio(response_info):
	"""Return True if the first Content-Type header value starts with "audio/".

	response_info is expected to be a mapping whose "fields" entry is an
	iterable of (name, value) pairs (assumed from how this sample indexes
	it; confirm against the wpull hook documentation).

	Only the first Content-Type field is inspected. Returns False when
	there is no "fields" entry, no Content-Type field, or the matching
	field has no value.
	"""
	try:
		for field in response_info["fields"]:
			name = field[0]
			# HTTP header field names are case-insensitive, so do not
			# require the exact spelling "Content-Type".
			if isinstance(name, str) and name.lower() == "content-type":
				return field[1].lower().startswith("audio/")
	except (IndexError, KeyError, ValueError):
		# Best effort: a malformed or missing header list means "not audio".
		return False
	return False

# Remember the handle_pre_response callback that was installed before this
# script ran, so the wrapper below can defer to it.
handle_pre_response_grabsite = wpull_hook.callbacks.handle_pre_response
def handle_pre_response(url_info, url_record, response_info):
	"""Finish audio responses early; defer everything else.

	When has_content_type_audio(response_info) is true, prints a notice and
	returns wpull_hook.actions.FINISH. Otherwise returns whatever the
	previously installed handle_pre_response callback returns.
	"""
	target = url_info['url']
	if not has_content_type_audio(response_info):
		return handle_pre_response_grabsite(url_info, url_record, response_info)
	print("Dropping %s because it has audio mime type" % target)
	return wpull_hook.actions.FINISH

# Install the wrappers defined above in place of the callbacks that were
# saved in accept_url_grabsite / handle_pre_response_grabsite.
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.handle_pre_response = handle_pre_response
2 changes: 1 addition & 1 deletion libgrabsite/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.9.19'
__version__ = '0.10'
16 changes: 15 additions & 1 deletion libgrabsite/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ def print_version(ctx, param, value):
@click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
help='Move finished .warc.gz and .cdx files to this directory.')

@click.option('--custom-hooks', default=None, type=str, metavar='PY_SCRIPT',
help=
'Copy PY_SCRIPT to DIR/custom_hooks.py, then exec DIR/custom_hooks.py '
'on startup and every time it changes. The script gets a `wpull_hook` '
'global that can be used to change crawl behavior. '
'See libgrabsite/wpull_hooks.py and extra_docs/custom_hooks_sample.py.')

@click.option('--version', is_flag=True, callback=print_version,
expose_value=False, is_eager=True, help='Print version and exit.')

Expand All @@ -125,7 +132,7 @@ def print_version(ctx, param, value):
def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
ignore_sets, igon, video, level, page_requisites_level, max_content_length,
sitemaps, dupespotter, warc_max_size, ua, input_file, wpull_args, start_url,
id, dir, finished_warc_dir):
id, dir, finished_warc_dir, custom_hooks):
if not (input_file or start_url):
print("Neither a START_URL or --input-file= was specified; see --help", file=sys.stderr)
sys.exit(1)
Expand Down Expand Up @@ -167,6 +174,13 @@ def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
temp_dir = os.path.join(working_dir, "temp")
os.makedirs(temp_dir)

DIR_custom_hooks = os.path.join(working_dir, "custom_hooks.py")
if custom_hooks:
shutil.copyfile(custom_hooks, DIR_custom_hooks)
else:
with open(DIR_custom_hooks, "wb") as _:
pass

def get_base_wpull_args():
return ["-U", ua,
"--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
Expand Down
41 changes: 30 additions & 11 deletions libgrabsite/wpull_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ def has_changed(self):
now_mtime = mtime_with_cache(self.fname)
changed = now_mtime != self.last_mtime
self.last_mtime = now_mtime
if changed:
print("Picked up the changes to %s" % self.fname)
return changed


Expand All @@ -191,6 +193,7 @@ def has_changed(self):
# One watcher per control file in working_dir; each is polled via
# has_changed() to notice edits to its file.
delay_watcher = FileChangedWatcher(os.path.join(working_dir, "delay"))
concurrency_watcher = FileChangedWatcher(os.path.join(working_dir, "concurrency"))
max_content_length_watcher = FileChangedWatcher(os.path.join(working_dir, "max_content_length"))
custom_hooks_watcher = FileChangedWatcher(os.path.join(working_dir, "custom_hooks.py"))

ignoracle = Ignoracle()

Expand Down Expand Up @@ -514,22 +517,38 @@ def update_concurrency():

def wait_time(_):
	"""Refresh the runtime-tunable settings, then pick the next wait time.

	Calls update_delay, update_concurrency and update_custom_hooks (in that
	order), then returns a uniformly random value between
	job_data["delay_min"] and job_data["delay_max"], divided by 1000
	(presumably converting milliseconds to seconds; confirm against the
	wpull wait_time contract). The single argument is ignored.
	"""
	# This callback is used as a convenient place to also re-check the
	# concurrency level and the custom hooks script.
	for refresh in (update_delay, update_concurrency, update_custom_hooks):
		refresh()

	chosen_delay = random.uniform(job_data["delay_min"], job_data["delay_max"])
	return chosen_delay / 1000


assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS
@swallow_exception
def update_custom_hooks():
if not custom_hooks_watcher.has_changed():
return

wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.queued_url = queued_url
wpull_hook.callbacks.dequeued_url = dequeued_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
wpull_hook.callbacks.wait_time = wait_time
assert 2 in wpull_hook.callbacks.AVAILABLE_VERSIONS

# Set these every time, because custom_hooks.py may be wrapping them,
# and when custom_hooks.py is reloaded, we want it to re-wrap the base functions
# instead of its already-wrapped functions.
wpull_hook.callbacks.version = 2
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.queued_url = queued_url
wpull_hook.callbacks.dequeued_url = dequeued_url
wpull_hook.callbacks.handle_response = handle_response
wpull_hook.callbacks.handle_error = handle_error
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
wpull_hook.callbacks.wait_time = wait_time

custom_hooks_filename = os.path.join(working_dir, "custom_hooks.py")
with open(custom_hooks_filename, 'rb') as in_file:
code = compile(in_file.read(), custom_hooks_filename, 'exec')
context = {'wpull_hook': wpull_hook}
exec(code, context, context)

update_custom_hooks()

really_swallow_exceptions = True

0 comments on commit c37b32b

Please sign in to comment.