Permalink
Browse files

extra_docs/custom_hooks_sample.py: add a hook that queues additional URLs
  • Loading branch information...
ivan committed Oct 25, 2017
1 parent 2b56a73 commit 7200878118158781b36862d25c687faa9e514420
Showing with 20 additions and 4 deletions.
  1. +19 −3 extra_docs/custom_hooks_sample.py
  2. +1 −1 libgrabsite/__init__.py
@@ -1,9 +1,15 @@
"""
This is a sample script that can be passed to grab-site --custom-hooks=.
It drops http:// URLs before they can be queued, and it aborts responses
that have a Content-Type: that starts with 'audio/'
This is a sample script that can be passed to grab-site --custom-hooks=. It
1) drops http:// URLs before they can be queued
2) aborts responses that have a Content-Type: that starts with 'audio/'
3) queues additional URLs on Twitter to get original-quality images
For self-help on writing hooks, `git clone https://github.com/chfoo/wpull`,
`git checkout v1.2.3`, and read wpull/hook.py.
"""
import re
# Keep a reference to the accept_url callback that is currently installed,
# because this script assigns its own accept_url to the same attribute.
accept_url_grabsite = wpull_hook.callbacks.accept_url
def accept_url(url_info, record_info, verdict, reasons):
url = url_info['url']
@@ -27,5 +33,15 @@ def handle_pre_response(url_info, url_record, response_info):
return wpull_hook.actions.FINISH
return handle_pre_response_grabsite(url_info, url_record, response_info)
def get_urls(filename, url_info, document_info):
    """
    Queue the original-quality (":orig") variant of a Twitter media image.

    This function is installed as wpull_hook.callbacks.get_urls.  Only
    url_info["url"] is read; filename and document_info are accepted to
    satisfy the callback signature and are otherwise unused.

    Returns a one-element list of dicts (keys "url" and "link_type") when
    the URL is under https://pbs.twimg.com/media/ and is not already the
    ":orig" variant; returns None when there is nothing to add.
    """
    url = url_info["url"]
    if not url.startswith("https://pbs.twimg.com/media/"):
        return None

    # If we see this URL, also queue the URL for the :orig quality image.
    # Any existing trailing size suffix (e.g. ":large") is removed first.
    new_url = re.sub(r":[a-z]{1,10}$", "", url) + ":orig"
    if new_url == url:
        # This already is the :orig URL; re-queueing it would only produce
        # a misleading "Queueing" message for a URL being fetched right now.
        return None

    extra_urls = [dict(url=new_url, link_type="media")]  # see wpull/item.py:LinkType
    print("Queueing %r" % (extra_urls,))
    return extra_urls
# Install this script's functions as the wpull hook callbacks.
wpull_hook.callbacks.accept_url = accept_url
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.get_urls = get_urls
View
@@ -1 +1 @@
__version__ = '1.3.1'
__version__ = '1.3.2'

0 comments on commit 7200878

Please sign in to comment.