Permalink
Browse files

Add default get_urls hook to get :orig images on Twitter and ?share=1 pages on Quora
  • Loading branch information...
ivan committed Aug 19, 2018
1 parent a3f1c51 commit 0ea3d4093860ac526ea5e2d8c591ea31df3ccd44
Showing with 19 additions and 2 deletions.
  1. +2 −1 extra_docs/custom_hooks_sample.py
  2. +1 −1 libgrabsite/__init__.py
  3. +16 −0 libgrabsite/wpull_hooks.py
@@ -38,7 +38,8 @@ def get_urls(filename, url_info, document_info):
# If we see this URL, also queue the URL for the :orig quality image
if url.startswith("https://pbs.twimg.com/media/"):
new_url = re.sub(":[a-z]{1,10}$", "", url) + ":orig"
extra_urls = [dict(url=new_url, link_type="media")] # see wpull/item.py:LinkType
# see wpull/item.py:LinkType
extra_urls = [dict(url=new_url, link_type="media", inline=True)]
print("Queueing %r" % (extra_urls,))
return extra_urls
View
@@ -1 +1 @@
__version__ = '1.7.2'
__version__ = '1.7.3'
View
@@ -532,6 +532,21 @@ def wait_time(seconds, url_info, record_info, response_info, error_info):
return random.uniform(job_data["delay_min"], job_data["delay_max"]) / 1000
def get_urls(filename, url_info, document_info):
    """Return extra URLs to queue for the URL that was just processed.

    Only ``url_info["url"]`` is inspected; ``filename`` and
    ``document_info`` are accepted to match the hook signature but are
    not used here.

    Returns a one-element list of dicts (keys ``url``, ``link_type`` and,
    for Twitter media, ``inline``) when the URL matches one of the special
    cases below, otherwise ``None``.
    """
    url = url_info["url"]

    # If we see this URL, also queue the URL for the :orig quality image
    if url.startswith("https://pbs.twimg.com/media/"):
        # Drop any existing trailing size suffix (e.g. ":large") so that we
        # never produce something like ":large:orig".
        new_url = re.sub(r":[a-z]{1,10}$", "", url) + ":orig"
        # see wpull/item.py:LinkType
        return [dict(url=new_url, link_type="media", inline=True)]

    # Quora shows login-required screen unless you add ?share=1.
    # URLs that already carry a query string are left alone.
    if url.startswith("https://www.quora.com/") and "?" not in url:
        return [dict(url=url + "?share=1", link_type="html")]

    return None
@swallow_exception
def update_custom_hooks():
if not custom_hooks_watcher.has_changed():
@@ -551,6 +566,7 @@ def update_custom_hooks():
wpull_hook.callbacks.handle_pre_response = handle_pre_response
wpull_hook.callbacks.exit_status = exit_status
wpull_hook.callbacks.wait_time = wait_time
wpull_hook.callbacks.get_urls = get_urls
custom_hooks_filename = os.path.join(working_dir, "custom_hooks.py")
with open(custom_hooks_filename, 'rb') as in_file:

0 comments on commit 0ea3d40

Please sign in to comment.