
Use DIR/scrape file to control whether to scrape for new URLs in responses

present = do scrape
missing = don't scrape
ivan committed Aug 28, 2018
1 parent 90c3752 commit cdd79287502ce9b46be53c7b96b5c2c6caa27539
Showing with 20 additions and 1 deletion.
  1. +1 −1 libgrabsite/__init__.py
  2. +3 −0 libgrabsite/main.py
  3. +16 −0 libgrabsite/wpull_hooks.py
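
As the commit message describes, scraping is controlled purely by the presence of a control file in the crawl's working directory: if DIR/scrape exists, responses are scraped for new URLs; if it is missing, scraping stops while already-queued URLs continue to be fetched. A minimal sketch of turning discovery off mid-crawl, assuming a hypothetical crawl directory path (substitute the DIR grab-site actually created):

import os

# Hypothetical crawl directory created by grab-site; adjust to the real DIR.
crawl_dir = "/home/user/crawls/example.com-2018-08-28"

# Removing the control file tells wpull_hooks.py to stop scraping responses
# for new URLs; URLs already in the queue are still downloaded.
os.remove(os.path.join(crawl_dir, "scrape"))

Note that in this commit the switch is effectively one-way: update_scrape() empties the DemuxDocumentScraper's scraper list when the file is missing, and nothing in this diff repopulates that list if DIR/scrape is created again.
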
libgrabsite/__init__.py
@@ -1 +1 @@
-__version__ = '1.7.4'
+__version__ = '1.8.0'
libgrabsite/main.py
@@ -370,6 +370,9 @@ def get_base_wpull_args():
 	with open("{}/delay".format(working_dir), "w") as f:
 		f.write(delay)
+	with open("{}/scrape".format(working_dir), "w") as f:
+		pass
 	# We don't actually need to write control files for this mode to work, but the
 	# only reason to use this is if you're starting wpull manually with modified
 	# arguments, and wpull_hooks.py requires the control files.
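
The new lines in main.py only create an empty DIR/scrape file: its contents are never read, only its existence matters, hence the with block whose body is just pass. Purely as an illustration (not how main.py is written), the same presence marker could be created with pathlib:

from pathlib import Path

working_dir = "/path/to/DIR"  # hypothetical; main.py uses its own working_dir

# Create an empty marker file; later code only checks whether it exists.
Path(working_dir, "scrape").touch()
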
libgrabsite/wpull_hooks.py
@@ -269,6 +269,7 @@ def dequeued_url(url_info, record_info):
 	"max_content_length": -1,
 	"suppress_ignore_reports": True,
 	"video": True,
+	"scrape": True,
 	"concurrency": 2,
 	"bytes_downloaded": 0,
 	"items_queued": 0,
@@ -356,6 +357,19 @@ def update_video():
 update_video()
+scrape_path = os.path.join(working_dir, "scrape")
+def update_scrape():
+	scrape = path_exists_with_cache(scrape_path)
+	job_data["scrape"] = scrape
+	if not scrape:
+		# Empty the list of scrapers, which will stop scraping for new URLs
+		# but still keep going through what is already in the queue.
+		wpull_hook.factory.get('DemuxDocumentScraper')._document_scrapers = []
+update_scrape()
 def maybe_log_ignore(url, pattern):
 	update_igoff()
 	if not job_data["suppress_ignore_reports"]:
@@ -404,6 +418,8 @@ def has_video_ext(url):
 def handle_pre_response(url_info, url_record, response_info):
 	url = url_info['url']
+	update_scrape()
 	update_max_content_length()
 	if job_data["max_content_length"] != -1:
 		##pprint.pprint(response_info)
