From 9a7fb68485e2ae9b7808ab5ea1efcefa8e811ad8 Mon Sep 17 00:00:00 2001
From: Michael Lynch
Date: Sun, 8 Jul 2018 19:39:59 -0400
Subject: [PATCH] Adding ketovale

---
 README.md          |  1 +
 ketohub/spiders.py | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/README.md b/README.md
index 9c6768e..f9862fe 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ scrapy crawl hey-keto-mama -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl keto-size-me -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl ketoconnect -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl ketogasm -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
+scrapy crawl ketovale -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl ketovangelist-kitchen -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl low-carb-yum -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl queen-bs -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
diff --git a/ketohub/spiders.py b/ketohub/spiders.py
index 378e631..25f5bda 100644
--- a/ketohub/spiders.py
+++ b/ketohub/spiders.py
@@ -209,6 +209,31 @@ class KetovangelistKitchen(spiders.CrawlSpider):
     ]
 
 
+class Ketovale(spiders.CrawlSpider):
+    name = 'ketovale'
+
+    callback_handler = CallbackHandler(
+        content_saver=persist.ContentSaver(_get_download_root()))
+
+    allowed_domains = ['ketovale.com']
+    start_urls = ['https://www.ketovale.com/category/recipes/']
+
+    rules = [
+        # Extract links for finding additional recipe pages,
+        # e.g. https://www.ketovale.com/category/recipes/page/3/
+        spiders.Rule(
+            linkextractors.LinkExtractor(
+                allow=r'https://www.ketovale.com/category/recipes/page/\d+/')),
+        # Extract links for recipes.
+        spiders.Rule(
+            linkextractors.LinkExtractor(
+                allow=r'https://www.ketovale.com/recipe/.*/$',
+                restrict_xpaths='//h2[@class="entry-title"]'),
+            callback=callback_handler.process_callback,
+            follow=False),
+    ]
+
+
 class LowCarbYum(spiders.CrawlSpider):
     name = 'low-carb-yum'