diff --git a/README.md b/README.md
index 62fb30d..7ba2e6d 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ scrapy crawl ketovangelist-kitchen -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl low-carb-yum -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl queen-bs -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl ruled-me -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
+scrapy crawl skinny-taste -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl sugar-free-mom -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl wholesome-yum -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl your-friends-j -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
diff --git a/ketohub/spiders.py b/ketohub/spiders.py
index fad0ebd..de6ee00 100644
--- a/ketohub/spiders.py
+++ b/ketohub/spiders.py
@@ -296,6 +296,33 @@ class QueenBs(spiders.CrawlSpider):
     ]
 
 
+class SkinnyTaste(spiders.CrawlSpider):
+    name = 'skinny-taste'
+
+    callback_handler = CallbackHandler(
+        content_saver=persist.ContentSaver(_get_download_root()))
+
+    allowed_domains = ['skinnytaste.com']
+    start_urls = ['https://www.skinnytaste.com/recipes/keto/']
+
+    rules = [
+        # Extract links for finding additional recipe pages,
+        # e.g. https://www.skinnytaste.com/recipes/keto/page/2/
+        spiders.Rule(
+            linkextractors.LinkExtractor(
+                allow=r'skinnytaste.com/recipes/keto/page/\d+/')),
+        # Extract links for recipes.
+        spiders.Rule(
+            linkextractors.LinkExtractor(
+                allow=[
+                    r'skinnytaste.com/[^\/]+/$',
+                ],
+                restrict_xpaths='//div[@class="archives"]'),
+            callback=callback_handler.process_callback,
+            follow=False),
+    ]
+
+
 class SugarFreeMom(spiders.CrawlSpider):
     name = 'sugar-free-mom'
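
Reviewer note, not part of the diff: a minimal, standard-library-only sketch of how the two `allow` patterns in the new SkinnyTaste rules divide URLs between pagination pages (followed for more links) and recipe pages (handed to the callback). The example URLs are illustrative, and the sketch deliberately ignores the restrict_xpaths filter, which in the real spider further limits recipe links to the archives div.

import re

# Same patterns as in the SkinnyTaste rules above.
PAGINATION = re.compile(r'skinnytaste.com/recipes/keto/page/\d+/')
RECIPE = re.compile(r'skinnytaste.com/[^\/]+/$')

# Illustrative URLs; the recipe slug is hypothetical.
urls = [
    'https://www.skinnytaste.com/recipes/keto/page/2/',
    'https://www.skinnytaste.com/cauliflower-fried-rice/',
    'https://www.skinnytaste.com/recipes/keto/',
]

for url in urls:
    if PAGINATION.search(url):
        # First rule: followed for further link extraction, no callback.
        print(url, '-> pagination rule')
    elif RECIPE.search(url):
        # Second rule: passed to callback_handler.process_callback, follow=False.
        print(url, '-> recipe rule')
    else:
        # e.g. the start URL itself matches neither pattern.
        print(url, '-> no rule')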