From 96502661279a8da85839fa31bf3155aafede3d73 Mon Sep 17 00:00:00 2001
From: Michael Lynch
Date: Thu, 25 Jan 2018 20:38:05 -0500
Subject: [PATCH] Add spider for YourFriendsJ

---
 README.md          |  1 +
 ketohub/spiders.py | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/README.md b/README.md
index 4126ccd..9c6768e 100644
--- a/README.md
+++ b/README.md
@@ -16,4 +16,5 @@ scrapy crawl ketovangelist-kitchen -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl low-carb-yum -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl queen-bs -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 scrapy crawl ruled-me -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
+scrapy crawl your-friends-j -s "DOWNLOAD_ROOT=${OUTPUT_DIR}"
 ```
diff --git a/ketohub/spiders.py b/ketohub/spiders.py
index b5087fd..fa63b6f 100644
--- a/ketohub/spiders.py
+++ b/ketohub/spiders.py
@@ -262,3 +262,24 @@ class QueenBs(spiders.CrawlSpider):
                 callback=callback_handler.process_callback,
                 follow=False)
     ]
+
+
+class YourFriendsJ(spiders.CrawlSpider):
+    name = 'your-friends-j'
+
+    callback_handler = CallbackHandler(
+        content_saver=persist.ContentSaver(_get_download_root()))
+
+    allowed_domains = ['yourfriendsj.com']
+    start_urls = ['http://yourfriendsj.com/category/keto/']
+
+    rules = [
+        # Extract links for recipes,
+        # e.g. http://yourfriendsj.com/easy-guacamole-recipe/
+        # NOTE: the dot in the domain is escaped so the pattern matches the
+        # literal host only, and only single-segment paths (recipe slugs).
+        spiders.Rule(
+            linkextractors.LinkExtractor(
+                allow=r'http://yourfriendsj\.com/[^/]*/$',
+                restrict_xpaths='//div[@class="post-content"]'),
+            callback=callback_handler.process_callback,
+            follow=False)
+    ]