diff --git a/ketohub/recipe_key.py b/ketohub/recipe_key.py
new file mode 100644
index 0000000..8551eb4
--- /dev/null
+++ b/ketohub/recipe_key.py
@@ -0,0 +1,15 @@
+import re
+
+
+def from_url(url):
+    """Converts a URL to a recipe key."""
+    # Strip the scheme (http:// or https://) and an optional www. prefix.
+    # Note: (www\.)? also covers domains served without www, e.g. ruled.me.
+    url = re.sub(r'https?://(www\.)?', '', url)
+    # Strip trailing slash
+    url = re.sub(r'/$', '', url)
+    # Convert all characters to lowercase
+    url = url.lower()
+    # Replace all non a-z0-9/ characters with -
+    url = re.sub(r'[^a-z0-9/]', '-', url)
+    # Replace all / characters with _
+    return re.sub(r'/', '_', url)
diff --git a/ketohub/spiders/raw_content_spider.py b/ketohub/spiders/raw_content_spider.py
index b9dad61..2873c78 100644
--- a/ketohub/spiders/raw_content_spider.py
+++ b/ketohub/spiders/raw_content_spider.py
@@ -1,13 +1,13 @@
+import datetime
import json
import os
-import re
import urllib
-from datetime import datetime
-
from scrapy import crawler
from scrapy import spiders
+from ketohub import recipe_key
+
class Error(Exception):
"""Base Error class."""
@@ -38,38 +38,27 @@ def _write_to_file(filepath, content):
class RawContentSpider(spiders.CrawlSpider):
- """Base class to crawl keto sites and save the html and image to a local file."""
+ """Base class to crawl keto sites and save the html and image to a local file."""
name = 'raw_content'
def __init__(self):
- self._filepath_prefix = None
+ # Directory within the download root in which to place downloaded files.
+ self._download_subdir = datetime.datetime.utcnow().strftime(
+ '%Y%m%d/%H%M%SZ')
super(RawContentSpider, self).__init__()
def _get_recipe_main_image_url(self, response):
- """Returns the URL for the recipe's primary image. Unimplemented in base class."""
- pass
+ """Returns the URL for the recipe's primary image.
- def _format_recipe_key(self, url):
- """Formats the recipe key from the response url."""
- # Strip out http:// or https:// prefix and www.
- url = re.sub(r'http.://www\.', '', url)
- # Strip trailing slash
- url = re.sub(r'/$', '', url)
- # Convert all characters to lowercase
- url = url.lower()
- # Replace all non a-z0-9/ characters with -
- url = re.sub(r'[^a-z0-9/]', '-', url)
- # Replace all / characters with _
- return re.sub(r'/', '_', url)
-
- def _set_download_root(self):
- download_root = self.settings.get('DOWNLOAD_ROOT')
- if not download_root:
- raise MissingDownloadDirectory(
- 'Make sure you\'re providing a download directory.')
+ Child classes must override this method.
- self._filepath_prefix = os.path.join(
- download_root, datetime.utcnow().strftime('%Y%m%d/%H%M%SZ'))
+ Args:
+ response: Page response object.
+
+ Returns:
+ The URL for the main recipe image.
+ """
+ pass
def download_recipe_contents(self, response):
"""Parses responses from the pages of individual recipes.
@@ -80,11 +69,14 @@ def download_recipe_contents(self, response):
[download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
"""
# Build path for scraped files
- if not self._filepath_prefix:
- self._set_download_root()
+ download_root = self.settings.get('DOWNLOAD_ROOT')
+ if not download_root:
+ raise MissingDownloadDirectory(
+ 'Make sure you\'re providing a download directory.')
+
+ key = recipe_key.from_url(response.url)
- output_dir = os.path.join(self._filepath_prefix,
- self._format_recipe_key(response.url))
+ output_dir = os.path.join(download_root, self._download_subdir, key)
# Write response body to file
_write_to_file(
@@ -100,8 +92,8 @@ def download_recipe_contents(self, response):
# Find image and save it
try:
- image_location = self._get_recipe_main_image_url(response)
+ image_url = self._get_recipe_main_image_url(response)
except IndexError:
raise UnexpectedResponse('Could not extract image from page.')
- urllib.urlretrieve(image_location, os.path.join(output_dir, 'main.jpg'))
+ urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg'))
diff --git a/ketohub/spiders/ruled_me_crawl_spider.py b/ketohub/spiders/ruled_me_crawl_spider.py
index 0f47f66..e6c4a23 100644
--- a/ketohub/spiders/ruled_me_crawl_spider.py
+++ b/ketohub/spiders/ruled_me_crawl_spider.py
@@ -1,10 +1,10 @@
from scrapy import linkextractors
from scrapy import spiders
-import ketohub.spiders.raw_content_spider
+from ketohub.spiders import raw_content_spider
-class RuledMeCrawlSpider(ketohub.spiders.raw_content_spider.RawContentSpider):
+class RuledMeCrawlSpider(raw_content_spider.RawContentSpider):
"""Spider to crawl keto sites and save the html and image to a local file for each recipe."""
name = 'ruled_me_raw_content'
allowed_domains = ['ruled.me']
diff --git a/tests/test_ketoconnect_crawl_spider.py b/tests/test_ketoconnect_crawl_spider.py
index f3f636b..f77024a 100644
--- a/tests/test_ketoconnect_crawl_spider.py
+++ b/tests/test_ketoconnect_crawl_spider.py
@@ -8,7 +8,7 @@ class KetoconnectCrawlSpiderTest(
tests.test_raw_content_spider.RawContentSpiderTest):
"""Test case for the ketoconnect_raw_content spider."""
- def test_get_recipe_main_image_url__returns_second_image(self):
+ def test_get_recipe_main_image_url_returns_second_image(self):
"""Tests that the correct second image is extracted."""
file_content = (
""
@@ -20,10 +20,10 @@ def test_get_recipe_main_image_url__returns_second_image(self):
body=file_content)
spider = ketoconnect_crawl_spider.KetoconnectCrawlSpider()
- spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
+ spider.settings = self.mock_settings
spider.download_recipe_contents(response)
# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
- '/foo/download/root/20170102/030405Z/foo-com/main.jpg')
+ 'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
diff --git a/tests/test_raw_content_spider.py b/tests/test_raw_content_spider.py
index a494690..85def7a 100644
--- a/tests/test_raw_content_spider.py
+++ b/tests/test_raw_content_spider.py
@@ -1,3 +1,4 @@
+import datetime
import unittest
import mock
@@ -16,6 +17,14 @@ def setUp(self):
self.addCleanup(mock_urllib.stop)
self.urllib_patch = mock_urllib.start()
+ self.mock_start_scrape_time = datetime.datetime(
+ year=2017, month=1, day=2, hour=3, minute=4, second=5)
+ mock_datetime = mock.patch(
+ 'ketohub.spiders.raw_content_spider.datetime.datetime')
+ self.addCleanup(mock_datetime.stop)
+ datetime_patch = mock_datetime.start()
+ datetime_patch.utcnow.return_value = self.mock_start_scrape_time
+
mock_write_to_file = mock.patch(
'ketohub.spiders.raw_content_spider._write_to_file')
self.addCleanup(mock_write_to_file.stop)
@@ -27,6 +36,9 @@ def setUp(self):
self.addCleanup(mock_get_recipe_main_image.stop)
self.get_image_patch = mock_get_recipe_main_image.start()
+ self.mock_settings = mock.Mock(spec=['get'])
+ self.mock_settings.get.return_value = 'dummy_download_root'
+
def test_download_recipe_contents_with_a_simple_response(self):
"""Tests that download_recipe_contents works as expected for a simple response."""
response = http.TextResponse(
@@ -36,20 +48,20 @@ def test_download_recipe_contents_with_a_simple_response(self):
self.get_image_patch.return_value = 'test_image.jpg'
spider = raw_content_spider.RawContentSpider()
- spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
+ spider.settings = self.mock_settings
spider.download_recipe_contents(response)
self.write_to_file_patch.assert_has_calls([
- mock.call('/foo/download/root/20170102/030405Z/foo-com/index.html',
+ mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html',
''),
mock.call(
- '/foo/download/root/20170102/030405Z/foo-com/metadata.json',
+ 'dummy_download_root/20170102/030405Z/foo-com/metadata.json',
'{\n "url":"https://www.foo.com"\n}')
])
self.urllib_patch.assert_called_with(
'test_image.jpg',
- '/foo/download/root/20170102/030405Z/foo-com/main.jpg')
+ 'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
def test_download_recipe_contents_with_an_empty_response(self):
"""Tests that download recipe contents raises an error on an empty response."""
@@ -60,7 +72,7 @@ def test_download_recipe_contents_with_an_empty_response(self):
self.get_image_patch.side_effect = IndexError
spider = raw_content_spider.RawContentSpider()
- spider._filepath_prefix = '/mock/download/root//20170102/030405Z'
+ spider.settings = self.mock_settings
with self.assertRaises(raw_content_spider.UnexpectedResponse):
spider.download_recipe_contents(response)
@@ -79,11 +91,3 @@ def test_that_undefined_download_folder_location_raises_error(self):
with self.assertRaises(raw_content_spider.MissingDownloadDirectory):
spider.download_recipe_contents(response)
-
- def test_format_recipe_key_with_simple_url(self):
- """Tests that _format_recipe_key returns an the recipe key as expected."""
- spider = raw_content_spider.RawContentSpider()
- actual_key = spider._format_recipe_key(
- 'https://www.mock.com/Mikes_Chicken_Kiev/')
-
- self.assertEqual(actual_key, 'mock-com_mikes-chicken-kiev')
diff --git a/tests/test_recipe_key.py b/tests/test_recipe_key.py
new file mode 100644
index 0000000..d62cce4
--- /dev/null
+++ b/tests/test_recipe_key.py
@@ -0,0 +1,11 @@
+import unittest
+
+from ketohub import recipe_key
+
+
+class RecipeKeyTest(unittest.TestCase):
+
+    def test_from_url_replaces_correct_characters(self):
+        self.assertEqual(
+            recipe_key.from_url('https://www.mock.com/Mikes_Chicken_Kiev/'),
+            'mock-com_mikes-chicken-kiev')
diff --git a/tests/test_ruled_me_crawl_spider.py b/tests/test_ruled_me_crawl_spider.py
index b168798..4511b80 100644
--- a/tests/test_ruled_me_crawl_spider.py
+++ b/tests/test_ruled_me_crawl_spider.py
@@ -20,10 +20,10 @@ def test_get_recipe_main_image_url_returns_first_image(self):
body=file_content)
spider = ruled_me_crawl_spider.RuledMeCrawlSpider()
- spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
+ spider.settings = self.mock_settings
spider.download_recipe_contents(response)
# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
- '/foo/download/root/20170102/030405Z/foo-com/main.jpg')
+ 'dummy_download_root/20170102/030405Z/foo-com/main.jpg')