import re


def from_url(url):
    """Converts a URL to a recipe key.

    The key is built by removing the URL's scheme and leading "www.",
    dropping one trailing slash, lowercasing the result, replacing every
    character other than a-z, 0-9 and "/" with "-", and finally replacing
    every "/" with "_".

    Args:
        url: URL of a recipe page, e.g.
            'https://www.mock.com/Mikes_Chicken_Kiev/'.

    Returns:
        The recipe key as a string, e.g. 'mock-com_mikes-chicken-kiev'.
    """
    # Strip out http:// or https:// prefix and www.
    #
    # The scheme is matched with "https?" rather than "http." because "."
    # demands exactly one extra character, which made plain http:// URLs slip
    # through unstripped. The pattern is anchored so it can only remove a
    # leading prefix, "www." is optional so bare domains lose their scheme
    # too, and matching ignores case because the lowercasing step only runs
    # afterwards.
    key = re.sub(r'^https?://(?:www\.)?', '', url, flags=re.IGNORECASE)
    # Strip trailing slash
    key = re.sub(r'/$', '', key)
    # Convert all characters to lowercase
    key = key.lower()
    # Replace all non a-z0-9/ characters with -
    key = re.sub(r'[^a-z0-9/]', '-', key)
    # Replace all / characters with _
    return key.replace('/', '_')
- url = re.sub(r'http.://www\.', '', url) - # Strip trailing slash - url = re.sub(r'/$', '', url) - # Convert all characters to lowercase - url = url.lower() - # Replace all non a-z0-9/ characters with - - url = re.sub(r'[^a-z0-9/]', '-', url) - # Replace all / characters with _ - return re.sub(r'/', '_', url) - - def _set_download_root(self): - download_root = self.settings.get('DOWNLOAD_ROOT') - if not download_root: - raise MissingDownloadDirectory( - 'Make sure you\'re providing a download directory.') + Child classes must override this method. - self._filepath_prefix = os.path.join( - download_root, datetime.utcnow().strftime('%Y%m%d/%H%M%SZ')) + Args: + response: Page response object. + + Returns: + The URL for the main recipe image. + """ + pass def download_recipe_contents(self, response): """Parses responses from the pages of individual recipes. @@ -80,11 +69,14 @@ def download_recipe_contents(self, response): [download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/ """ # Build path for scraped files - if not self._filepath_prefix: - self._set_download_root() + download_root = self.settings.get('DOWNLOAD_ROOT') + if not download_root: + raise MissingDownloadDirectory( + 'Make sure you\'re providing a download directory.') + + key = recipe_key.from_url(response.url) - output_dir = os.path.join(self._filepath_prefix, - self._format_recipe_key(response.url)) + output_dir = os.path.join(download_root, self._download_subdir, key) # Write response body to file _write_to_file( @@ -100,8 +92,8 @@ def download_recipe_contents(self, response): # Find image and save it try: - image_location = self._get_recipe_main_image_url(response) + image_url = self._get_recipe_main_image_url(response) except IndexError: raise UnexpectedResponse('Could not extract image from page.') - urllib.urlretrieve(image_location, os.path.join(output_dir, 'main.jpg')) + urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg')) diff --git 
a/ketohub/spiders/ruled_me_crawl_spider.py b/ketohub/spiders/ruled_me_crawl_spider.py index 0f47f66..e6c4a23 100644 --- a/ketohub/spiders/ruled_me_crawl_spider.py +++ b/ketohub/spiders/ruled_me_crawl_spider.py @@ -1,10 +1,10 @@ from scrapy import linkextractors from scrapy import spiders -import ketohub.spiders.raw_content_spider +from ketohub.spiders import raw_content_spider -class RuledMeCrawlSpider(ketohub.spiders.raw_content_spider.RawContentSpider): +class RuledMeCrawlSpider(raw_content_spider.RawContentSpider): """Spider to crawl keto sites and save the html and image to a local file for each recipe.""" name = 'ruled_me_raw_content' allowed_domains = ['ruled.me'] diff --git a/tests/test_ketoconnect_crawl_spider.py b/tests/test_ketoconnect_crawl_spider.py index f3f636b..f77024a 100644 --- a/tests/test_ketoconnect_crawl_spider.py +++ b/tests/test_ketoconnect_crawl_spider.py @@ -8,7 +8,7 @@ class KetoconnectCrawlSpiderTest( tests.test_raw_content_spider.RawContentSpiderTest): """Test case for the ketoconnect_raw_content spider.""" - def test_get_recipe_main_image_url__returns_second_image(self): + def test_get_recipe_main_image_url_returns_second_image(self): """Tests that the correct second image is extracted.""" file_content = ( "" @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url__returns_second_image(self): body=file_content) spider = ketoconnect_crawl_spider.KetoconnectCrawlSpider() - spider._filepath_prefix = '/foo/download/root/20170102/030405Z' + spider.settings = self.mock_settings spider.download_recipe_contents(response) # Make sure _write_to_file is called with correct arguments from get_recipe_main_image self.urllib_patch.assert_called_with( 'images/right_image.jpg', - '/foo/download/root/20170102/030405Z/foo-com/main.jpg') + 'dummy_download_root/20170102/030405Z/foo-com/main.jpg') diff --git a/tests/test_raw_content_spider.py b/tests/test_raw_content_spider.py index a494690..85def7a 100644 --- a/tests/test_raw_content_spider.py +++ 
b/tests/test_raw_content_spider.py @@ -1,3 +1,4 @@ +import datetime import unittest import mock @@ -16,6 +17,14 @@ def setUp(self): self.addCleanup(mock_urllib.stop) self.urllib_patch = mock_urllib.start() + self.mock_start_scrape_time = datetime.datetime( + year=2017, month=1, day=2, hour=3, minute=4, second=5) + mock_datetime = mock.patch( + 'ketohub.spiders.raw_content_spider.datetime.datetime') + self.addCleanup(mock_datetime.stop) + datetime_patch = mock_datetime.start() + datetime_patch.utcnow.return_value = self.mock_start_scrape_time + mock_write_to_file = mock.patch( 'ketohub.spiders.raw_content_spider._write_to_file') self.addCleanup(mock_write_to_file.stop) @@ -27,6 +36,9 @@ def setUp(self): self.addCleanup(mock_get_recipe_main_image.stop) self.get_image_patch = mock_get_recipe_main_image.start() + self.mock_settings = mock.Mock(spec=['get']) + self.mock_settings.get.return_value = 'dummy_download_root' + def test_download_recipe_contents_with_a_simple_response(self): """Tests that download_recipe_contents works as expected for a simple response.""" response = http.TextResponse( @@ -36,20 +48,20 @@ def test_download_recipe_contents_with_a_simple_response(self): self.get_image_patch.return_value = 'test_image.jpg' spider = raw_content_spider.RawContentSpider() - spider._filepath_prefix = '/foo/download/root/20170102/030405Z' + spider.settings = self.mock_settings spider.download_recipe_contents(response) self.write_to_file_patch.assert_has_calls([ - mock.call('/foo/download/root/20170102/030405Z/foo-com/index.html', + mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html', ''), mock.call( - '/foo/download/root/20170102/030405Z/foo-com/metadata.json', + 'dummy_download_root/20170102/030405Z/foo-com/metadata.json', '{\n "url":"https://www.foo.com"\n}') ]) self.urllib_patch.assert_called_with( 'test_image.jpg', - '/foo/download/root/20170102/030405Z/foo-com/main.jpg') + 'dummy_download_root/20170102/030405Z/foo-com/main.jpg') def 
test_download_recipe_contents_with_an_empty_response(self): """Tests that download recipe contents raises an error on an empty response.""" @@ -60,7 +72,7 @@ def test_download_recipe_contents_with_an_empty_response(self): self.get_image_patch.side_effect = IndexError spider = raw_content_spider.RawContentSpider() - spider._filepath_prefix = '/mock/download/root//20170102/030405Z' + spider.settings = self.mock_settings with self.assertRaises(raw_content_spider.UnexpectedResponse): spider.download_recipe_contents(response) @@ -79,11 +91,3 @@ def test_that_undefined_download_folder_location_raises_error(self): with self.assertRaises(raw_content_spider.MissingDownloadDirectory): spider.download_recipe_contents(response) - - def test_format_recipe_key_with_simple_url(self): - """Tests that _format_recipe_key returns an the recipe key as expected.""" - spider = raw_content_spider.RawContentSpider() - actual_key = spider._format_recipe_key( - 'https://www.mock.com/Mikes_Chicken_Kiev/') - - self.assertEqual(actual_key, 'mock-com_mikes-chicken-kiev') diff --git a/tests/test_recipe_key.py b/tests/test_recipe_key.py new file mode 100644 index 0000000..d62cce4 --- /dev/null +++ b/tests/test_recipe_key.py @@ -0,0 +1,11 @@ +import unittest + +from ketohub import recipe_key + + +class RawContentSpiderTest(unittest.TestCase): + + def test_from_url_replaces_correct_characters(self): + self.assertEqual( + recipe_key.from_url('https://www.mock.com/Mikes_Chicken_Kiev/'), + 'mock-com_mikes-chicken-kiev') diff --git a/tests/test_ruled_me_crawl_spider.py b/tests/test_ruled_me_crawl_spider.py index b168798..4511b80 100644 --- a/tests/test_ruled_me_crawl_spider.py +++ b/tests/test_ruled_me_crawl_spider.py @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url_returns_first_image(self): body=file_content) spider = ruled_me_crawl_spider.RuledMeCrawlSpider() - spider._filepath_prefix = '/foo/download/root/20170102/030405Z' + spider.settings = self.mock_settings 
spider.download_recipe_contents(response) # Make sure _write_to_file is called with correct arguments from get_recipe_main_image self.urllib_patch.assert_called_with( 'images/right_image.jpg', - '/foo/download/root/20170102/030405Z/foo-com/main.jpg') + 'dummy_download_root/20170102/030405Z/foo-com/main.jpg')