
Commit

Refactoring save logic into ContentSaver
mtlynch committed Sep 29, 2017
1 parent 08125de commit aab7969
Showing 6 changed files with 121 additions and 63 deletions.
36 changes: 36 additions & 0 deletions ketohub/persist.py
@@ -0,0 +1,36 @@
import json
import os


def _ensure_directory_exists(directory_path):
"""Ensures the directories in directory_path exist."""
if not os.path.exists(directory_path):
os.makedirs(directory_path)


def _write_to_file(filepath, content):
"""writes content to a local file."""
_ensure_directory_exists(os.path.dirname(filepath))
open(filepath, 'wb').write(content)


class ContentSaver(object):
"""Saves recipe content to disk."""

def __init__(self, root, write_file_fn=_write_to_file):
self._root = root
self._write_file_fn = write_file_fn

def save_metadata(self, metadata):
self._write_file_fn(
self._output_path('metadata.json'),
json.dumps(metadata, indent=4, separators=(',', ':')))

def save_recipe_html(self, recipe_html):
self._write_file_fn(self._output_path('index.html'), recipe_html)

def save_main_image(self, main_image_data):
self._write_file_fn(self._output_path('main.jpg'), main_image_data)

def _output_path(self, filename):
return os.path.join(self._root, filename)
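
For reference, a minimal usage sketch (not part of the commit): ContentSaver's writer is injectable via write_file_fn, so a hypothetical in-memory fake_write can capture all three artifacts without touching disk.

    from ketohub import persist

    saved = {}

    def fake_write(filepath, content):
        # Hypothetical test double: record writes in memory instead of on disk.
        saved[filepath] = content

    saver = persist.ContentSaver('downloads/demo', write_file_fn=fake_write)
    saver.save_recipe_html('<html></html>')              # -> downloads/demo/index.html
    saver.save_metadata({'url': 'https://example.com'})  # -> downloads/demo/metadata.json
    saver.save_main_image('raw jpeg bytes')              # -> downloads/demo/main.jpg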
51 changes: 17 additions & 34 deletions ketohub/spiders/raw_content_spider.py
@@ -6,6 +6,7 @@
from scrapy import crawler
from scrapy import spiders

from ketohub import persist
from ketohub import recipe_key


@@ -24,19 +25,6 @@ class MissingDownloadDirectory(Error):
pass


def _ensure_directory_exists(directory_path):
"""Ensures the directories in directory_path exist."""
if os.path.exists(directory_path):
return True
os.makedirs(directory_path)


def _write_to_file(filepath, content):
"""Writes content to a local file."""
_ensure_directory_exists(os.path.dirname(filepath))
open(filepath, 'w').write(content)


class RawContentSpider(spiders.CrawlSpider):
"""Base class to crawl keto sites and save the html and image to a local file."""
name = 'raw_content'
@@ -60,40 +48,35 @@ def _get_recipe_main_image_url(self, response):
"""
pass

def download_recipe_contents(self, response):
"""Parses responses from the pages of individual recipes.
Saves a recipe image as main.jpg and page html as index.html for each recipe page link
extracted. Each recipe is saved in a location that follows this schema:
[download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
"""
# Build path for scraped files
def _make_content_saver(self, url):
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')

key = recipe_key.from_url(response.url)
key = recipe_key.from_url(url)

output_dir = os.path.join(download_root, self._download_subdir, key)
return persist.ContentSaver(output_dir)

    def download_recipe_contents(self, response):
        """Parses responses from the pages of individual recipes.

        Saves a recipe image as main.jpg and page html as index.html for each recipe page link
        extracted. Each recipe is saved in a location that follows this schema:

        [download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
        """
        # Write response body to file
        _write_to_file(
            os.path.join(output_dir, 'index.html'),
            response.text.encode('utf8'))

        # Write url to metadata file
        _write_to_file(
            os.path.join(output_dir, 'metadata.json'),
            json.dumps({
                'url': response.url
            }, indent=4, separators=(',', ':')))

        content_saver = self._make_content_saver(response.url)
        content_saver.save_recipe_html(response.text.encode('utf8'))
        content_saver.save_metadata({'url': response.url})

        # Find image and save it
        try:
            image_url = self._get_recipe_main_image_url(response)
        except IndexError:
            raise UnexpectedResponse('Could not extract image from page.')

        urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg'))
        image_handle = urllib.urlopen(image_url)
        content_saver.save_main_image(image_handle.read())
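
Pieced together, the refactored flow reads roughly like this (an illustrative sketch, not code from the commit; spider and response stand in for a configured RawContentSpider subclass and a live scrapy response, and a Python 2 runtime is assumed since the code calls urllib.urlopen):

    import urllib

    saver = spider._make_content_saver(response.url)  # ContentSaver rooted at
                                                      # [download_root]/[subdir]/[key]
    saver.save_recipe_html(response.text.encode('utf8'))     # -> index.html
    saver.save_metadata({'url': response.url})               # -> metadata.json
    image_url = spider._get_recipe_main_image_url(response)  # may raise IndexError
    saver.save_main_image(urllib.urlopen(image_url).read())  # -> main.jpg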
5 changes: 1 addition & 4 deletions tests/test_ketoconnect_crawl_spider.py
@@ -23,7 +23,4 @@ def test_get_recipe_main_image_url_returns_second_image(self):
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.urlopen_patch.assert_called_with('images/right_image.jpg')
41 changes: 41 additions & 0 deletions tests/test_persist.py
@@ -0,0 +1,41 @@
import unittest

import mock

from ketohub import persist


class PersistTest(unittest.TestCase):

def setUp(self):
self.mock_write_to_file_fn = mock.Mock()

def test_save_recipe_html_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_recipe_html('<html>Mock HTML</html>')

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/index.html', '<html>Mock HTML</html>'),
])

def test_save_main_image_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_main_image('dummy image data')

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/main.jpg', 'dummy image data'),
])

def test_save_metadata_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_metadata({'dummy_key': 'dummy value'})

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/metadata.json', """
{
"dummy_key":"dummy value"
}""".strip()),
])
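
The expected string in the metadata test falls out of json.dumps with the exact arguments ContentSaver passes; a quick illustration (not part of the commit):

    import json

    # indent=4 puts each key on its own indented line; separators=(',', ':')
    # drops the space that json.dumps normally prints after a colon.
    print(json.dumps({'dummy_key': 'dummy value'}, indent=4, separators=(',', ':')))
    # {
    #     "dummy_key":"dummy value"
    # }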
46 changes: 25 additions & 21 deletions tests/test_raw_content_spider.py
@@ -1,4 +1,5 @@
import datetime
import io
import unittest

import mock
@@ -11,11 +12,10 @@ class RawContentSpiderTest(unittest.TestCase):
"""Test case for the raw_content spider."""

def setUp(self):
mock_urllib = mock.patch(
'ketohub.spiders.raw_content_spider.urllib.urlretrieve',
autospec=True)
self.addCleanup(mock_urllib.stop)
self.urllib_patch = mock_urllib.start()
mock_urlopen = mock.patch(
'ketohub.spiders.raw_content_spider.urllib.urlopen', autospec=True)
self.addCleanup(mock_urlopen.stop)
self.urlopen_patch = mock_urlopen.start()

self.mock_start_scrape_time = datetime.datetime(
year=2017, month=1, day=2, hour=3, minute=4, second=5)
@@ -25,10 +25,12 @@ def setUp(self):
datetime_patch = mock_datetime.start()
datetime_patch.utcnow.return_value = self.mock_start_scrape_time

mock_write_to_file = mock.patch(
'ketohub.spiders.raw_content_spider._write_to_file')
self.addCleanup(mock_write_to_file.stop)
self.write_to_file_patch = mock_write_to_file.start()
mock_content_saver = mock.patch(
'ketohub.spiders.raw_content_spider.persist.ContentSaver')
self.addCleanup(mock_content_saver.stop)
self.content_saver_patch = mock_content_saver.start()
self.mock_saver = mock.Mock()
self.content_saver_patch.return_value = self.mock_saver

mock_get_recipe_main_image = mock.patch(
'ketohub.spiders.raw_content_spider.RawContentSpider._get_recipe_main_image_url'
@@ -46,22 +48,24 @@ def test_download_recipe_contents_with_a_simple_response(self):
request=http.Request('https://www.foo.com'),
body='<html></html>')

self.get_image_patch.return_value = 'test_image.jpg'
self.get_image_patch.return_value = 'https://mock.com/test_image.jpg'
self.urlopen_patch.return_value = io.BytesIO('dummy image data')
spider = raw_content_spider.RawContentSpider()
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

self.write_to_file_patch.assert_has_calls([
mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html',
'<html></html>'),
mock.call(
'dummy_download_root/20170102/030405Z/foo-com/metadata.json',
'{\n "url":"https://www.foo.com"\n}')
])

self.urllib_patch.assert_called_with(
'test_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.content_saver_patch.assert_called_once_with(
'dummy_download_root/20170102/030405Z/foo-com')
self.mock_saver.save_recipe_html.assert_called_once_with(
'<html></html>')
self.mock_saver.save_metadata.assert_called_once_with({
'url':
'https://www.foo.com',
})
self.mock_saver.save_main_image.assert_called_once_with(
'dummy image data')

self.urlopen_patch.assert_called_with('https://mock.com/test_image.jpg')

def test_download_recipe_contents_with_an_empty_response(self):
"""Tests that download recipe contents raises an error on an empty response."""
5 changes: 1 addition & 4 deletions tests/test_ruled_me_crawl_spider.py
@@ -23,7 +23,4 @@ def test_get_recipe_main_image_url_returns_first_image(self):
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.urlopen_patch.assert_called_once_with('images/right_image.jpg')
