Skip to content

Commit

Permalink
Refactoring
Browse files Browse the repository at this point in the history
Moving recipe key derivation to its own module.
Tweaking tests so that they depend less on the spiders' internals.
  • Loading branch information
mtlynch committed Sep 28, 2017
1 parent af7e892 commit 49120af
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 53 deletions.
15 changes: 15 additions & 0 deletions ketohub/recipe_key.py
@@ -0,0 +1,15 @@
import re


def from_url(url):
    """Converts a URL to a recipe key.

    The key is a filesystem- and ID-friendly slug derived from the URL,
    e.g. 'https://www.mock.com/Mikes_Chicken_Kiev/' ->
    'mock-com_mikes-chicken-kiev'.

    Args:
        url: Recipe page URL, e.g. 'https://www.foo.com/My_Recipe/'.

    Returns:
        The recipe key as a string.
    """
    # Strip the http:// or https:// scheme and a leading www., if present.
    # (The previous pattern r'http.://www\.' required exactly one character
    # between 'http' and '://', so plain http:// URLs were never stripped,
    # and URLs without 'www.' kept their scheme.)
    url = re.sub(r'^https?://(www\.)?', '', url)
    # Strip trailing slash so '/foo/' and '/foo' produce the same key.
    url = re.sub(r'/$', '', url)
    # Convert all characters to lowercase.
    url = url.lower()
    # Replace all characters other than a-z, 0-9, and / with -.
    url = re.sub(r'[^a-z0-9/]', '-', url)
    # Replace all / characters with _.
    return re.sub(r'/', '_', url)
58 changes: 25 additions & 33 deletions ketohub/spiders/raw_content_spider.py
@@ -1,13 +1,13 @@
import datetime
import json
import os
import re
import urllib

from datetime import datetime

from scrapy import crawler
from scrapy import spiders

from ketohub import recipe_key


class Error(Exception):
    """Base class for exceptions raised by this module."""
Expand Down Expand Up @@ -38,38 +38,27 @@ def _write_to_file(filepath, content):


class RawContentSpider(spiders.CrawlSpider):
"""Base class to crawl keto sites and save the html and image to a local file."""
"""Base class to crawl keto sites and save the html and image to a local file."""
name = 'raw_content'

def __init__(self):
self._filepath_prefix = None
# Directory within the download root in which to place downloaded files.
self._download_subdir = datetime.datetime.utcnow().strftime(
'%Y%m%d/%H%M%SZ')
super(RawContentSpider, self).__init__()

def _get_recipe_main_image_url(self, response):
"""Returns the URL for the recipe's primary image. Unimplemented in base class."""
pass
"""Returns the URL for the recipe's primary image.
def _format_recipe_key(self, url):
"""Formats the recipe key from the response url."""
# Strip out http:// or https:// prefix and www.
url = re.sub(r'http.://www\.', '', url)
# Strip trailing slash
url = re.sub(r'/$', '', url)
# Convert all characters to lowercase
url = url.lower()
# Replace all non a-z0-9/ characters with -
url = re.sub(r'[^a-z0-9/]', '-', url)
# Replace all / characters with _
return re.sub(r'/', '_', url)

def _set_download_root(self):
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')
Child classes must override this method.
self._filepath_prefix = os.path.join(
download_root, datetime.utcnow().strftime('%Y%m%d/%H%M%SZ'))
Args:
response: Page response object.
Returns:
The URL for the main recipe image.
"""
pass

def download_recipe_contents(self, response):
"""Parses responses from the pages of individual recipes.
Expand All @@ -80,11 +69,14 @@ def download_recipe_contents(self, response):
[download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
"""
# Build path for scraped files
if not self._filepath_prefix:
self._set_download_root()
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')

key = recipe_key.from_url(response.url)

output_dir = os.path.join(self._filepath_prefix,
self._format_recipe_key(response.url))
output_dir = os.path.join(download_root, self._download_subdir, key)

# Write response body to file
_write_to_file(
Expand All @@ -100,8 +92,8 @@ def download_recipe_contents(self, response):

# Find image and save it
try:
image_location = self._get_recipe_main_image_url(response)
image_url = self._get_recipe_main_image_url(response)
except IndexError:
raise UnexpectedResponse('Could not extract image from page.')

urllib.urlretrieve(image_location, os.path.join(output_dir, 'main.jpg'))
urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg'))
4 changes: 2 additions & 2 deletions ketohub/spiders/ruled_me_crawl_spider.py
@@ -1,10 +1,10 @@
from scrapy import linkextractors
from scrapy import spiders

import ketohub.spiders.raw_content_spider
from ketohub.spiders import raw_content_spider


class RuledMeCrawlSpider(ketohub.spiders.raw_content_spider.RawContentSpider):
class RuledMeCrawlSpider(raw_content_spider.RawContentSpider):
"""Spider to crawl keto sites and save the html and image to a local file for each recipe."""
name = 'ruled_me_raw_content'
allowed_domains = ['ruled.me']
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ketoconnect_crawl_spider.py
Expand Up @@ -8,7 +8,7 @@ class KetoconnectCrawlSpiderTest(
tests.test_raw_content_spider.RawContentSpiderTest):
"""Test case for the ketoconnect_raw_content spider."""

def test_get_recipe_main_image_url__returns_second_image(self):
def test_get_recipe_main_image_url_returns_second_image(self):
"""Tests that the correct second image is extracted."""
file_content = (
"<html><img src='images/wrong_image.jpg'><img src='images/right_image.jpg'></html>"
Expand All @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url__returns_second_image(self):
body=file_content)

spider = ketoconnect_crawl_spider.KetoconnectCrawlSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
30 changes: 17 additions & 13 deletions tests/test_raw_content_spider.py
@@ -1,3 +1,4 @@
import datetime
import unittest

import mock
Expand All @@ -16,6 +17,14 @@ def setUp(self):
self.addCleanup(mock_urllib.stop)
self.urllib_patch = mock_urllib.start()

self.mock_start_scrape_time = datetime.datetime(
year=2017, month=1, day=2, hour=3, minute=4, second=5)
mock_datetime = mock.patch(
'ketohub.spiders.raw_content_spider.datetime.datetime')
self.addCleanup(mock_datetime.stop)
datetime_patch = mock_datetime.start()
datetime_patch.utcnow.return_value = self.mock_start_scrape_time

mock_write_to_file = mock.patch(
'ketohub.spiders.raw_content_spider._write_to_file')
self.addCleanup(mock_write_to_file.stop)
Expand All @@ -27,6 +36,9 @@ def setUp(self):
self.addCleanup(mock_get_recipe_main_image.stop)
self.get_image_patch = mock_get_recipe_main_image.start()

self.mock_settings = mock.Mock(spec=['get'])
self.mock_settings.get.return_value = 'dummy_download_root'

def test_download_recipe_contents_with_a_simple_response(self):
"""Tests that download_recipe_contents works as expected for a simple response."""
response = http.TextResponse(
Expand All @@ -36,20 +48,20 @@ def test_download_recipe_contents_with_a_simple_response(self):

self.get_image_patch.return_value = 'test_image.jpg'
spider = raw_content_spider.RawContentSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

self.write_to_file_patch.assert_has_calls([
mock.call('/foo/download/root/20170102/030405Z/foo-com/index.html',
mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html',
'<html></html>'),
mock.call(
'/foo/download/root/20170102/030405Z/foo-com/metadata.json',
'dummy_download_root/20170102/030405Z/foo-com/metadata.json',
'{\n "url":"https://www.foo.com"\n}')
])

self.urllib_patch.assert_called_with(
'test_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')

def test_download_recipe_contents_with_an_empty_response(self):
"""Tests that download recipe contents raises an error on an empty response."""
Expand All @@ -60,7 +72,7 @@ def test_download_recipe_contents_with_an_empty_response(self):

self.get_image_patch.side_effect = IndexError
spider = raw_content_spider.RawContentSpider()
spider._filepath_prefix = '/mock/download/root//20170102/030405Z'
spider.settings = self.mock_settings

with self.assertRaises(raw_content_spider.UnexpectedResponse):
spider.download_recipe_contents(response)
Expand All @@ -79,11 +91,3 @@ def test_that_undefined_download_folder_location_raises_error(self):

with self.assertRaises(raw_content_spider.MissingDownloadDirectory):
spider.download_recipe_contents(response)

def test_format_recipe_key_with_simple_url(self):
"""Tests that _format_recipe_key returns an the recipe key as expected."""
spider = raw_content_spider.RawContentSpider()
actual_key = spider._format_recipe_key(
'https://www.mock.com/Mikes_Chicken_Kiev/')

self.assertEqual(actual_key, 'mock-com_mikes-chicken-kiev')
11 changes: 11 additions & 0 deletions tests/test_recipe_key.py
@@ -0,0 +1,11 @@
import unittest

from ketohub import recipe_key


class RawContentSpiderTest(unittest.TestCase):
    """Tests for recipe_key.from_url.

    NOTE(review): the class name looks copy-pasted from
    test_raw_content_spider.py; consider renaming to RecipeKeyTest to
    match the module under test. Name kept here so any external test
    selection by name keeps working.
    """

    def test_from_url_replaces_correct_characters(self):
        self.assertEqual(
            recipe_key.from_url('https://www.mock.com/Mikes_Chicken_Kiev/'),
            'mock-com_mikes-chicken-kiev')

    def test_from_url_lowercases_and_strips_trailing_slash(self):
        self.assertEqual(
            recipe_key.from_url('https://www.foo.com/Recipe/'),
            'foo-com_recipe')

    def test_from_url_with_no_path_yields_domain_only_key(self):
        self.assertEqual(
            recipe_key.from_url('https://www.foo.com'), 'foo-com')
4 changes: 2 additions & 2 deletions tests/test_ruled_me_crawl_spider.py
Expand Up @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url_returns_first_image(self):
body=file_content)

spider = ruled_me_crawl_spider.RuledMeCrawlSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')

0 comments on commit 49120af

Please sign in to comment.