Skip to content

Commit

Permalink
Refactoring
Browse files Browse the repository at this point in the history
Moving recipe key derivation to its own module.
Tweaking tests so that they depend less on the spiders' internals.
  • Loading branch information
mtlynch committed Sep 28, 2017
1 parent af7e892 commit 49120af
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 53 deletions.
15 changes: 15 additions & 0 deletions ketohub/recipe_key.py
@@ -0,0 +1,15 @@
import re


def from_url(url):
    """Converts a URL to a recipe key.

    The key is a filesystem- and ID-friendly slug derived from the URL,
    e.g. 'https://www.mock.com/Mikes_Chicken_Kiev/' ->
    'mock-com_mikes-chicken-kiev'.

    Args:
        url: Recipe page URL, e.g. 'https://www.foo.com/My_Recipe/'.

    Returns:
        The recipe key as a string.
    """
    # Strip the http:// or https:// scheme and a leading www., if present.
    # (The previous pattern r'http.://www\.' required exactly one character
    # between 'http' and '://', so plain http:// URLs were never stripped,
    # and URLs without 'www.' kept their scheme.)
    url = re.sub(r'^https?://(www\.)?', '', url)
    # Strip trailing slash so '/foo/' and '/foo' produce the same key.
    url = re.sub(r'/$', '', url)
    # Convert all characters to lowercase.
    url = url.lower()
    # Replace all characters other than a-z, 0-9, and / with -.
    url = re.sub(r'[^a-z0-9/]', '-', url)
    # Replace all / characters with _.
    return re.sub(r'/', '_', url)
58 changes: 25 additions & 33 deletions ketohub/spiders/raw_content_spider.py
@@ -1,13 +1,13 @@
import datetime
import json
import os
import re
import urllib

from datetime import datetime

from scrapy import crawler
from scrapy import spiders

from ketohub import recipe_key


class Error(Exception):
    """Base class for exceptions raised by this module."""
Expand Down Expand Up @@ -38,38 +38,27 @@ def _write_to_file(filepath, content):


class RawContentSpider(spiders.CrawlSpider):
"""Base class to crawl keto sites and save the html and image to a local file."""
"""Base class to crawl keto sites and save the html and image to a local file."""
name = 'raw_content'

def __init__(self):
self._filepath_prefix = None
# Directory within the download root in which to place downloaded files.
self._download_subdir = datetime.datetime.utcnow().strftime(
'%Y%m%d/%H%M%SZ')
super(RawContentSpider, self).__init__()

def _get_recipe_main_image_url(self, response):
"""Returns the URL for the recipe's primary image. Unimplemented in base class."""
pass
"""Returns the URL for the recipe's primary image.
def _format_recipe_key(self, url):
"""Formats the recipe key from the response url."""
# Strip out http:// or https:// prefix and www.
url = re.sub(r'http.://www\.', '', url)
# Strip trailing slash
url = re.sub(r'/$', '', url)
# Convert all characters to lowercase
url = url.lower()
# Replace all non a-z0-9/ characters with -
url = re.sub(r'[^a-z0-9/]', '-', url)
# Replace all / characters with _
return re.sub(r'/', '_', url)

def _set_download_root(self):
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')
Child classes must override this method.
self._filepath_prefix = os.path.join(
download_root, datetime.utcnow().strftime('%Y%m%d/%H%M%SZ'))
Args:
response: Page response object.
Returns:
The URL for the main recipe image.
"""
pass

def download_recipe_contents(self, response):
"""Parses responses from the pages of individual recipes.
Expand All @@ -80,11 +69,14 @@ def download_recipe_contents(self, response):
[download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
"""
# Build path for scraped files
if not self._filepath_prefix:
self._set_download_root()
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')

key = recipe_key.from_url(response.url)

output_dir = os.path.join(self._filepath_prefix,
self._format_recipe_key(response.url))
output_dir = os.path.join(download_root, self._download_subdir, key)

# Write response body to file
_write_to_file(
Expand All @@ -100,8 +92,8 @@ def download_recipe_contents(self, response):

# Find image and save it
try:
image_location = self._get_recipe_main_image_url(response)
image_url = self._get_recipe_main_image_url(response)
except IndexError:
raise UnexpectedResponse('Could not extract image from page.')

urllib.urlretrieve(image_location, os.path.join(output_dir, 'main.jpg'))
urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg'))
4 changes: 2 additions & 2 deletions ketohub/spiders/ruled_me_crawl_spider.py
@@ -1,10 +1,10 @@
from scrapy import linkextractors
from scrapy import spiders

import ketohub.spiders.raw_content_spider
from ketohub.spiders import raw_content_spider


class RuledMeCrawlSpider(ketohub.spiders.raw_content_spider.RawContentSpider):
class RuledMeCrawlSpider(raw_content_spider.RawContentSpider):
"""Spider to crawl keto sites and save the html and image to a local file for each recipe."""
name = 'ruled_me_raw_content'
allowed_domains = ['ruled.me']
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ketoconnect_crawl_spider.py
Expand Up @@ -8,7 +8,7 @@ class KetoconnectCrawlSpiderTest(
tests.test_raw_content_spider.RawContentSpiderTest):
"""Test case for the ketoconnect_raw_content spider."""

def test_get_recipe_main_image_url__returns_second_image(self):
def test_get_recipe_main_image_url_returns_second_image(self):
"""Tests that the correct second image is extracted."""
file_content = (
"<html><img src='images/wrong_image.jpg'><img src='images/right_image.jpg'></html>"
Expand All @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url__returns_second_image(self):
body=file_content)

spider = ketoconnect_crawl_spider.KetoconnectCrawlSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
30 changes: 17 additions & 13 deletions tests/test_raw_content_spider.py
@@ -1,3 +1,4 @@
import datetime
import unittest

import mock
Expand All @@ -16,6 +17,14 @@ def setUp(self):
self.addCleanup(mock_urllib.stop)
self.urllib_patch = mock_urllib.start()

self.mock_start_scrape_time = datetime.datetime(
year=2017, month=1, day=2, hour=3, minute=4, second=5)
mock_datetime = mock.patch(
'ketohub.spiders.raw_content_spider.datetime.datetime')
self.addCleanup(mock_datetime.stop)
datetime_patch = mock_datetime.start()
datetime_patch.utcnow.return_value = self.mock_start_scrape_time

mock_write_to_file = mock.patch(
'ketohub.spiders.raw_content_spider._write_to_file')
self.addCleanup(mock_write_to_file.stop)
Expand All @@ -27,6 +36,9 @@ def setUp(self):
self.addCleanup(mock_get_recipe_main_image.stop)
self.get_image_patch = mock_get_recipe_main_image.start()

self.mock_settings = mock.Mock(spec=['get'])
self.mock_settings.get.return_value = 'dummy_download_root'

def test_download_recipe_contents_with_a_simple_response(self):
"""Tests that download_recipe_contents works as expected for a simple response."""
response = http.TextResponse(
Expand All @@ -36,20 +48,20 @@ def test_download_recipe_contents_with_a_simple_response(self):

self.get_image_patch.return_value = 'test_image.jpg'
spider = raw_content_spider.RawContentSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

self.write_to_file_patch.assert_has_calls([
mock.call('/foo/download/root/20170102/030405Z/foo-com/index.html',
mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html',
'<html></html>'),
mock.call(
'/foo/download/root/20170102/030405Z/foo-com/metadata.json',
'dummy_download_root/20170102/030405Z/foo-com/metadata.json',
'{\n "url":"https://www.foo.com"\n}')
])

self.urllib_patch.assert_called_with(
'test_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')

def test_download_recipe_contents_with_an_empty_response(self):
"""Tests that download recipe contents raises an error on an empty response."""
Expand All @@ -60,7 +72,7 @@ def test_download_recipe_contents_with_an_empty_response(self):

self.get_image_patch.side_effect = IndexError
spider = raw_content_spider.RawContentSpider()
spider._filepath_prefix = '/mock/download/root//20170102/030405Z'
spider.settings = self.mock_settings

with self.assertRaises(raw_content_spider.UnexpectedResponse):
spider.download_recipe_contents(response)
Expand All @@ -79,11 +91,3 @@ def test_that_undefined_download_folder_location_raises_error(self):

with self.assertRaises(raw_content_spider.MissingDownloadDirectory):
spider.download_recipe_contents(response)

def test_format_recipe_key_with_simple_url(self):
"""Tests that _format_recipe_key returns an the recipe key as expected."""
spider = raw_content_spider.RawContentSpider()
actual_key = spider._format_recipe_key(
'https://www.mock.com/Mikes_Chicken_Kiev/')

self.assertEqual(actual_key, 'mock-com_mikes-chicken-kiev')
11 changes: 11 additions & 0 deletions tests/test_recipe_key.py
@@ -0,0 +1,11 @@
import unittest

from ketohub import recipe_key


class RawContentSpiderTest(unittest.TestCase):
    """Tests for recipe_key.from_url.

    NOTE(review): the class name looks copy-pasted from
    test_raw_content_spider.py; consider renaming to RecipeKeyTest to
    match the module under test. Name kept here so any external test
    selection by name keeps working.
    """

    def test_from_url_replaces_correct_characters(self):
        self.assertEqual(
            recipe_key.from_url('https://www.mock.com/Mikes_Chicken_Kiev/'),
            'mock-com_mikes-chicken-kiev')

    def test_from_url_lowercases_and_strips_trailing_slash(self):
        self.assertEqual(
            recipe_key.from_url('https://www.foo.com/Recipe/'),
            'foo-com_recipe')

    def test_from_url_with_no_path_yields_domain_only_key(self):
        self.assertEqual(
            recipe_key.from_url('https://www.foo.com'), 'foo-com')
4 changes: 2 additions & 2 deletions tests/test_ruled_me_crawl_spider.py
Expand Up @@ -20,10 +20,10 @@ def test_get_recipe_main_image_url_returns_first_image(self):
body=file_content)

spider = ruled_me_crawl_spider.RuledMeCrawlSpider()
spider._filepath_prefix = '/foo/download/root/20170102/030405Z'
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'/foo/download/root/20170102/030405Z/foo-com/main.jpg')
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')

0 comments on commit 49120af

Please sign in to comment.