
Commit

Refactoring save logic into ContentSaver
mtlynch committed Sep 29, 2017
1 parent 08125de commit aab7969
Showing 6 changed files with 121 additions and 63 deletions.
36 changes: 36 additions & 0 deletions ketohub/persist.py
@@ -0,0 +1,36 @@
import json
import os


def _ensure_directory_exists(directory_path):
"""Ensures the directories in directory_path exist."""
if not os.path.exists(directory_path):
os.makedirs(directory_path)


def _write_to_file(filepath, content):
"""writes content to a local file."""
_ensure_directory_exists(os.path.dirname(filepath))
open(filepath, 'wb').write(content)


class ContentSaver(object):
"""Saves recipe content to disk."""

def __init__(self, root, write_file_fn=_write_to_file):
self._root = root
self._write_file_fn = write_file_fn

def save_metadata(self, metadata):
self._write_file_fn(
self._output_path('metadata.json'),
json.dumps(metadata, indent=4, separators=(',', ':')))

def save_recipe_html(self, recipe_html):
self._write_file_fn(self._output_path('index.html'), recipe_html)

def save_main_image(self, main_image_data):
self._write_file_fn(self._output_path('main.jpg'), main_image_data)

def _output_path(self, filename):
return os.path.join(self._root, filename)
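
For reference, a minimal usage sketch (not part of the commit): ContentSaver's writer is injectable via write_file_fn, so a hypothetical in-memory fake_write can capture all three artifacts without touching disk.

    from ketohub import persist

    saved = {}

    def fake_write(filepath, content):
        # Hypothetical test double: record writes in memory instead of on disk.
        saved[filepath] = content

    saver = persist.ContentSaver('downloads/demo', write_file_fn=fake_write)
    saver.save_recipe_html('<html></html>')              # -> downloads/demo/index.html
    saver.save_metadata({'url': 'https://example.com'})  # -> downloads/demo/metadata.json
    saver.save_main_image('raw jpeg bytes')              # -> downloads/demo/main.jpg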
51 changes: 17 additions & 34 deletions ketohub/spiders/raw_content_spider.py
@@ -6,6 +6,7 @@
from scrapy import crawler
from scrapy import spiders

from ketohub import persist
from ketohub import recipe_key


@@ -24,19 +25,6 @@ class MissingDownloadDirectory(Error):
pass


def _ensure_directory_exists(directory_path):
"""Ensures the directories in directory_path exist."""
if os.path.exists(directory_path):
return True
os.makedirs(directory_path)


def _write_to_file(filepath, content):
"""Writes content to a local file."""
_ensure_directory_exists(os.path.dirname(filepath))
open(filepath, 'w').write(content)


class RawContentSpider(spiders.CrawlSpider):
"""Base class to crawl keto sites and save the html and image to a local file."""
name = 'raw_content'
@@ -60,40 +48,35 @@ def _get_recipe_main_image_url(self, response):
"""
pass

def download_recipe_contents(self, response):
"""Parses responses from the pages of individual recipes.
Saves a recipe image as main.jpg and page html as index.html for each recipe page link
extracted. Each recipe is saved in a location that follows this schema:
[download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
"""
# Build path for scraped files
def _make_content_saver(self, url):
download_root = self.settings.get('DOWNLOAD_ROOT')
if not download_root:
raise MissingDownloadDirectory(
'Make sure you\'re providing a download directory.')

key = recipe_key.from_url(response.url)
key = recipe_key.from_url(url)

output_dir = os.path.join(download_root, self._download_subdir, key)
return persist.ContentSaver(output_dir)

    def download_recipe_contents(self, response):
        """Parses responses from the pages of individual recipes.

        Saves a recipe image as main.jpg and page html as index.html for each recipe page link
        extracted. Each recipe is saved in a location that follows this schema:

        [download_root]/YYYYMMDD/hhmmssZ/[source_domain]/[relative_url]/
        """
        # Write response body to file
        _write_to_file(
            os.path.join(output_dir, 'index.html'),
            response.text.encode('utf8'))

        # Write url to metadata file
        _write_to_file(
            os.path.join(output_dir, 'metadata.json'),
            json.dumps({
                'url': response.url
            }, indent=4, separators=(',', ':')))

        content_saver = self._make_content_saver(response.url)
        content_saver.save_recipe_html(response.text.encode('utf8'))
        content_saver.save_metadata({'url': response.url})

        # Find image and save it
        try:
            image_url = self._get_recipe_main_image_url(response)
        except IndexError:
            raise UnexpectedResponse('Could not extract image from page.')

        urllib.urlretrieve(image_url, os.path.join(output_dir, 'main.jpg'))
        image_handle = urllib.urlopen(image_url)
        content_saver.save_main_image(image_handle.read())
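
Pieced together, the refactored flow reads roughly like this (an illustrative sketch, not code from the commit; spider and response stand in for a configured RawContentSpider subclass and a live scrapy response, and a Python 2 runtime is assumed since the code calls urllib.urlopen):

    import urllib

    saver = spider._make_content_saver(response.url)  # ContentSaver rooted at
                                                      # [download_root]/[subdir]/[key]
    saver.save_recipe_html(response.text.encode('utf8'))     # -> index.html
    saver.save_metadata({'url': response.url})               # -> metadata.json
    image_url = spider._get_recipe_main_image_url(response)  # may raise IndexError
    saver.save_main_image(urllib.urlopen(image_url).read())  # -> main.jpg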
5 changes: 1 addition & 4 deletions tests/test_ketoconnect_crawl_spider.py
@@ -23,7 +23,4 @@ def test_get_recipe_main_image_url_returns_second_image(self):
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.urlopen_patch.assert_called_with('images/right_image.jpg')
41 changes: 41 additions & 0 deletions tests/test_persist.py
@@ -0,0 +1,41 @@
import unittest

import mock

from ketohub import persist


class PersistTest(unittest.TestCase):

def setUp(self):
self.mock_write_to_file_fn = mock.Mock()

def test_save_recipe_html_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_recipe_html('<html>Mock HTML</html>')

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/index.html', '<html>Mock HTML</html>'),
])

def test_save_main_image_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_main_image('dummy image data')

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/main.jpg', 'dummy image data'),
])

def test_save_metadata_saves_to_correct_file(self):
saver = persist.ContentSaver('downloads/foo',
self.mock_write_to_file_fn)
saver.save_metadata({'dummy_key': 'dummy value'})

self.mock_write_to_file_fn.assert_has_calls([
mock.call('downloads/foo/metadata.json', """
{
"dummy_key":"dummy value"
}""".strip()),
])
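
The expected string in the metadata test falls out of json.dumps with the exact arguments ContentSaver passes; a quick illustration (not part of the commit):

    import json

    # indent=4 puts each key on its own indented line; separators=(',', ':')
    # drops the space that json.dumps normally prints after a colon.
    print(json.dumps({'dummy_key': 'dummy value'}, indent=4, separators=(',', ':')))
    # {
    #     "dummy_key":"dummy value"
    # }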
46 changes: 25 additions & 21 deletions tests/test_raw_content_spider.py
@@ -1,4 +1,5 @@
import datetime
import io
import unittest

import mock
@@ -11,11 +12,10 @@ class RawContentSpiderTest(unittest.TestCase):
"""Test case for the raw_content spider."""

def setUp(self):
mock_urllib = mock.patch(
'ketohub.spiders.raw_content_spider.urllib.urlretrieve',
autospec=True)
self.addCleanup(mock_urllib.stop)
self.urllib_patch = mock_urllib.start()
mock_urlopen = mock.patch(
'ketohub.spiders.raw_content_spider.urllib.urlopen', autospec=True)
self.addCleanup(mock_urlopen.stop)
self.urlopen_patch = mock_urlopen.start()

self.mock_start_scrape_time = datetime.datetime(
year=2017, month=1, day=2, hour=3, minute=4, second=5)
@@ -25,10 +25,12 @@ def setUp(self):
datetime_patch = mock_datetime.start()
datetime_patch.utcnow.return_value = self.mock_start_scrape_time

mock_write_to_file = mock.patch(
'ketohub.spiders.raw_content_spider._write_to_file')
self.addCleanup(mock_write_to_file.stop)
self.write_to_file_patch = mock_write_to_file.start()
mock_content_saver = mock.patch(
'ketohub.spiders.raw_content_spider.persist.ContentSaver')
self.addCleanup(mock_content_saver.stop)
self.content_saver_patch = mock_content_saver.start()
self.mock_saver = mock.Mock()
self.content_saver_patch.return_value = self.mock_saver

mock_get_recipe_main_image = mock.patch(
'ketohub.spiders.raw_content_spider.RawContentSpider._get_recipe_main_image_url'
@@ -46,22 +48,24 @@ def test_download_recipe_contents_with_a_simple_response(self):
request=http.Request('https://www.foo.com'),
body='<html></html>')

self.get_image_patch.return_value = 'test_image.jpg'
self.get_image_patch.return_value = 'https://mock.com/test_image.jpg'
self.urlopen_patch.return_value = io.BytesIO('dummy image data')
spider = raw_content_spider.RawContentSpider()
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

self.write_to_file_patch.assert_has_calls([
mock.call('dummy_download_root/20170102/030405Z/foo-com/index.html',
'<html></html>'),
mock.call(
'dummy_download_root/20170102/030405Z/foo-com/metadata.json',
'{\n "url":"https://www.foo.com"\n}')
])

self.urllib_patch.assert_called_with(
'test_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.content_saver_patch.assert_called_once_with(
'dummy_download_root/20170102/030405Z/foo-com')
self.mock_saver.save_recipe_html.assert_called_once_with(
'<html></html>')
self.mock_saver.save_metadata.assert_called_once_with({
'url':
'https://www.foo.com',
})
self.mock_saver.save_main_image.assert_called_once_with(
'dummy image data')

self.urlopen_patch.assert_called_with('https://mock.com/test_image.jpg')

def test_download_recipe_contents_with_an_empty_response(self):
"""Tests that download recipe contents raises an error on an empty response."""
5 changes: 1 addition & 4 deletions tests/test_ruled_me_crawl_spider.py
@@ -23,7 +23,4 @@ def test_get_recipe_main_image_url_returns_first_image(self):
spider.settings = self.mock_settings
spider.download_recipe_contents(response)

# Make sure _write_to_file is called with correct arguments from get_recipe_main_image
self.urllib_patch.assert_called_with(
'images/right_image.jpg',
'dummy_download_root/20170102/030405Z/foo-com/main.jpg')
self.urlopen_patch.assert_called_once_with('images/right_image.jpg')
