Skip to content

Commit

Permalink
Refactoring downloader to separate module
Browse files Browse the repository at this point in the history
  • Loading branch information
mtlynch committed Oct 17, 2017
1 parent 3832db7 commit 4e724c0
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 39 deletions.
31 changes: 1 addition & 30 deletions downloader/downloader.py
Expand Up @@ -9,14 +9,6 @@
logger = logging.getLogger(__name__)


class Error(Exception):
    """Base class for exceptions defined in this module."""


class UnexpectedImageType(Error):
    """Raised when a downloaded resource is not served as image/jpeg."""


def configure_logging():
root_logger = logging.getLogger()
handler = logging.StreamHandler()
Expand Down Expand Up @@ -46,22 +38,6 @@ def _write_to_file(filepath, content):
open(filepath, 'wb').write(content)


def _download_image_data(image_url):
    """Downloads the JPEG image at the given URL.

    The request is sent with a desktop Chrome User-Agent header.

    Args:
        image_url: URL of the image to download.

    Returns:
        The raw response body as returned by the handle's read().

    Raises:
        UnexpectedImageType: The response's content type is not image/jpeg.

    Errors raised by urllib2.urlopen (such as urllib2.HTTPError) are not
    caught here and propagate to the caller.
    """
    request = urllib2.Request(
        image_url,
        headers={
            'User-Agent':
            ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
             'Safari/537.36')
        })
    image_handle = urllib2.urlopen(request)
    try:
        content_type = image_handle.info().type
        if content_type != 'image/jpeg':
            raise UnexpectedImageType('Expected image/jpeg, got ' +
                                      content_type)
        return image_handle.read()
    finally:
        # Release the connection on both the success and the error path.
        image_handle.close()


def _download_image_urls(url_dict, output_root):
download_queue = Queue.Queue()

Expand All @@ -88,7 +64,7 @@ def _download_image_urls(url_dict, output_root):

try:
_write_to_file(item['destination'],
_download_image_data(item['url']))
url.download_image_data(item['url']))
download_delay = max(download_delay - 1.0, 2.0)
except urllib2.HTTPError as e:
logger.warn('Got error trying to download %s: %s', item['url'], e)
Expand All @@ -102,13 +78,8 @@ def _download_image_urls(url_dict, output_root):
time.sleep(download_delay)


def dummy():
    """Does nothing and returns None."""
    pass


def main(args):
configure_logging()
dummy()
with open(args.input_file) as input_file:
url_dict = _parse_input_file(input_file)
logger.info('Read %d input URLs', len(url_dict))
Expand Down
22 changes: 22 additions & 0 deletions downloader/url.py
@@ -0,0 +1,22 @@
import urllib2

# User-Agent header value sent with every image request made by this module;
# it identifies the client as desktop Chrome 61 on 64-bit Windows 10.
# NOTE(review): presumably some image hosts reject urllib2's default agent --
# confirm before changing or removing.
_USER_AGENT = (
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/61.0.3163.100 Safari/537.36')


class Error(Exception):
    """Base class for exceptions defined in this module."""


class UnexpectedImageType(Error):
    """Raised when a downloaded resource is not served as image/jpeg."""


def download_image_data(image_url):
    """Downloads the JPEG image at the given URL.

    The request is sent with the module's _USER_AGENT as its User-Agent
    header.

    Args:
        image_url: URL of the image to download.

    Returns:
        The raw response body as returned by the handle's read().

    Raises:
        UnexpectedImageType: The response's content type is not image/jpeg.

    Errors raised by urllib2.urlopen (such as urllib2.HTTPError) are not
    caught here and propagate to the caller.
    """
    request = urllib2.Request(image_url, headers={'User-Agent': _USER_AGENT})
    image_handle = urllib2.urlopen(request)
    try:
        content_type = image_handle.info().type
        if content_type != 'image/jpeg':
            raise UnexpectedImageType('Expected image/jpeg, got ' +
                                      content_type)
        return image_handle.read()
    finally:
        # Release the connection on both the success and the error path.
        image_handle.close()
9 changes: 0 additions & 9 deletions tests/test_downloader.py

This file was deleted.

43 changes: 43 additions & 0 deletions tests/test_url.py
@@ -0,0 +1,43 @@
import unittest
import urllib2

import mock

from downloader import url


class ImagesTest(unittest.TestCase):
    """Tests url.download_image_data against mocked urllib2 calls."""

    def setUp(self):
        # Patch urlopen and Request through the downloader.url module so that
        # no test performs real network I/O; addCleanup undoes each patch
        # after the test finishes.
        mock_urlopen_patch = mock.patch(
            'downloader.url.urllib2.urlopen', autospec=True)
        self.addCleanup(mock_urlopen_patch.stop)
        self.mock_urlopen = mock_urlopen_patch.start()

        mock_request_patch = mock.patch(
            'downloader.url.urllib2.Request', autospec=True)
        self.addCleanup(mock_request_patch.stop)
        self.mock_request = mock_request_patch.start()

    def test_download_succeeds_when_server_is_ok(self):
        """A JPEG response yields the body returned by read()."""
        mock_handle = mock.Mock()
        mock_handle.info.return_value = mock.Mock(type='image/jpeg')
        mock_handle.read.return_value = 'dummy image data'
        self.mock_urlopen.return_value = mock_handle
        self.assertEqual(
            url.download_image_data('http://mock.com/image.jpg'),
            'dummy image data')

    def test_download_fails_when_server_returns_403(self):
        """An HTTPError raised by urlopen propagates to the caller."""
        # The simulated status code matches the 403 named by this test.
        self.mock_urlopen.side_effect = urllib2.HTTPError(
            url='', code=403, msg='', hdrs=None, fp=None)
        with self.assertRaises(urllib2.HTTPError):
            url.download_image_data('http://mock.com/image.jpg')

    def test_download_fails_when_content_type_is_not_jpeg(self):
        """A non-JPEG content type raises UnexpectedImageType."""
        mock_handle = mock.Mock()
        mock_handle.info.return_value = mock.Mock(type='image/png')
        mock_handle.read.return_value = 'dummy image data'
        self.mock_urlopen.return_value = mock_handle
        with self.assertRaises(url.UnexpectedImageType):
            url.download_image_data('http://mock.com/image.jpg')

0 comments on commit 4e724c0

Please sign in to comment.