Move image downloading out of downloader/downloader.py into downloader/url.py.

* The private helper _download_image_data becomes the public
  url.download_image_data, and the Error / UnexpectedImageType exception
  classes move with it.
* The placeholder dummy() function and its test (tests/test_downloader.py)
  are removed.
* tests/test_url.py adds unit tests for url.download_image_data with
  urllib2.urlopen and urllib2.Request mocked out.

NOTE(review): no hunk below adds an import of the new `url` module to
downloader/downloader.py (the first hunk starts at line 9 on both sides, so
lines 1-8 are untouched), yet that file now calls url.download_image_data().
Confirm the import exists; otherwise the call raises NameError at runtime.

diff --git a/downloader/downloader.py b/downloader/downloader.py
index 478522d..f905156 100644
--- a/downloader/downloader.py
+++ b/downloader/downloader.py
@@ -9,14 +9,6 @@
 logger = logging.getLogger(__name__)
 
 
-class Error(Exception):
-    pass
-
-
-class UnexpectedImageType(Error):
-    pass
-
-
 def configure_logging():
     root_logger = logging.getLogger()
     handler = logging.StreamHandler()
@@ -46,22 +38,6 @@ def _write_to_file(filepath, content):
     open(filepath, 'wb').write(content)
 
 
-def _download_image_data(image_url):
-    image_handle = urllib2.urlopen(
-        urllib2.Request(
-            image_url,
-            headers={
-                'User-Agent':
-                ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-                 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
-                 'Safari/537.36')
-            }))
-    if image_handle.info().type != 'image/jpeg':
-        raise UnexpectedImageType('Expected image/jpeg, got ' +
-                                  image_handle.info().type)
-    return image_handle.read()
-
-
 def _download_image_urls(url_dict, output_root):
     download_queue = Queue.Queue()
 
@@ -88,7 +64,7 @@ def _download_image_urls(url_dict, output_root):
 
         try:
             _write_to_file(item['destination'],
-                           _download_image_data(item['url']))
+                           url.download_image_data(item['url']))
             download_delay = max(download_delay - 1.0, 2.0)
         except urllib2.HTTPError as e:
             logger.warn('Got error trying to download %s: %s', item['url'], e)
@@ -102,13 +78,8 @@ def _download_image_urls(url_dict, output_root):
         time.sleep(download_delay)
 
 
-def dummy():
-    pass
-
-
 def main(args):
     configure_logging()
-    dummy()
     with open(args.input_file) as input_file:
         url_dict = _parse_input_file(input_file)
     logger.info('Read %d input URLs', len(url_dict))
diff --git a/downloader/url.py b/downloader/url.py
new file mode 100644
index 0000000..ee5cd86
--- /dev/null
+++ b/downloader/url.py
@@ -0,0 +1,22 @@
+import urllib2
+
+_USER_AGENT = (
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/61.0.3163.100 Safari/537.36')
+
+
+class Error(Exception):
+    pass
+
+
+class UnexpectedImageType(Error):
+    pass
+
+
+def download_image_data(image_url):
+    image_handle = urllib2.urlopen(
+        urllib2.Request(image_url, headers={'User-Agent': _USER_AGENT}))
+    if image_handle.info().type != 'image/jpeg':
+        raise UnexpectedImageType('Expected image/jpeg, got ' +
+                                  image_handle.info().type)
+    return image_handle.read()
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
deleted file mode 100644
index 1a09541..0000000
--- a/tests/test_downloader.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import unittest
-
-from downloader import downloader
-
-
-class DownloaderTest(unittest.TestCase):
-
-    def test_dummy(self):
-        downloader.dummy()
diff --git a/tests/test_url.py b/tests/test_url.py
new file mode 100644
index 0000000..8cad493
--- /dev/null
+++ b/tests/test_url.py
@@ -0,0 +1,43 @@
+import unittest
+import urllib2
+
+import mock
+
+from downloader import url
+
+
+class ImagesTest(unittest.TestCase):
+
+    def setUp(self):
+        mock_urlopen_patch = mock.patch(
+            'downloader.url.urllib2.urlopen', autospec=True)
+        self.addCleanup(mock_urlopen_patch.stop)
+        self.mock_urlopen = mock_urlopen_patch.start()
+
+        mock_request_patch = mock.patch(
+            'downloader.url.urllib2.Request', autospec=True)
+        self.addCleanup(mock_request_patch.stop)
+        self.mock_request = mock_request_patch.start()
+
+    def test_download_succeeds_when_server_is_ok(self):
+        mock_handle = mock.Mock()
+        mock_handle.info.return_value = mock.Mock(type='image/jpeg')
+        mock_handle.read.return_value = 'dummy image data'
+        self.mock_urlopen.return_value = mock_handle
+        self.assertEqual(
+            url.download_image_data('http://mock.com/image.jpg'),
+            'dummy image data')
+
+    def test_download_fails_when_server_returns_403(self):
+        self.mock_urlopen.side_effect = urllib2.HTTPError(
+            url='', code=403, msg='', hdrs=None, fp=None)
+        with self.assertRaises(urllib2.HTTPError):
+            url.download_image_data('http://mock.com/image.jpg')
+
+    def test_download_fails_when_content_type_is_not_jpeg(self):
+        mock_handle = mock.Mock()
+        mock_handle.info.return_value = mock.Mock(type='image/png')
+        mock_handle.read.return_value = 'dummy image data'
+        self.mock_urlopen.return_value = mock_handle
+        with self.assertRaises(url.UnexpectedImageType):
+            url.download_image_data('http://mock.com/image.jpg')