From 1c43f1953f0d577603d4649dfc8e8b7d9f03a97d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 13 Aug 2018 12:30:03 +0900 Subject: [PATCH] Use built-in `open` whenever possible (#208) * added integration test * Resolving issue #207: use built-in open whenever possible * rename integration test using correct issue number * update unit tests * isolate failing test case * Updated unit tests Changed the mock target to keep in step with the shortcut_open function. Split tests into separate test cases. * respond to review comments - use tempfile instead of hard-coded file - integrate new test into travis.yml so that Travis runs it * added support for buffering parameter --- .travis.yml | 2 + integration-tests/test_207.py | 35 ++++++++ smart_open/smart_open_lib.py | 19 ++++- smart_open/tests/test_smart_open.py | 119 +++++++++++++++++++--------- 4 files changed, 137 insertions(+), 38 deletions(-) create mode 100644 integration-tests/test_207.py diff --git a/.travis.yml b/.travis.yml index 55260e60..4ef4911d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -54,6 +54,8 @@ script: aws s3 cp .benchmarks/*/*.json ${SO_S3_RESULT_URL}; aws s3 rm --recursive $SO_S3_URL; fi + - pip install numpy + - py.test integration-tests/test_207.py cache: diff --git a/integration-tests/test_207.py b/integration-tests/test_207.py new file mode 100644 index 00000000..887bceab --- /dev/null +++ b/integration-tests/test_207.py @@ -0,0 +1,35 @@ +import os +import sys +import tempfile + +try: + import numpy as np +except ImportError: + print("You really need numpy to proceed with this test") + sys.exit(1) + +import smart_open + + +def tofile(): + dt = np.dtype([('time', [('min', int), ('sec', int)]), ('temp', float)]) + x = np.zeros((1,), dtype=dt) + + with tempfile.NamedTemporaryFile(prefix='test_207', suffix='.dat', delete=False) as fout: + x.tofile(fout.name) + return fout.name + + +def test(): + try: + path = tofile() + with smart_open.smart_open(path, 'rb') as fin: + loaded = np.fromfile(fin) + return 0 + finally: + os.unlink(path) + return 1 + + +if __name__ == '__main__': + sys.exit(test()) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 6657a5c5..6f20f4f1 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -274,6 +274,13 @@ def _shortcut_open(uri, mode, **kw): if extension in ('.gz', '.bz2') and not ignore_extension: return None + # + # https://docs.python.org/2/library/functions.html#open + # + # buffering: 0: off; 1: on; negative number: use system default + # + buffering = kw.get('buffering', -1) + open_kwargs = {} errors = kw.get('errors') if errors is not None: @@ -284,7 +291,17 @@ def _shortcut_open(uri, mode, **kw): open_kwargs['encoding'] = encoding mode = mode.replace('b', '') - return io.open(parsed_uri.uri_path, mode, **open_kwargs) + # + # Under Py3, the built-in open accepts kwargs, and it's OK to use that. + # Under Py2, the built-in open _doesn't_ accept kwargs, but we still use it + # whenever possible (see issue #207). If we're under Py2 and have to use + # kwargs, then we have no option other to use io.open. + # + if six.PY3: + return open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs) + elif not open_kwargs: + return open(parsed_uri.uri_path, mode, buffering=buffering) + return io.open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs) def _open_binary_stream(uri, mode, **kw): diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index af3c0a41..aede45cf 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -13,6 +13,7 @@ import os import sys import hashlib +import unittest import boto3 import mock @@ -195,6 +196,14 @@ def test_http_bz2(self): self.assertEqual(smart_open_object.read(), test_string) +# +# What exactly to patch here differs on _how_ we're opening the file. +# See the _shortcut_open function for details. +# +_IO_OPEN = 'io.open' +_BUILTIN_OPEN = 'smart_open.smart_open_lib.open' + + class SmartOpenReadTest(unittest.TestCase): """ Test reading from files under various schemes. @@ -203,9 +212,9 @@ class SmartOpenReadTest(unittest.TestCase): def test_shortcut(self): fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt') - with mock.patch('io.open') as mock_open: + with mock.patch('smart_open.smart_open_lib.open') as mock_open: smart_open.smart_open(fpath, 'r').read() - mock_open.assert_called_with(fpath, 'r') + mock_open.assert_called_with(fpath, 'r', buffering=-1) def test_open_with_keywords(self): """This test captures Issue #142.""" @@ -311,7 +320,7 @@ def test_s3_iter_lines(self): self.assertEqual(b''.join(output), test_string) # TODO: add more complex test for file:// - @mock.patch('io.open') + @mock.patch('smart_open.smart_open_lib.open') def test_file(self, mock_smart_open): """Is file:// line iterator called correctly?""" prefix = "file://" @@ -320,29 +329,52 @@ def test_file(self, mock_smart_open): smart_open_object = smart_open.smart_open(prefix+full_path, read_mode) smart_open_object.__iter__() # called with the correct path? - mock_smart_open.assert_called_with(full_path, read_mode) + mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) full_path = '/tmp/test#hash##more.txt' read_mode = "rb" smart_open_object = smart_open.smart_open(prefix+full_path, read_mode) smart_open_object.__iter__() # called with the correct path? - mock_smart_open.assert_called_with(full_path, read_mode) + mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) full_path = 'aa#aa' read_mode = "rb" smart_open_object = smart_open.smart_open(full_path, read_mode) smart_open_object.__iter__() # called with the correct path? - mock_smart_open.assert_called_with(full_path, read_mode) + mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) + + short_path = "~/tmp/test.txt" + full_path = os.path.expanduser(short_path) + @mock.patch(_IO_OPEN if six.PY2 else _BUILTIN_OPEN) + def test_file_errors(self, mock_smart_open): + prefix = "file://" + full_path = '/tmp/test.txt' + read_mode = "r" short_path = "~/tmp/test.txt" full_path = os.path.expanduser(short_path) smart_open_object = smart_open.smart_open(prefix+short_path, read_mode, errors='strict') smart_open_object.__iter__() # called with the correct expanded path? - mock_smart_open.assert_called_with(full_path, read_mode, errors='strict') + mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1, errors='strict') + + @mock.patch(_BUILTIN_OPEN) + def test_file_buffering(self, mock_smart_open): + smart_open_object = smart_open.smart_open('/tmp/somefile', 'rb', buffering=0) + smart_open_object.__iter__() + # called with the correct expanded path? + mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0) + + @unittest.skip('smart_open does not currently accept additional positional args') + @mock.patch(_BUILTIN_OPEN) + def test_file_buffering2(self, mock_smart_open): + smart_open_object = smart_open.smart_open('/tmp/somefile', 'rb', 0) + smart_open_object.__iter__() + # called with the correct expanded path? + mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0) # couldn't find any project for mocking up HDFS data # TODO: we want to test also a content of the files, not just fnc call params @@ -534,17 +566,14 @@ class SmartOpenTest(unittest.TestCase): Test reading and writing from/into files. """ - @mock.patch('smart_open.smart_open_lib.boto') - def test_file_mode_mock(self, mock_boto): - """Are file:// open modes passed correctly?""" - as_text = u'куда идём мы с пятачком - большой большой секрет' - as_bytes = as_text.encode('utf-8') - - # incorrect file mode - self.assertRaises( - NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x" - ) + def setUp(self): + self.as_text = u'куда идём мы с пятачком - большой большой секрет' + self.as_bytes = self.as_text.encode('utf-8') + self.stringio = io.StringIO(self.as_text) + self.bytesio = io.BytesIO(self.as_bytes) + def test_file_mode_mock(self): + """Are file:// open modes passed correctly?""" # correct read modes # # We always open files in binary mode first, but engage @@ -552,47 +581,63 @@ def test_file_mode_mock(self, mock_boto): # _initially_ got opened, we now also check the end result: if the # contents got decoded correctly. # - with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open: + + def test_text(self): + patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN + with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "r", encoding='utf-8') as fin: - self.assertEqual(fin.read(), as_text) - mock_open.assert_called_with("blah", "r", encoding='utf-8') + self.assertEqual(fin.read(), self.as_text) + mock_open.assert_called_with("blah", "r", buffering=-1, encoding='utf-8') - with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open: + def test_binary(self): + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open: with smart_open.smart_open("blah", "rb") as fin: - self.assertEqual(fin.read(), as_bytes) - mock_open.assert_called_with("blah", "rb") + self.assertEqual(fin.read(), self.as_bytes) + mock_open.assert_called_with("blah", "rb", buffering=-1) + def test_expanded_path(self): short_path = "~/blah" full_path = os.path.expanduser(short_path) - with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open: + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open(short_path, "rb") as fin: - mock_open.assert_called_with(full_path, "rb") + mock_open.assert_called_with(full_path, "rb", buffering=-1) + + def test_incorrect(self): + # incorrect file mode + self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x") # correct write modes, incorrect scheme self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+") self.assertRaises(NotImplementedError, smart_open.smart_open, "http:///blah.txt", "w") self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+") + def test_write_utf8(self): # correct write mode, correct file:// URI - with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open: + patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN + with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("blah", "w", encoding='utf-8') as fout: - mock_open.assert_called_with("blah", "w", encoding='utf-8') - fout.write(as_text) + mock_open.assert_called_with("blah", "w", buffering=-1, encoding='utf-8') + fout.write(self.as_text) - with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open: + def test_write_utf8_absolute_path(self): + patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN + with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout: - mock_open.assert_called_with("/some/file.txt", "w", encoding='utf-8') - fout.write(as_text) + mock_open.assert_called_with("/some/file.txt", "w", buffering=-1, encoding='utf-8') + fout.write(self.as_text) - with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open: + def test_append_utf8(self): + patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN + with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout: - mock_open.assert_called_with("/some/file.txt", "w+", encoding='utf-8') - fout.write(as_text) + mock_open.assert_called_with("/some/file.txt", "w+", buffering=-1, encoding='utf-8') + fout.write(self.as_text) - with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open: + def test_append_binary_absolute_path(self): + with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open: with smart_open.smart_open("/some/file.txt", "wb+") as fout: - mock_open.assert_called_with("/some/file.txt", "wb+") - fout.write(as_bytes) + mock_open.assert_called_with("/some/file.txt", "wb+", buffering=-1) + fout.write(self.as_bytes) @mock.patch('boto3.Session') def test_s3_mode_mock(self, mock_session):