Skip to content

Commit

Permalink
Fix performance regression using local file-system. Fix #184 (#190)
Browse files Browse the repository at this point in the history
* Fix #184

* update unit tests

* fix branch for http integration tests
  • Loading branch information
mpenkov authored and menshikh-iv committed Apr 21, 2018
1 parent b499f87 commit abb08ae
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 26 deletions.
24 changes: 24 additions & 0 deletions integration-tests/test_184.py
@@ -0,0 +1,24 @@
import sys
import time

import smart_open

open_fn = smart_open.smart_open
# open_fn = open

def report_time_iterate_rows(file_name, report_every=100000):
start = time.time()
last = start
with open_fn(file_name, 'r') as f:
for i, line in enumerate(f, start=1):
if not (i % report_every):
current = time.time()
time_taken = current - last
print('Time taken for %d rows: %.2f seconds, %.2f rows/s' % (
report_every, time_taken, report_every / time_taken))
last = current
total = time.time() - start
print('Total: %d rows, %.2f seconds, %.2f rows/s' % (
i, total, i / total))

report_time_iterate_rows(sys.argv[1])
2 changes: 1 addition & 1 deletion integration-tests/test_http.py
Expand Up @@ -15,7 +15,7 @@

GZIP_MAGIC = b'\x1f\x8b'
BASE_URL = ('https://raw.githubusercontent.com/RaRe-Technologies/smart_open/'
'refactor-rebased/smart_open/tests/test_data/')
'master/smart_open/tests/test_data/')



Expand Down
57 changes: 52 additions & 5 deletions smart_open/smart_open_lib.py
Expand Up @@ -141,6 +141,13 @@ def smart_open(uri, mode="rb", **kw):
"""
logger.debug('%r', locals())

if not isinstance(mode, six.string_types):
raise TypeError('mode should be a string')

fobj = _shortcut_open(uri, mode, **kw)
if fobj is not None:
return fobj

#
# This is a work-around for the problem described in Issue #144.
# If the user has explicitly specified an encoding, then assume they want
Expand All @@ -152,10 +159,6 @@ def smart_open(uri, mode="rb", **kw):
if kw.get('encoding') is not None and 'b' in mode:
mode = mode.replace('b', '')

# validate mode parameter
if not isinstance(mode, six.string_types):
raise TypeError('mode should be a string')

# Support opening ``pathlib.Path`` objects by casting them to strings.
if PATHLIB_SUPPORT and isinstance(uri, pathlib.Path):
uri = str(uri)
Expand Down Expand Up @@ -206,6 +209,50 @@ def smart_open(uri, mode="rb", **kw):
return decoded


def _shortcut_open(uri, mode, **kw):
"""Try to open the URI using the standard library io.open function.
This can be much faster than the alternative of opening in binary mode and
then decoding.
This is only possible under the following conditions:
1. Opening a local file
2. Ignore extension is set to True
If it is not possible to use the built-in open for the specified URI, returns None.
:param str uri: A string indicating what to open.
:param str mode: The mode to pass to the open function.
:param dict kw:
:returns: The opened file
:rtype: file
"""
if not isinstance(uri, six.string_types):
return None

parsed_uri = ParseUri(uri)
if parsed_uri.scheme != 'file':
return None

_, extension = P.splitext(parsed_uri.uri_path)
ignore_extension = kw.get('ignore_extension', False)
if extension in ('.gz', '.bz2') and not ignore_extension:
return None

open_kwargs = {}
errors = kw.get('errors')
if errors is not None:
open_kwargs['errors'] = errors

encoding = kw.get('encoding')
if encoding is not None:
open_kwargs['encoding'] = encoding
mode = mode.replace('b', '')

return io.open(parsed_uri.uri_path, mode, **open_kwargs)


def _open_binary_stream(uri, mode, **kw):
"""Open an arbitrary URI in the specified binary mode.
Expand Down Expand Up @@ -234,7 +281,7 @@ def _open_binary_stream(uri, mode, **kw):
if parsed_uri.scheme in ("file", ):
# local files -- both read & write supported
# compression, if any, is determined by the filename extension (.gz, .bz2)
fobj = open(parsed_uri.uri_path, mode)
fobj = io.open(parsed_uri.uri_path, mode)
return fobj, filename
elif parsed_uri.scheme in ("s3", "s3n", 's3u'):
return _s3_open_uri(parsed_uri, mode, **kw), filename
Expand Down
39 changes: 19 additions & 20 deletions smart_open/tests/test_smart_open.py
Expand Up @@ -201,6 +201,12 @@ class SmartOpenReadTest(unittest.TestCase):
"""

def test_shortcut(self):
fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
with mock.patch('io.open') as mock_open:
smart_open.smart_open(fpath, 'r').read()
mock_open.assert_called_with(fpath, 'r')

def test_open_with_keywords(self):
"""This test captures Issue #142."""
fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')
Expand Down Expand Up @@ -305,7 +311,7 @@ def test_s3_iter_lines(self):
self.assertEqual(b''.join(output), test_string)

# TODO: add more complex test for file://
@mock.patch('smart_open.smart_open_lib.open')
@mock.patch('io.open')
def test_file(self, mock_smart_open):
"""Is file:// line iterator called correctly?"""
prefix = "file://"
Expand Down Expand Up @@ -336,7 +342,7 @@ def test_file(self, mock_smart_open):
smart_open_object = smart_open.smart_open(prefix+short_path, read_mode, errors='strict')
smart_open_object.__iter__()
# called with the correct expanded path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, errors='strict')

# couldn't find any project for mocking up HDFS data
# TODO: we want to test also a content of the files, not just fnc call params
Expand Down Expand Up @@ -505,22 +511,19 @@ def test_file_mode_mock(self, mock_boto):
# _initially_ got opened, we now also check the end result: if the
# contents got decoded correctly.
#
with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
with smart_open.smart_open("blah", "r", encoding='utf-8') as fin:
self.assertEqual(fin.read(), as_text)
mock_open.assert_called_with("blah", "rb")
mock_open.assert_called_with("blah", "r", encoding='utf-8')

with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with smart_open.smart_open("blah", "rb") as fin:
self.assertEqual(fin.read(), as_bytes)
mock_open.assert_called_with("blah", "rb")

short_path = "~/blah"
full_path = os.path.expanduser(short_path)
with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with smart_open.smart_open(short_path, "rb") as fin:
mock_open.assert_called_with(full_path, "rb")

Expand All @@ -530,26 +533,22 @@ def test_file_mode_mock(self, mock_boto):
self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+")

# correct write mode, correct file:// URI
with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
with smart_open.smart_open("blah", "w", encoding='utf-8') as fout:
mock_open.assert_called_with("blah", "wb")
mock_open.assert_called_with("blah", "w", encoding='utf-8')
fout.write(as_text)

with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout:
mock_open.assert_called_with("/some/file.txt", "wb")
mock_open.assert_called_with("/some/file.txt", "w", encoding='utf-8')
fout.write(as_text)

with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout:
mock_open.assert_called_with("/some/file.txt", "wb+")
mock_open.assert_called_with("/some/file.txt", "w+", encoding='utf-8')
fout.write(as_text)

with mock.patch('smart_open.smart_open_lib.open',
mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with smart_open.smart_open("/some/file.txt", "wb+") as fout:
mock_open.assert_called_with("/some/file.txt", "wb+")
fout.write(as_bytes)
Expand Down

0 comments on commit abb08ae

Please sign in to comment.