Skip to content

Commit

Permalink
Use built-in open whenever possible (#208)
Browse files Browse the repository at this point in the history
* added integration test

* Resolving issue #207: use built-in open whenever possible

* rename integration test using correct issue number

* update unit tests

* isolate failing test case

* Updated unit tests

Changed the mock target to keep in step with the shortcut_open function.
Split tests into separate test cases.

* respond to review comments

- use tempfile instead of hard-coded file
- integrate new test into travis.yml so that Travis runs it

* added support for buffering parameter
  • Loading branch information
mpenkov authored and menshikh-iv committed Aug 13, 2018
1 parent 20c6c93 commit 1c43f19
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 38 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Expand Up @@ -54,6 +54,8 @@ script:
aws s3 cp .benchmarks/*/*.json ${SO_S3_RESULT_URL};
aws s3 rm --recursive $SO_S3_URL;
fi
- pip install numpy
- py.test integration-tests/test_207.py


cache:
Expand Down
35 changes: 35 additions & 0 deletions integration-tests/test_207.py
@@ -0,0 +1,35 @@
import os
import sys
import tempfile

try:
import numpy as np
except ImportError:
print("You really need numpy to proceed with this test")
sys.exit(1)

import smart_open


def tofile():
dt = np.dtype([('time', [('min', int), ('sec', int)]), ('temp', float)])
x = np.zeros((1,), dtype=dt)

with tempfile.NamedTemporaryFile(prefix='test_207', suffix='.dat', delete=False) as fout:
x.tofile(fout.name)
return fout.name


def test():
try:
path = tofile()
with smart_open.smart_open(path, 'rb') as fin:
loaded = np.fromfile(fin)
return 0
finally:
os.unlink(path)
return 1


if __name__ == '__main__':
sys.exit(test())
19 changes: 18 additions & 1 deletion smart_open/smart_open_lib.py
Expand Up @@ -274,6 +274,13 @@ def _shortcut_open(uri, mode, **kw):
if extension in ('.gz', '.bz2') and not ignore_extension:
return None

#
# https://docs.python.org/2/library/functions.html#open
#
# buffering: 0: off; 1: on; negative number: use system default
#
buffering = kw.get('buffering', -1)

open_kwargs = {}
errors = kw.get('errors')
if errors is not None:
Expand All @@ -284,7 +291,17 @@ def _shortcut_open(uri, mode, **kw):
open_kwargs['encoding'] = encoding
mode = mode.replace('b', '')

return io.open(parsed_uri.uri_path, mode, **open_kwargs)
#
# Under Py3, the built-in open accepts kwargs, and it's OK to use that.
# Under Py2, the built-in open _doesn't_ accept kwargs, but we still use it
# whenever possible (see issue #207). If we're under Py2 and have to use
# kwargs, then we have no option other to use io.open.
#
if six.PY3:
return open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs)
elif not open_kwargs:
return open(parsed_uri.uri_path, mode, buffering=buffering)
return io.open(parsed_uri.uri_path, mode, buffering=buffering, **open_kwargs)


def _open_binary_stream(uri, mode, **kw):
Expand Down
119 changes: 82 additions & 37 deletions smart_open/tests/test_smart_open.py
Expand Up @@ -13,6 +13,7 @@
import os
import sys
import hashlib
import unittest

import boto3
import mock
Expand Down Expand Up @@ -195,6 +196,14 @@ def test_http_bz2(self):
self.assertEqual(smart_open_object.read(), test_string)


#
# What exactly to patch here differs on _how_ we're opening the file.
# See the _shortcut_open function for details.
#
_IO_OPEN = 'io.open'
_BUILTIN_OPEN = 'smart_open.smart_open_lib.open'


class SmartOpenReadTest(unittest.TestCase):
"""
Test reading from files under various schemes.
Expand All @@ -203,9 +212,9 @@ class SmartOpenReadTest(unittest.TestCase):

def test_shortcut(self):
fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
with mock.patch('io.open') as mock_open:
with mock.patch('smart_open.smart_open_lib.open') as mock_open:
smart_open.smart_open(fpath, 'r').read()
mock_open.assert_called_with(fpath, 'r')
mock_open.assert_called_with(fpath, 'r', buffering=-1)

def test_open_with_keywords(self):
"""This test captures Issue #142."""
Expand Down Expand Up @@ -311,7 +320,7 @@ def test_s3_iter_lines(self):
self.assertEqual(b''.join(output), test_string)

# TODO: add more complex test for file://
@mock.patch('io.open')
@mock.patch('smart_open.smart_open_lib.open')
def test_file(self, mock_smart_open):
"""Is file:// line iterator called correctly?"""
prefix = "file://"
Expand All @@ -320,29 +329,52 @@ def test_file(self, mock_smart_open):
smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

full_path = '/tmp/test#hash##more.txt'
read_mode = "rb"
smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

full_path = 'aa#aa'
read_mode = "rb"
smart_open_object = smart_open.smart_open(full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

short_path = "~/tmp/test.txt"
full_path = os.path.expanduser(short_path)

@mock.patch(_IO_OPEN if six.PY2 else _BUILTIN_OPEN)
def test_file_errors(self, mock_smart_open):
prefix = "file://"
full_path = '/tmp/test.txt'
read_mode = "r"
short_path = "~/tmp/test.txt"
full_path = os.path.expanduser(short_path)

smart_open_object = smart_open.smart_open(prefix+short_path, read_mode, errors='strict')
smart_open_object.__iter__()
# called with the correct expanded path?
mock_smart_open.assert_called_with(full_path, read_mode, errors='strict')
mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1, errors='strict')

@mock.patch(_BUILTIN_OPEN)
def test_file_buffering(self, mock_smart_open):
smart_open_object = smart_open.smart_open('/tmp/somefile', 'rb', buffering=0)
smart_open_object.__iter__()
# called with the correct expanded path?
mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0)

@unittest.skip('smart_open does not currently accept additional positional args')
@mock.patch(_BUILTIN_OPEN)
def test_file_buffering2(self, mock_smart_open):
smart_open_object = smart_open.smart_open('/tmp/somefile', 'rb', 0)
smart_open_object.__iter__()
# called with the correct expanded path?
mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0)

# couldn't find any project for mocking up HDFS data
# TODO: we want to test also a content of the files, not just fnc call params
Expand Down Expand Up @@ -534,65 +566,78 @@ class SmartOpenTest(unittest.TestCase):
Test reading and writing from/into files.
"""
@mock.patch('smart_open.smart_open_lib.boto')
def test_file_mode_mock(self, mock_boto):
"""Are file:// open modes passed correctly?"""
as_text = u'куда идём мы с пятачком - большой большой секрет'
as_bytes = as_text.encode('utf-8')

# incorrect file mode
self.assertRaises(
NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x"
)
def setUp(self):
self.as_text = u'куда идём мы с пятачком - большой большой секрет'
self.as_bytes = self.as_text.encode('utf-8')
self.stringio = io.StringIO(self.as_text)
self.bytesio = io.BytesIO(self.as_bytes)

def test_file_mode_mock(self):
"""Are file:// open modes passed correctly?"""
# correct read modes
#
# We always open files in binary mode first, but engage
# encoders/decoders as necessary. Instead of checking how the file
# _initially_ got opened, we now also check the end result: if the
# contents got decoded correctly.
#
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:

def test_text(self):
patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN
with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open:
with smart_open.smart_open("blah", "r", encoding='utf-8') as fin:
self.assertEqual(fin.read(), as_text)
mock_open.assert_called_with("blah", "r", encoding='utf-8')
self.assertEqual(fin.read(), self.as_text)
mock_open.assert_called_with("blah", "r", buffering=-1, encoding='utf-8')

with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
def test_binary(self):
with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open:
with smart_open.smart_open("blah", "rb") as fin:
self.assertEqual(fin.read(), as_bytes)
mock_open.assert_called_with("blah", "rb")
self.assertEqual(fin.read(), self.as_bytes)
mock_open.assert_called_with("blah", "rb", buffering=-1)

def test_expanded_path(self):
short_path = "~/blah"
full_path = os.path.expanduser(short_path)
with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open:
with smart_open.smart_open(short_path, "rb") as fin:
mock_open.assert_called_with(full_path, "rb")
mock_open.assert_called_with(full_path, "rb", buffering=-1)

def test_incorrect(self):
# incorrect file mode
self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x")

# correct write modes, incorrect scheme
self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+")
self.assertRaises(NotImplementedError, smart_open.smart_open, "http:///blah.txt", "w")
self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+")

def test_write_utf8(self):
# correct write mode, correct file:// URI
with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN
with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open:
with smart_open.smart_open("blah", "w", encoding='utf-8') as fout:
mock_open.assert_called_with("blah", "w", encoding='utf-8')
fout.write(as_text)
mock_open.assert_called_with("blah", "w", buffering=-1, encoding='utf-8')
fout.write(self.as_text)

with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
def test_write_utf8_absolute_path(self):
patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN
with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open:
with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout:
mock_open.assert_called_with("/some/file.txt", "w", encoding='utf-8')
fout.write(as_text)
mock_open.assert_called_with("/some/file.txt", "w", buffering=-1, encoding='utf-8')
fout.write(self.as_text)

with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
def test_append_utf8(self):
patch = _IO_OPEN if six.PY2 else _BUILTIN_OPEN
with mock.patch(patch, mock.Mock(return_value=self.stringio)) as mock_open:
with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout:
mock_open.assert_called_with("/some/file.txt", "w+", encoding='utf-8')
fout.write(as_text)
mock_open.assert_called_with("/some/file.txt", "w+", buffering=-1, encoding='utf-8')
fout.write(self.as_text)

with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
def test_append_binary_absolute_path(self):
with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open:
with smart_open.smart_open("/some/file.txt", "wb+") as fout:
mock_open.assert_called_with("/some/file.txt", "wb+")
fout.write(as_bytes)
mock_open.assert_called_with("/some/file.txt", "wb+", buffering=-1)
fout.write(self.as_bytes)

@mock.patch('boto3.Session')
def test_s3_mode_mock(self, mock_session):
Expand Down

0 comments on commit 1c43f19

Please sign in to comment.