Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This resolves issues #142 and #143

Merged
merged 3 commits into from Nov 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
68 changes: 65 additions & 3 deletions smart_open/smart_open_lib.py
Expand Up @@ -21,6 +21,7 @@

"""

import codecs
import logging
import os
import subprocess
Expand All @@ -32,6 +33,7 @@
import boto.s3.connection
import boto.s3.key
from ssl import SSLError
import sys


IS_PY2 = (sys.version_info[0] == 2)
Expand Down Expand Up @@ -72,6 +74,8 @@

WEBHDFS_MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads

SYSTEM_ENCODING = sys.getdefaultencoding()


def smart_open(uri, mode="rb", **kw):
"""
Expand Down Expand Up @@ -149,7 +153,7 @@ def smart_open(uri, mode="rb", **kw):
if parsed_uri.scheme in ("file", ):
# local files -- both read & write supported
# compression, if any, is determined by the filename extension (.gz, .bz2)
return file_smart_open(parsed_uri.uri_path, mode)
return file_smart_open(parsed_uri.uri_path, mode, encoding=kw.pop('encoding', None))
elif parsed_uri.scheme in ("s3", "s3n", 's3u'):
return s3_open_uri(parsed_uri, mode, **kw)
elif parsed_uri.scheme in ("hdfs", ):
Expand Down Expand Up @@ -544,6 +548,7 @@ def close(self):
if not fileobj.closed:
fileobj.close()


def compression_wrapper(file_obj, filename, mode):
"""
This function will wrap the file_obj with an appropriate
Expand All @@ -564,13 +569,70 @@ def compression_wrapper(file_obj, filename, mode):
return file_obj


def file_smart_open(fname, mode='rb'):
def encoding_wrapper(fileobj, mode, encoding=None):
    """Decode bytes into text (or encode text into bytes), if necessary.

    If mode specifies binary access, does nothing, unless the encoding is
    specified.  A non-null encoding implies text mode.

    :arg fileobj: must quack like a filehandle object opened for bytes.
    :arg str mode: is the mode which was originally requested by the user.
    :arg encoding: The text encoding to use.  If mode is binary, overrides mode.
    :returns: ``fileobj`` itself for plain binary access, otherwise a
        ``codecs`` stream wrapper around it.
    :raises LookupError: if ``encoding`` is not a codec known to ``codecs``.
    """
    logger.debug('encoding_wrapper: %r', locals())

    #
    # If the mode is binary, but the user specified an encoding, assume they
    # want text.  Without this assumption we would ignore the encoding and
    # return bytes, and smart_open behavior would diverge from the built-in
    # open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #   without our assumption, because the default mode is rb.
    #
    # Test for the 'b' flag rather than an exact list of mode strings, so that
    # update modes such as 'rb+' / 'wb+' are also recognised as binary.
    #
    if 'b' in mode and encoding is None:
        return fileobj

    if encoding is None:
        encoding = SYSTEM_ENCODING

    if '+' in mode:
        # Update modes need both directions: a reader-only or writer-only
        # wrapper would break one half of the stream.
        return codecs.StreamReaderWriter(
            fileobj, codecs.getreader(encoding), codecs.getwriter(encoding)
        )

    if mode.startswith('r'):
        wrapper = codecs.getreader(encoding)
    else:
        wrapper = codecs.getwriter(encoding)
    return wrapper(fileobj)


def file_smart_open(fname, mode='rb', encoding=None):
    """
    Stream from/to local filesystem, transparently (de)compressing gzip and bz2
    files if necessary.

    :arg str fname: The path to the file to open.
    :arg str mode: The mode in which to open the file.
    :arg str encoding: The text encoding to use.
    :returns: A file object

    Any exception raised while opening or wrapping the file propagates to the
    caller; the underlying file is closed first if it was already open.
    """
    #
    # This is how we get from the filename to the end result.
    # Decompression is optional, but it always accepts bytes and returns bytes.
    # Decoding is also optional, accepts bytes and returns text.
    # The diagram below is for reading, for writing, the flow is from right to
    # left, but the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          raw_fobj        decompressed_fobj   decoded_fobj
    #
    # The raw file must always be opened in binary mode, whatever text-ish
    # mode the caller asked for ('r', 'w+', 'rt', ...): both wrappers below
    # operate on bytes.
    #
    raw_mode = mode.replace('t', '')
    if 'b' not in raw_mode:
        raw_mode += 'b'

    raw_fobj = open(fname, raw_mode)
    try:
        decompressed_fobj = compression_wrapper(raw_fobj, fname, raw_mode)
        decoded_fobj = encoding_wrapper(decompressed_fobj, mode, encoding=encoding)
    except Exception:
        # Do not leak the OS-level handle if a wrapper rejects its arguments
        # (e.g. an unknown encoding).
        raw_fobj.close()
        raise
    return decoded_fobj


class HttpReadStream(object):
Expand Down
2 changes: 2 additions & 0 deletions smart_open/tests/test_data/cp852.tsv.txt
@@ -0,0 +1,2 @@
t�mto bude�
budem byli
40 changes: 29 additions & 11 deletions smart_open/tests/test_smart_open.py
Expand Up @@ -6,6 +6,7 @@
# This code is distributed under the terms and conditions
# from the MIT License (MIT).

import io
import unittest
import logging
import tempfile
Expand Down Expand Up @@ -174,6 +175,23 @@ class SmartOpenReadTest(unittest.TestCase):

"""

def test_open_with_keywords(self):
    """Regression test for Issue #142: pass ``encoding`` without a mode."""
    data_path = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')

    # Reference value: read the raw bytes and decode them by hand.
    with open(data_path, 'rb') as raw:
        raw_bytes = raw.read()
    expected = raw_bytes.decode('cp852')

    with smart_open.smart_open(data_path, encoding='cp852') as reader:
        actual = reader.read()

    self.assertEqual(expected, actual)

def test_open_with_keywords_explicit_r(self):
    """Same check as the keyword-only case, but with an explicit mode='r'."""
    data_path = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')

    # Reference value: read the raw bytes and decode them by hand.
    with open(data_path, 'rb') as raw:
        raw_bytes = raw.read()
    expected = raw_bytes.decode('cp852')

    with smart_open.smart_open(data_path, mode='r', encoding='cp852') as reader:
        actual = reader.read()

    self.assertEqual(expected, actual)

@mock_s3
def test_read_never_returns_none(self):
"""read should never return None."""
Expand Down Expand Up @@ -275,29 +293,29 @@ def test_file(self, mock_smart_open):
smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, encoding=None)

full_path = '/tmp/test#hash##more.txt'
read_mode = "rb"
smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, encoding=None)

full_path = 'aa#aa'
read_mode = "rb"
smart_open_object = smart_open.smart_open(full_path, read_mode)
smart_open_object.__iter__()
# called with the correct path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, encoding=None)

short_path = "~/tmp/test.txt"
full_path = os.path.expanduser(short_path)

smart_open_object = smart_open.smart_open(prefix+short_path, read_mode)
smart_open_object.__iter__()
# called with the correct expanded path?
mock_smart_open.assert_called_with(full_path, read_mode)
mock_smart_open.assert_called_with(full_path, read_mode, encoding=None)

# couldn't find any project for mocking up HDFS data
# TODO: we also want to test the contents of the files, not just the function call parameters
Expand Down Expand Up @@ -444,15 +462,15 @@ def test_file_mode_mock(self, mock_file, mock_boto):

# correct read modes
smart_open.smart_open("blah", "r")
mock_file.assert_called_with("blah", "r")
mock_file.assert_called_with("blah", "r", encoding=None)

smart_open.smart_open("blah", "rb")
mock_file.assert_called_with("blah", "rb")
mock_file.assert_called_with("blah", "rb", encoding=None)

short_path = "~/blah"
full_path = os.path.expanduser(short_path)
smart_open.smart_open(short_path, "rb")
mock_file.assert_called_with(full_path, "rb")
mock_file.assert_called_with(full_path, "rb", encoding=None)

# correct write modes, incorrect scheme
self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+")
Expand All @@ -461,16 +479,16 @@ def test_file_mode_mock(self, mock_file, mock_boto):

# correct write mode, correct file:// URI
smart_open.smart_open("blah", "w")
mock_file.assert_called_with("blah", "w")
mock_file.assert_called_with("blah", "w", encoding=None)

smart_open.smart_open("file:///some/file.txt", "wb")
mock_file.assert_called_with("/some/file.txt", "wb")
mock_file.assert_called_with("/some/file.txt", "wb", encoding=None)

smart_open.smart_open("file:///some/file.txt", "wb+")
mock_file.assert_called_with("/some/file.txt", "wb+")
mock_file.assert_called_with("/some/file.txt", "wb+", encoding=None)

smart_open.smart_open("file:///some/file.txt", "w+")
mock_file.assert_called_with("/some/file.txt", "w+")
mock_file.assert_called_with("/some/file.txt", "w+", encoding=None)

@mock.patch('boto3.Session')
def test_s3_mode_mock(self, mock_session):
Expand Down