Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-use the open filehandle rather than open a new one. #112

Merged
merged 28 commits into from
Mar 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
96b886a
Better support for custom S3 servers.
Nov 29, 2016
a51bedd
Fix unit tests
Nov 29, 2016
d1c9fea
updated README.rst with new s3 mode.
Nov 29, 2016
d847291
Added a new unit test for the unsecured calling form
Nov 29, 2016
ad4cac4
Updated style and unit test.
robottwo Dec 1, 2016
df38217
Check that the port argument isnt normally passed.
robottwo Dec 15, 2016
87d782c
Merge branch 'master' of github.com:robottwo/smart_open
robottwo Dec 15, 2016
6043a01
Merge branch 'master' of https://github.com/RaRe-Technologies/smart_o…
robottwo Feb 28, 2017
8a42950
Add generic HTTP and HTTPS streaming support.
robottwo Feb 28, 2017
f8dfc73
removed previous merge artifact;
robottwo Feb 28, 2017
ef17bcb
Raise exception instead of returning it :/
robottwo Feb 28, 2017
0097ac2
Raise http exceptions properly
robottwo Feb 28, 2017
d55efd9
neccessary import
robottwo Feb 28, 2017
6702bf7
python 3 compatibility
robottwo Feb 28, 2017
18fed99
Reverted make_closing -> closing
robottwo Mar 13, 2017
87d1bc6
Refactor the code to get the Python version
robottwo Mar 13, 2017
86a1306
Refactored the GZfile and BZ2File compression wrappers.
robottwo Mar 13, 2017
c5c30ef
Refactored HttpOpenRead unit tests.
robottwo Mar 13, 2017
9db299d
Clean up http unit tests.
robottwo Mar 13, 2017
70b17ab
Cosmetic changes and doc updates.
robottwo Mar 14, 2017
5621b12
Merge commit '4e2ba9f' into HEAD
robottwo Mar 17, 2017
d6df948
Re-use the open filehandle rather than open a new one.
robottwo Mar 17, 2017
be6141e
merge artifact
robottwo Mar 17, 2017
187d0d5
Merge remote-tracking branch 'rara/master'
robottwo Mar 27, 2017
3c0a2db
Add unit tests for compressed httpd reads.
robottwo Mar 27, 2017
97f9dbd
fixed import for python3
robottwo Mar 27, 2017
e374169
removed stray import
robottwo Mar 27, 2017
c5027a9
Handle some python3 byte vs unicode incompatibilityes.
robottwo Mar 27, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
33 changes: 23 additions & 10 deletions smart_open/smart_open_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
IS_PY2 = (sys.version_info[0] == 2)

if IS_PY2:
import cStringIO as StringIO
import httplib
if sys.version_info[1] == 6:
import copy_reg
Expand All @@ -46,6 +47,7 @@ def _reduce_partial(p):

copy_reg.pickle(partial, _reduce_partial)
elif sys.version_info[0] == 3:
import io as StringIO
import http.client as httplib

from boto.compat import BytesIO, urlsplit, six
Expand Down Expand Up @@ -635,11 +637,11 @@ def compression_wrapper(file_obj, filename, mode):
from bz2file import BZ2File
else:
from bz2 import BZ2File
return make_closing(BZ2File)(filename, mode)
return make_closing(BZ2File)(file_obj, mode)

elif ext == '.gz':
from gzip import GzipFile
return make_closing(GzipFile)(filename, mode)
return make_closing(GzipFile)(fileobj=file_obj, mode=mode)

else:
return file_obj
Expand Down Expand Up @@ -670,11 +672,6 @@ def __init__(self, url, mode='r', kerberos=False, user=None, password=None):

If none of those are set, will connect unauthenticated.
"""
if IS_PY2:
from urllib2 import urlopen
else:
from urllib.request import urlopen

if kerberos:
import requests_kerberos
auth = requests_kerberos.HTTPKerberosAuth()
Expand All @@ -688,13 +685,18 @@ def __init__(self, url, mode='r', kerberos=False, user=None, password=None):
if not self.response.ok:
self.response.raise_for_status()

self.mode = mode
self._read_buffer = None
self._read_iter = None
self._readline_iter = None

def __iter__(self):
return self.response.iter_lines()

def binary_content(self):
"""Return the content of the request as bytes."""
return self.response.content

def readline(self):
"""
Mimics the readline call to a filehandle object.
Expand All @@ -705,14 +707,18 @@ def readline(self):
try:
return next(self._readline_iter)
except StopIteration:
raise EOFError()
# When readline runs out of data, it just returns an empty string
return ''

def readlines(self):
"""
Mimics the readlines call to a filehandle object.
"""
return list(self.response.iter_lines())

def seek(self):
raise NotImplementedError('seek() is not implemented')

def read(self, size=None):
"""
Mimics the read call to a filehandle object.
Expand All @@ -732,7 +738,8 @@ def read(self, size=None):
retval = self._read_buffer
self._read_buffer = ''
if len(retval) == 0:
raise EOFError()
# When read runs out of data, it just returns empty
return ''
else:
return retval

Expand Down Expand Up @@ -760,7 +767,13 @@ def HttpOpenRead(parsed_uri, mode='r', **kwargs):
response = HttpReadStream(url, **kwargs)

fname = url.split('/')[-1]
return compression_wrapper(response, fname, mode)

if fname.endswith('.gz'):
# Gzip needs a seek-able filehandle, so we need to buffer it.
buffer = make_closing(io.BytesIO)(response.binary_content())
return compression_wrapper(buffer, fname, mode)
else:
return compression_wrapper(response, fname, mode)


class S3OpenWrite(object):
Expand Down
100 changes: 73 additions & 27 deletions smart_open/tests/test_smart_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,76 @@ def test_webhdfs_uri(self):
self.assertEqual(parsed_uri.scheme, "webhdfs")
self.assertEqual(parsed_uri.uri_path, "host:port/webhdfs/v1/path/file?query_part_1&query_part2")


class SmartOpenHttpTest(unittest.TestCase):
"""
Test reading from HTTP connections in various ways.

"""
@responses.activate
def test_http_read(self):
"""Does http read method work correctly"""
responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2')
smart_open_object = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"))
self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2")

@responses.activate
def test_https_readline(self):
"""Does https readline method work correctly"""
responses.add(responses.GET, "https://127.0.0.1/index.html", body='line1\nline2')
smart_open_object = smart_open.HttpOpenRead(smart_open.ParseUri("https://127.0.0.1/index.html"))
self.assertEqual(smart_open_object.readline().decode("utf-8"), "line1")

@responses.activate
def test_http_pass(self):
"""Does http authentication work correctly"""
responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2')
_ = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"), user='me', password='pass')
self.assertEquals(len(responses.calls), 1)
actual_request = responses.calls[0].request
self.assert_('Authorization' in actual_request.headers)
self.assert_(actual_request.headers['Authorization'].startswith('Basic '))

@responses.activate
def test_http_gz(self):
"""Can open gzip via http?"""
fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
data = open(fpath, 'rb').read()

responses.add(responses.GET, "http://127.0.0.1/data.gz",
body=data)
smart_open_object = smart_open.HttpOpenRead(
smart_open.ParseUri("http://127.0.0.1/data.gz"))

m = hashlib.md5(smart_open_object.read())
# decompress the gzip and get the same md5 hash
self.assertEqual(m.hexdigest(),'18473e60f8c7c98d29d65bf805736a0d')

@responses.activate
def test_http_bz2(self):
"""Can open bz2 via http?"""
test_string = b'Hello World Compressed.'
test_file = tempfile.NamedTemporaryFile('wb', suffix='.bz2',
delete=False).name

with smart_open.smart_open(test_file, 'wb') as outfile:
outfile.write(test_string)

with open(test_file, 'rb') as infile:
compressed_data = infile.read()

if os.path.isfile(test_file):
os.unlink(test_file)

responses.add(responses.GET, "http://127.0.0.1/data.bz2",
body=compressed_data)
smart_open_object = smart_open.HttpOpenRead(
smart_open.ParseUri("http://127.0.0.1/data.bz2"))

# decompress the gzip and get the same md5 hash
self.assertEqual(smart_open_object.read(), test_string)


class SmartOpenReadTest(unittest.TestCase):
"""
Test reading from files under various schemes.
Expand Down Expand Up @@ -169,30 +239,6 @@ def test_webhdfs_read(self):
smart_open_object = smart_open.WebHdfsOpenRead(smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))
self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2")

@responses.activate
def test_http_read(self):
"""Does http read method work correctly"""
responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2')
smart_open_object = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"))
self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2")

@responses.activate
def test_https_readline(self):
"""Does https readline method work correctly"""
responses.add(responses.GET, "https://127.0.0.1/index.html", body='line1\nline2')
smart_open_object = smart_open.HttpOpenRead(smart_open.ParseUri("https://127.0.0.1/index.html"))
self.assertEqual(smart_open_object.readline().decode("utf-8"), "line1")

@responses.activate
def test_http_pass(self):
"""Does http authentication work correctly"""
responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2')
_ = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"), user='me', password='pass')
self.assertEquals(len(responses.calls), 1)
actual_request = responses.calls[0].request
self.assert_('Authorization' in actual_request.headers)
self.assert_(actual_request.headers['Authorization'].startswith('Basic '))

@mock.patch('smart_open.smart_open_lib.boto')
@mock.patch('smart_open.smart_open_lib.S3OpenRead')
def test_s3_boto(self, mock_s3_open_read, mock_boto):
Expand Down Expand Up @@ -900,13 +946,13 @@ def test_s3_iter_bucket_with_SSLError_moto(self):


PY2 = sys.version_info[0] == 2

CURR_DIR = os.path.abspath(os.path.dirname(__file__))

class CompressionFormatTest(unittest.TestCase):
"""
Test that compression
"""
CURR_DIR = os.path.abspath(os.path.dirname(__file__))

TEXT = 'Hello'

def write_read_assertion(self, test_file):
Expand All @@ -921,7 +967,7 @@ def write_read_assertion(self, test_file):

def test_open_gz(self):
"""Can open gzip?"""
fpath = os.path.join(self.CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
data = smart_open.smart_open(fpath).read()
m = hashlib.md5(data)
assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \
Expand Down