Skip to content

Commit

Permalink
Fix the issue with compressed format and add tests. Fix #109 (#110)
Browse files Browse the repository at this point in the history
* Add tests for compressed files

* Fix issue 109

* write binary

* compare binary

* decode binary

* Update test_smart_open.py
  • Loading branch information
tmylk committed Mar 17, 2017
1 parent 4c65fa9 commit 1683156
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 2 deletions.
4 changes: 2 additions & 2 deletions smart_open/smart_open_lib.py
Expand Up @@ -623,11 +623,11 @@ def compression_wrapper(file_obj, filename, mode):
from bz2file import BZ2File
else:
from bz2 import BZ2File
return make_closing(BZ2File)(file_obj, mode)
return make_closing(BZ2File)(filename, mode)

elif ext == '.gz':
from gzip import GzipFile
return make_closing(GzipFile)(file_obj, mode)
return make_closing(GzipFile)(filename, mode)

else:
return file_obj
Expand Down
38 changes: 38 additions & 0 deletions smart_open/tests/test_smart_open.py
Expand Up @@ -12,6 +12,7 @@
import tempfile
import sys
import os
import hashlib

import boto
import mock
Expand Down Expand Up @@ -854,6 +855,43 @@ def test_s3_iter_bucket_moto(self):

# True when the interpreter's major version is 2.
PY2 = sys.version_info[0] == 2


class CompressionFormatTest(unittest.TestCase):
    """
    Test that compressed (.gz, .bz2) local files can be written and read
    back through ``smart_open.smart_open``.
    """
    # Directory containing this test module; fixture paths are built from it.
    CURR_DIR = os.path.abspath(os.path.dirname(__file__))
    # Payload written to, and expected back from, the temporary files.
    TEXT = 'Hello'

    def write_read_assertion(self, test_file):
        """Write TEXT to `test_file`, read it back and check the round trip.

        `test_file` is removed afterwards, even when an assertion fails, so
        a failing run does not leave temporary files behind.
        """
        try:
            # 'b' for binary, needed on Windows
            with smart_open.smart_open(test_file, 'wb') as fout:
                fout.write(self.TEXT.encode('utf8'))

            with smart_open.smart_open(test_file, 'rb') as fin:
                self.assertEqual(fin.read().decode('utf8'), self.TEXT)
        finally:
            if os.path.isfile(test_file):
                os.unlink(test_file)

    def test_open_gz(self):
        """Can open gzip?"""
        fpath = os.path.join(self.CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
        # Use a context manager so the file handle is closed deterministically.
        with smart_open.smart_open(fpath) as fin:
            data = fin.read()
        m = hashlib.md5(data)
        # assertEqual (unlike a bare assert) is not stripped under `python -O`.
        self.assertEqual(
            m.hexdigest(), '18473e60f8c7c98d29d65bf805736a0d',
            'Failed to read gzip')

    def test_write_read_gz(self):
        """Can write and read gzip?"""
        # Close the placeholder handle right away: only the path is needed,
        # and an open handle would block reopening the file on Windows.
        with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as tmp:
            test_file = tmp.name
        self.write_read_assertion(test_file)

    def test_write_read_bz2(self):
        """Can write and read bz2?"""
        # Close the placeholder handle right away: only the path is needed,
        # and an open handle would block reopening the file on Windows.
        with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as tmp:
            test_file = tmp.name
        self.write_read_assertion(test_file)


class MultistreamsBZ2Test(unittest.TestCase):
"""
Test that multistream bzip2 compressed files can be read.
Expand Down

1 comment on commit 1683156

@robottwo
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching, and for the tests. I'm not sure this reversion is great, because it can lead to filehandle leaks depending on how the caller is invoking smart_open. I'll commit a separate PR in a bit.

Please sign in to comment.