Skip to content

Commit

Permalink
Merge pull request #1024 from effigies/fix/gzip_compression_options
Browse files Browse the repository at this point in the history
ENH: Create gzip header deterministically by default
  • Loading branch information
effigies committed Jun 25, 2021
2 parents 44a1052 + ff5efe4 commit ea68c4e
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 7 deletions.
28 changes: 21 additions & 7 deletions nibabel/openers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,24 @@
HAVE_INDEXED_GZIP = False


def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
class DeterministicGzipFile(gzip.GzipFile):
    """ Deterministic variant of GzipFile

    This writer does not add filename information to the header, and defaults
    to a modification time (``mtime``) of 0 seconds.

    Parameters
    ----------
    filename : str or None
        Path of the file to open when `fileobj` is None. It is only used to
        open the underlying file; an empty filename is always passed on to
        ``GzipFile``.
    mode : str or None
        File mode. ``'b'`` is appended when missing. When None, the underlying
        file (if it has to be opened here) is opened with ``'rb'``.
    compresslevel : int
        Passed through to ``GzipFile``.
    fileobj : file-like or None
        Already-open file object to wrap. When None, `filename` is opened.
    mtime : int or float
        Passed through to ``GzipFile``; defaults to 0.

    Raises
    ------
    TypeError
        If neither `filename` nor `fileobj` is given.
    """

    def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=0):
        # These two guards are adapted from
        # https://github.com/python/cpython/blob/6ab65c6/Lib/gzip.py#L171-L174
        # They are repeated here because GzipFile is handed an empty filename
        # below and so cannot open the file on its own.
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            if filename is None:
                # Fail with a clear message rather than letting open(None)
                # raise an unrelated-looking error.
                raise TypeError('Must define either fileobj or filename')
            # Assigning myfileobj makes GzipFile.close() close the file too.
            fileobj = self.myfileobj = open(filename, mode or 'rb')
        super().__init__(filename="", mode=mode, compresslevel=compresslevel,
                         fileobj=fileobj, mtime=mtime)


def _gzip_open(filename, mode='rb', compresslevel=9, mtime=0, keep_open=False):

# use indexed_gzip if possible for faster read access. If keep_open ==
# True, we tell IndexedGzipFile to keep the file handle open. Otherwise
Expand All @@ -52,7 +69,7 @@ def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):

# Fall-back to built-in GzipFile
else:
gzip_file = gzip.GzipFile(filename, mode, compresslevel)
gzip_file = DeterministicGzipFile(filename, mode, compresslevel, mtime=mtime)

return gzip_file

Expand Down Expand Up @@ -83,7 +100,7 @@ class Opener(object):
passed to opening method when `fileish` is str. Change of defaults as
for \*args
"""
gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open'))
gz_def = (_gzip_open, ('mode', 'compresslevel', 'mtime', 'keep_open'))
bz2_def = (BZ2File, ('mode', 'buffering', 'compresslevel'))
zstd_def = (_zstd_open, ('mode', 'level_or_option', 'zstd_dict'))
compress_ext_map = {
Expand Down Expand Up @@ -163,10 +180,7 @@ def name(self):
self._name will be None if object was created with a fileobj, otherwise
it will be the filename.
"""
try:
return self.fobj.name
except AttributeError:
return self._name
return self._name

@property
def mode(self):
Expand Down
126 changes: 126 additions & 0 deletions nibabel/tests/test_openers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
from gzip import GzipFile
from io import BytesIO, UnsupportedOperation
from distutils.version import StrictVersion
import hashlib
import time

from numpy.compat.py3k import asstr, asbytes
from ..openers import (Opener,
ImageOpener,
HAVE_INDEXED_GZIP,
BZ2File,
DeterministicGzipFile,
)
from ..tmpdirs import InTemporaryDirectory
from ..volumeutils import BinOpener
Expand Down Expand Up @@ -367,3 +370,126 @@ def test_iter():
lobj = Opener(Lunk(''))
with pytest.raises(TypeError):
list(lobj)


def md5sum(fname):
    """Return the hexadecimal MD5 digest of the contents of file `fname`."""
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()


def test_DeterministicGzipFile():
    """Compare DeterministicGzipFile output with plain GzipFile references."""
    payload = b"Hello, I'd like to have an argument."

    def reference_checksum(**gz_kwargs):
        # Write an anonymous (empty filename) archive with the stock GzipFile
        # and return the checksum of the resulting file.
        with open("ref.gz", "wb") as raw:
            with GzipFile(filename="", mode="wb", fileobj=raw, **gz_kwargs) as gz:
                gz.write(payload)
        return md5sum("ref.gz")

    with InTemporaryDirectory():
        # Reference without filename and with zero mtime
        anon_chksum = reference_checksum(mtime=0)

        with DeterministicGzipFile("default.gz", "wb") as gz:
            wrapped = gz.myfileobj
            gz.write(payload)
        # GzipFile.close() is expected to close myfileobj; this guards
        # against GzipFile changing its internal implementation
        assert wrapped.closed

        assert md5sum("default.gz") == anon_chksum

        # Reference without filename but with the current time as mtime
        now = time.time()
        now_chksum = reference_checksum(mtime=now)

        with DeterministicGzipFile("now.gz", "wb", mtime=now) as gz:
            gz.write(payload)

        assert md5sum("now.gz") == now_chksum

        # The two classes differ in their default mtime
        with mock.patch("time.time", return_value=now):
            # Plain GzipFile falls back on time.time()
            assert reference_checksum() == now_chksum

            # DeterministicGzipFile falls back on 0
            with DeterministicGzipFile("now.gz", "wb") as gz:
                gz.write(payload)
            assert md5sum("now.gz") == anon_chksum

        # Plain GzipFile output depends on the filename ...
        with GzipFile("filenameA.gz", mode="wb", mtime=0) as gz:
            gz.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        assert fnameA_chksum != anon_chksum

        # ... whereas DeterministicGzipFile output does not
        with DeterministicGzipFile("filenameA.gz", "wb") as gz:
            gz.write(payload)

        assert md5sum("filenameA.gz") == anon_chksum


def test_DeterministicGzipFile_fileobj():
    """Check DeterministicGzipFile output when wrapping an open file object.

    Whatever ``filename`` is passed alongside ``fileobj`` (empty, omitted or
    a real name), the bytes written must match an anonymous, zero-mtime
    ``GzipFile`` reference.
    """
    with InTemporaryDirectory():
        msg = b"Hello, I'd like to have an argument."
        with open("ref.gz", "wb") as fobj:
            with GzipFile(filename="", mode="wb", fileobj=fobj, mtime=0) as gzobj:
                gzobj.write(msg)
        ref_chksum = md5sum("ref.gz")

        # Explicit empty filename
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        # The comparisons below were previously bare expressions, so they
        # could never fail; they are now real assertions.
        assert md5sum("test.gz") == ref_chksum

        # Filename omitted
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(fileobj=fobj, mode="wb") as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum

        # Real filename given in addition to fileobj
        with open("test.gz", "wb") as fobj:
            with DeterministicGzipFile(filename="test.gz", mode="wb", fileobj=fobj) as gzobj:
                gzobj.write(msg)
        assert md5sum("test.gz") == ref_chksum


def test_bitwise_determinism():
    """Files written through Opener are identical regardless of time and name."""
    with InTemporaryDirectory():
        payload = b"Hello, I'd like to have an argument."

        # Canonical reference: no filename, no mtime.
        # compresslevel=1 is presumably Opener's default for gzip -- confirm
        # against Opener if that default changes.
        with open("ref.gz", "wb") as raw:
            reference = GzipFile(filename="", mode="wb", compresslevel=1,
                                 fileobj=raw, mtime=0)
            with reference as gz:
                gz.write(payload)
        anon_chksum = md5sum("ref.gz")

        # Different times, different filenames
        now = time.time()
        with mock.patch("time.time") as clock:
            for fname, stamp in (("a.gz", now), ("b.gz", now + 1)):
                clock.return_value = stamp
                with Opener(fname, "wb") as out:
                    out.write(payload)

        assert md5sum("a.gz") == anon_chksum
        assert md5sum("b.gz") == anon_chksum

        # Users can still set mtime, but filenames will not be embedded
        for fname in ("filenameA.gz", "filenameB.gz"):
            with Opener(fname, "wb", mtime=0xCAFE10C0) as out:
                out.write(payload)
        fnameA_chksum = md5sum("filenameA.gz")
        fnameB_chksum = md5sum("filenameB.gz")
        assert fnameA_chksum == fnameB_chksum
        assert fnameB_chksum != anon_chksum

0 comments on commit ea68c4e

Please sign in to comment.