From 5667a3ad0489815c1239cba785300952c9799000 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 9 Mar 2017 09:50:04 -0500 Subject: [PATCH] TST: fix up compression tests / docs --- doc/source/io.rst | 55 +++--- doc/source/whatsnew/v0.20.0.txt | 40 +++-- pandas/tests/io/test_pickle.py | 289 ++++++++++++++++---------------- 3 files changed, 208 insertions(+), 176 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 67491c8b30de7..fdd33ab4625f3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3042,22 +3042,19 @@ any pickled pandas object (or any other pickled object) from file: See `this question `__ for a detailed explanation. -.. note:: - - These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. - .. _io.pickle.compression: -Read/Write compressed pickle files -'''''''''''''' +Compressed pickle files +''''''''''''''''''''''' .. versionadded:: 0.20.0 :func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can read -and write compressed pickle files. Compression types of ``gzip``, ``bz2``, ``xz`` supports -both read and write. ``zip`` file supports read only and must contain only one data file +and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing. +The ``zip`` file format only supports reading and must contain only one data file to be read in. -Compression type can be an explicitely parameter or be inferred from the file extension. + +The compression type can be an explicit parameter or be inferred from the file extension. If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively. 
@@ -3065,17 +3062,37 @@ If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ` df = pd.DataFrame({ 'A': np.random.randn(1000), - 'B': np.random.randn(1000), - 'C': np.random.randn(1000)}) - df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type - df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension - df.to_pickle("data.pkl.gz") # default, using "infer" - df["A"].to_pickle("s1.pkl.bz2") + 'B': 'foo', + 'C': pd.date_range('20130101', periods=1000, freq='s')}) + df + +Using an explicit compression type + +.. ipython:: python - df = pd.read_pickle("data.pkl.compress", compression="gzip") - df = pd.read_pickle("data.pkl.xz", compression="infer") - df = pd.read_pickle("data.pkl.gz") - s = pd.read_pickle("s1.pkl.bz2") + df.to_pickle("data.pkl.compress", compression="gzip") + rt = pd.read_pickle("data.pkl.compress", compression="gzip") + rt + +Inferring compression type from the extension + +.. ipython:: python + + df.to_pickle("data.pkl.xz", compression="infer") + rt = pd.read_pickle("data.pkl.xz", compression="infer") + rt + +The default is to 'infer' + +.. ipython:: python + + df.to_pickle("data.pkl.gz") + rt = pd.read_pickle("data.pkl.gz") + rt + + df["A"].to_pickle("s1.pkl.bz2") + rt = pd.read_pickle("s1.pkl.bz2") + rt .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4b320d21fe738..8f671062464f0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -102,23 +102,41 @@ Pickle file I/O now supports compression :func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can now read from and write to compressed pickle files. Compression methods can be an explicit parameter or be inferred from the file extension. -See :ref:`Read/Write compressed pickle files <io.pickle.compression>` +See :ref:`the docs here <io.pickle.compression>` .. 
ipython:: python df = pd.DataFrame({ 'A': np.random.randn(1000), - 'B': np.random.randn(1000), - 'C': np.random.randn(1000)}) - df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type - df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension - df.to_pickle("data.pkl.gz") # default, using "infer" - df["A"].to_pickle("s1.pkl.bz2") + 'B': 'foo', + 'C': pd.date_range('20130101', periods=1000, freq='s')}) + +Using an explicit compression type + +.. ipython:: python - df = pd.read_pickle("data.pkl.compress", compression="gzip") - df = pd.read_pickle("data.pkl.xz", compression="infer") - df = pd.read_pickle("data.pkl.gz") - s = pd.read_pickle("s1.pkl.bz2") + df.to_pickle("data.pkl.compress", compression="gzip") + rt = pd.read_pickle("data.pkl.compress", compression="gzip") + rt + +Inferring compression type from the extension + +.. ipython:: python + + df.to_pickle("data.pkl.xz", compression="infer") + rt = pd.read_pickle("data.pkl.xz", compression="infer") + rt + +The default is to 'infer' + +.. ipython:: python + + df.to_pickle("data.pkl.gz") + rt = pd.read_pickle("data.pkl.gz") + rt + df["A"].to_pickle("s1.pkl.bz2") + rt = pd.read_pickle("s1.pkl.bz2") + rt .. 
ipython:: python :suppress: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2fffc3c39ec26..91e70e942089c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -306,191 +306,188 @@ def test_pickle_v0_15_2(): # --------------------- # test pickle compression # --------------------- -_compression_to_extension = { - None: ".none", - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', -} - +@pytest.fixture def get_random_path(): return u'__%s__.pickle' % tm.rands(10) -def compress_file(src_path, dest_path, compression): - if compression is None: - shutil.copyfile(src_path, dest_path) - return - - if compression == 'gzip': - import gzip - f = gzip.open(dest_path, "w") - elif compression == 'bz2': - import bz2 - f = bz2.BZ2File(dest_path, "w") - elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(dest_path, "w", - compression=zipfile.ZIP_DEFLATED) - zip_file.write(src_path, os.path.basename(src_path)) - elif compression == 'xz': - lzma = pandas.compat.import_lzma() - f = lzma.LZMAFile(dest_path, "w") - else: - msg = 'Unrecognized compression type: {}'.format(compression) - raise ValueError(msg) - - if compression != "zip": - f.write(open(src_path, "rb").read()) - f.close() +class TestCompression(object): + _compression_to_extension = { + None: ".none", + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', + } -def decompress_file(src_path, dest_path, compression): - if compression is None: - shutil.copyfile(src_path, dest_path) - return + def compress_file(self, src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return - if compression == 'gzip': - import gzip - f = gzip.open(src_path, "r") - elif compression == 'bz2': - import bz2 - f = bz2.BZ2File(src_path, "r") - elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(src_path) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = 
zip_file.open(zip_names.pop()) + if compression == 'gzip': + import gzip + f = gzip.open(dest_path, "w") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(dest_path, "w") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(dest_path, "w", + compression=zipfile.ZIP_DEFLATED) + zip_file.write(src_path, os.path.basename(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(dest_path, "w") else: - raise ValueError('ZIP file {} error. Only one file per ZIP.' - .format(src_path)) - elif compression == 'xz': - lzma = pandas.compat.import_lzma() - f = lzma.LZMAFile(src_path, "r") - else: - msg = 'Unrecognized compression type: {}'.format(compression) - raise ValueError(msg) - - open(dest_path, "wb").write(f.read()) - f.close() + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + if compression != "zip": + f.write(open(src_path, "rb").read()) + f.close() -@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) -def test_write_explicit(compression): - # issue 11666 - if compression == 'xz': - tm._skip_if_no_lzma() + def decompress_file(self, src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return - base = get_random_path() - path1 = base + ".compressed" - path2 = base + ".raw" + if compression == 'gzip': + import gzip + f = gzip.open(src_path, "r") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(src_path, "r") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(src_path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError('ZIP file {} error. Only one file per ZIP.' 
+ .format(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(src_path, "r") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + open(dest_path, "wb").write(f.read()) + f.close() - # write to compressed file - df.to_pickle(p1, compression=compression) + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) + def test_write_explicit(self, compression, get_random_path): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() - # decompress - decompress_file(p1, p2, compression=compression) + base = get_random_path + path1 = base + ".compressed" + path2 = base + ".raw" - # read decompressed file - df2 = pd.read_pickle(p2, compression=None) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - tm.assert_frame_equal(df, df2) + # write to compressed file + df.to_pickle(p1, compression=compression) + # decompress + self.decompress_file(p1, p2, compression=compression) -@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) -def test_write_explicit_bad(compression): - with tm.assertRaisesRegexp(ValueError, - "Unrecognized compression type"): - with tm.ensure_clean(get_random_path()) as path: - df = tm.makeDataFrame() - df.to_pickle(path, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) -def test_write_infer(ext): - if ext == '.xz': - tm._skip_if_no_lzma() + @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) + def test_write_explicit_bad(self, compression, get_random_path): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, 
compression=compression) - base = get_random_path() - path1 = base + ext - path2 = base + ".raw" - compression = None - for c in _compression_to_extension: - if _compression_to_extension[c] == ext: - compression = c - break + @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) + def test_write_infer(self, ext, get_random_path): + if ext == '.xz': + tm._skip_if_no_lzma() - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + base = get_random_path + path1 = base + ext + path2 = base + ".raw" + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break - # write to compressed file by inferred compression method - df.to_pickle(p1) - - # decompress - decompress_file(p1, p2, compression=compression) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # read decompressed file - df2 = pd.read_pickle(p2, compression=None) + # write to compressed file by inferred compression method + df.to_pickle(p1) - tm.assert_frame_equal(df, df2) + # decompress + self.decompress_file(p1, p2, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) -@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) -def test_read_explicit(compression): - # issue 11666 - if compression == 'xz': - tm._skip_if_no_lzma() + tm.assert_frame_equal(df, df2) - base = get_random_path() - path1 = base + ".raw" - path2 = base + ".compressed" + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) + def test_read_explicit(self, compression, get_random_path): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + base = get_random_path + path1 = base + ".raw" + path2 = base + ".compressed" - # write to uncompressed file - df.to_pickle(p1, 
compression=None) + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # compress - compress_file(p1, p2, compression=compression) + # write to uncompressed file + df.to_pickle(p1, compression=None) - # read compressed file - df2 = pd.read_pickle(p2, compression=compression) + # compress + self.compress_file(p1, p2, compression=compression) - tm.assert_frame_equal(df, df2) + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + tm.assert_frame_equal(df, df2) -@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', - '.no_compress']) -def test_read_infer(ext): - if ext == '.xz': - tm._skip_if_no_lzma() + @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', + '.no_compress']) + def test_read_infer(self, ext, get_random_path): + if ext == '.xz': + tm._skip_if_no_lzma() - base = get_random_path() - path1 = base + ".raw" - path2 = base + ext - compression = None - for c in _compression_to_extension: - if _compression_to_extension[c] == ext: - compression = c - break + base = get_random_path + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() - # write to uncompressed file - df.to_pickle(p1, compression=None) + # write to uncompressed file + df.to_pickle(p1, compression=None) - # compress - compress_file(p1, p2, compression=compression) + # compress + self.compress_file(p1, p2, compression=compression) - # read compressed file by inferred compression method - df2 = pd.read_pickle(p2) + # read compressed file by inferred compression method + df2 = pd.read_pickle(p2) - tm.assert_frame_equal(df, df2) + tm.assert_frame_equal(df, df2)