Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write pickle to file-like without intermediate in-memory buffer #37056

Merged
merged 9 commits into from
Oct 14, 2020
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,11 @@ def time_read_pickle(self):
def time_write_pickle(self):
    """Write ``self.df`` to ``self.fname`` with ``DataFrame.to_pickle``."""
    frame, target = self.df, self.fname
    frame.to_pickle(target)

def peakmem_read_pickle(self):
    """Load the pickle stored at ``self.fname`` with ``read_pickle``.

    The loaded object is discarded; only the act of reading matters here.
    """
    source = self.fname
    read_pickle(source)

def peakmem_write_pickle(self):
    """Write ``self.df`` to ``self.fname`` with ``DataFrame.to_pickle``.

    Same body as ``time_write_pickle``; the ``peakmem`` name prefix is what
    distinguishes this benchmark.
    """
    frame, target = self.df, self.fname
    frame.to_pickle(target)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ Performance improvements
avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in Python 3.8+ (:issue:`37056`)
ig248 marked this conversation as resolved.
Show resolved Hide resolved

.. ---------------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def to_pickle(
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
try:
f.write(pickle.dumps(obj, protocol=protocol))
pickle.dump(obj, f, protocol=protocol)
finally:
if f != filepath_or_buffer:
# do not close user-provided file objects GH 35679
Expand Down
44 changes: 30 additions & 14 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"""
import bz2
import datetime
import functools
import glob
import gzip
import io
Expand All @@ -24,7 +25,7 @@

import pytest

from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian
from pandas.compat import PY38, get_lzma_file, import_lzma, is_platform_little_endian
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -155,28 +156,43 @@ def test_pickles(current_pickle_data, legacy_pickle):
compare(current_pickle_data, legacy_pickle, version)


def test_round_trip_current(current_pickle_data):
def python_pickler(obj, path):
with open(path, "wb") as fh:
pickle.dump(obj, fh, protocol=-1)
def python_pickler(obj, path):
    """Serialize ``obj`` to the file at ``path`` using the stdlib pickler.

    Parameters
    ----------
    obj : object
        Any picklable object.
    path : str or path-like
        Destination file; it is created or truncated and opened in binary mode.
    """
    with open(path, mode="wb") as sink:
        # A negative protocol asks ``pickle`` for its highest available protocol.
        pickle.dump(obj, sink, protocol=-1)

def python_unpickler(path):
with open(path, "rb") as fh:
fh.seek(0)
return pickle.load(fh)

def python_unpickler(path):
    """Load and return the object pickled in the file at ``path``.

    Parameters
    ----------
    path : str or path-like
        File containing a pickle stream; opened in binary read mode.

    Returns
    -------
    object
        The first object deserialized from the file by ``pickle.load``.
    """
    # A handle freshly opened in "rb" mode already sits at offset 0, so no
    # explicit rewind is needed before loading.
    with open(path, "rb") as fh:
        return pickle.load(fh)


@pytest.mark.parametrize(
"pickle_writer",
[
pytest.param(python_pickler, id="python"),
jreback marked this conversation as resolved.
Show resolved Hide resolved
pytest.param(pd.to_pickle, id="pandas_proto_default"),
pytest.param(
functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
id="pandas_proto_highest",
),
pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
pytest.param(
functools.partial(pd.to_pickle, protocol=5),
id="pandas_proto_5",
marks=pytest.mark.skipif(not PY38, reason="protocol 5 not supported"),
),
],
)
def test_round_trip_current(current_pickle_data, pickle_writer):
data = current_pickle_data
for typ, dv in data.items():
for dt, expected in dv.items():

for writer in [pd.to_pickle, python_pickler]:
if writer is None:
continue

with tm.ensure_clean() as path:

# test writing with each pickler
writer(expected, path)
pickle_writer(expected, path)

# test reading with each unpickler
result = pd.read_pickle(path)
Expand Down