In [1]:
import pandas as pd
from scipy.sparse import random
import numpy as np
import pickle
import bcolz
import feather
import os

## Benchmark different serialization formats
### Hardware: MacBook Pro, 2.2 GHz Intel Core i7, 16 GB RAM

## Simulate data

In [2]:
# Problem size: number of rows (samples) and number of columns (features).
n, features = 10 ** 6, 500

# Random sparse matrix in CSR layout with 70% of the entries non-zero.
# The fixed random_state makes every run generate the same matrix.
X = random(
    n,
    features,
    density=0.7,
    format='csr',
    random_state=4,
)

In [3]:
# Sanity check: the simulated matrix should be n rows by `features` columns.
X.shape

(1000000, 500)

## Serialize to Pickle

In [4]:
%%time 
# Workaround for very large pickles: instead of handing the whole
# multi-gigabyte buffer to a single write() call, write it incrementally in
# chunks of at most max_bytes bytes.
bytes_out = pickle.dumps(X)
# Chunk over the real payload length. A fixed bound of 2**31 only ever yields
# two chunks (just under 4 GiB in total) and would silently truncate any
# pickle larger than that.
n_bytes = len(bytes_out)
max_bytes = 2**31 - 1
with open('data.pickle', 'wb') as f_out:
    for idx in range(0, n_bytes, max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])


CPU times: user 5.36 s, sys: 9.12 s, total: 14.5 s
Wall time: 20.4 s


In [5]:
%%time
# Read the pickle back in pieces of at most max_bytes (the chunk size set by
# the writing cell), then deserialize the reassembled buffer.
file_path = 'data.pickle'
input_size = os.path.getsize(file_path)
bytes_in = bytearray()
with open(file_path, 'rb') as f_in:
    # ceiling division: number of chunks needed to cover input_size bytes
    n_chunks = -(-input_size // max_bytes)
    for _ in range(n_chunks):
        bytes_in.extend(f_in.read(max_bytes))
# The file was produced by this notebook; never unpickle untrusted data.
data2 = pickle.loads(bytes_in)

CPU times: user 4.83 s, sys: 16.6 s, total: 21.5 s
Wall time: 26.8 s


In [6]:
# Confirm the matrix loaded back from the pickle has the expected dimensions.
data2.shape

(1000000, 500)

In [7]:
# Drop the reference to the deserialized matrix so its memory can be reclaimed.
del data2

In [8]:
def print_size(f):
    """Print the size of the file at path ``f`` in human-readable form.

    The size is read with ``os.path.getsize`` instead of parsing the output
    of ``ls -lh``. The result therefore does not depend on the shell, on the
    column layout of ``ls`` or on the path being free of spaces, and a
    missing file raises instead of printing a fragment of an error message.

    Parameters
    ----------
    f : str
        Path of the file to measure.

    Raises
    ------
    OSError
        If ``f`` does not exist or cannot be accessed.
    """
    size = float(os.path.getsize(f))
    units = ['B', 'K', 'M', 'G', 'T']
    unit = units.pop(0)
    # Scale by powers of 1024 until the value fits the current unit.
    while size >= 1024 and units:
        size /= 1024
        unit = units.pop(0)
    # Whole bytes have no fractional part; larger units keep one decimal.
    if unit == 'B':
        text = '%d%s' % (size, unit)
    else:
        text = '%.1f%s' % (size, unit)
    print('Size: ' , text)
# Report the on-disk size of the pickle file.
print_size('./data.pickle')    

Size:  3.9G


## Serialize to Feather format

In [9]:
# Expand the sparse matrix into a dense DataFrame (every zero is materialized).
# toarray() yields a plain ndarray, whereas todense() returns the legacy
# numpy.matrix type; the resulting DataFrame is the same.
df = pd.DataFrame(X.toarray())

In [10]:
# The DataFrame `df` now holds the data; drop the reference to the sparse matrix.
del X

In [11]:
# Label the columns C0, C1, ... using the frame's actual width rather than a
# hard-coded 500, so the names stay in sync if `features` is changed.
df.columns = ['C' + str(i) for i in range(df.shape[1])]
df.shape

(1000000, 500)

In [12]:
%%time 
# Write the whole DataFrame to a Feather file in the working directory.
feather.write_dataframe(df, 'data.feather')

CPU times: user 32.1 s, sys: 28.7 s, total: 1min
Wall time: 25.5 s


In [13]:
# Report the on-disk size of the Feather file.
print_size('./data.feather')    

Size:  3.7G


In [14]:
%%time 
# Load the Feather file back into a new DataFrame to time the read path.
pdf = feather.read_dataframe('data.feather')

CPU times: user 2.62 s, sys: 8.01 s, total: 10.6 s
Wall time: 30.5 s


In [15]:
# Confirm the frame read from Feather has the expected dimensions.
pdf.shape

(1000000, 500)

In [16]:
# Drop the reference to the reloaded frame so its memory can be reclaimed.
del pdf

## Serialize using bcolz

In [22]:
%%time
# Persist the DataFrame as an on-disk bcolz ctable under data.bcolz.
# mode='w' replaces a data.bcolz directory left over from a previous run;
# with the default mode, creation fails when the directory already exists,
# which prevents this cell from being re-run.
ct = bcolz.ctable.fromdataframe(df, rootdir='data.bcolz', mode='w')

CPU times: user 21.4 s, sys: 23.6 s, total: 45 s
Wall time: 49.6 s


In [23]:
def print_dir_size(f):
    """Print the total size of everything under path ``f``, human-readable.

    The total is computed by walking the tree and summing file sizes instead
    of parsing the output of ``du -sh``. The result therefore does not depend
    on the shell or on the path being free of spaces, and a missing path
    raises instead of printing a fragment of an error message.

    Note that this is the sum of the files' byte lengths (the same measure
    ``ls -l`` reports for a single file), not the number of disk blocks
    allocated, so it can differ slightly from what ``du`` shows.

    Parameters
    ----------
    f : str
        Path of a directory (or a single file) to measure.

    Raises
    ------
    OSError
        If ``f`` does not exist or cannot be accessed.
    """
    if os.path.isdir(f):
        total = 0
        for root, _dirs, files in os.walk(f):
            for name in files:
                # lstat: count a symlink itself, not the file it points to
                total += os.lstat(os.path.join(root, name)).st_size
    else:
        # A plain file is measured directly; a missing path raises OSError.
        total = os.lstat(f).st_size

    size = float(total)
    units = ['B', 'K', 'M', 'G', 'T']
    unit = units.pop(0)
    # Scale by powers of 1024 until the value fits the current unit.
    while size >= 1024 and units:
        size /= 1024
        unit = units.pop(0)
    # Whole bytes have no fractional part; larger units keep one decimal.
    if unit == 'B':
        text = '%d%s' % (size, unit)
    else:
        text = '%.1f%s' % (size, unit)
    print('Size: ' , text)
# Report the total on-disk size of the bcolz directory.
print_dir_size('data.bcolz') 

Size:  3.6G


In [24]:
%%time
# Build a ctable bound to the data.bcolz directory (presumably this opens the
# table persisted by the earlier cell -- confirm against the bcolz docs), then
# convert it to a pandas DataFrame to time the read path.
ct = bcolz.ctable(rootdir='data.bcolz') 
df2 = ct.todataframe()

CPU times: user 8.05 s, sys: 17.2 s, total: 25.2 s
Wall time: 21.4 s


In [25]:
# Confirm the frame rebuilt from bcolz has the expected dimensions.
df2.shape

(1000000, 500)

In [26]:
# Drop the reference to the rebuilt frame so its memory can be reclaimed.
del df2