In [1]:
import pandas as pd
from scipy.sparse import random
import numpy as np
import pickle
import bcolz
import feather
import os

## Simulate data

In [2]:
# Dimensions of the synthetic matrix: number of rows, number of feature columns.
n, features = 1000000, 500
# Random CSR matrix with density 0.7; the fixed random_state keeps runs repeatable.
X = random(n, features, density=0.7, format='csr', random_state=4)

In [3]:
# Display the dimensions of the simulated matrix as a sanity check.
X.shape

(1000000, 500)

## Serialize to Pickle

In [9]:
%%time 
# Serialize X to 'data.pickle'.  The pickled bytes are written in chunks of at
# most 2**31 - 1 bytes, because a single write of 2 GiB or more can fail on
# some platforms (presumably the limitation this workaround was written for).
bytes_out = pickle.dumps(X)
# Use the real payload length as the loop bound.  A hard-coded bound of 2**31
# only ever produces two chunks, so anything beyond ~4 GiB would be silently
# dropped and the file on disk would be a truncated, unloadable pickle.
n_bytes = len(bytes_out)
max_bytes = 2**31 - 1  # chunk size; the cell that reads the file back reuses it
with open('data.pickle', 'wb') as f_out:
    for idx in range(0, n_bytes, max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])


CPU times: user 8.38 s, sys: 35 s, total: 43.4 s
Wall time: 1min 33s


In [10]:
%%time
# Read 'data.pickle' back in pieces of at most max_bytes (set by the cell that
# wrote the file), then unpickle the reassembled buffer.
file_path = 'data.pickle'
input_size = os.path.getsize(file_path)
bytes_in = bytearray(0)
with open(file_path, 'rb') as f_in:
    # One read per max_bytes-sized slice of the file; the offset itself is unused.
    for offset in range(0, input_size, max_bytes):
        bytes_in.extend(f_in.read(max_bytes))
data2 = pickle.loads(bytes_in)

CPU times: user 5.05 s, sys: 17.7 s, total: 22.8 s
Wall time: 31.9 s


In [11]:
# Display the dimensions of the object loaded back from the pickle file.
data2.shape

(1000000, 500)

In [12]:
# print size
def print_size(f):
    """Print the size of the file at path ``f`` in human-readable form.

    The size is read with ``os.path.getsize`` instead of parsing the output
    of a shell ``ls -lh`` call, so the helper works for paths containing
    spaces and does not depend on IPython shell magics or on ``ls``.

    The printed value uses binary units in the style of ``ls -lh``
    (for example ``512``, ``1.5K``, ``20K``, ``3.9G``); rounding may differ
    from ``ls`` in the last digit.

    Parameters
    ----------
    f : str
        Path to an existing file.

    Raises
    ------
    OSError
        If ``f`` does not exist or cannot be inspected.
    """
    size = float(os.path.getsize(f))
    units = ['', 'K', 'M', 'G', 'T', 'P']
    step = 0
    # Scale down by 1024 until the value fits the current unit.
    while size >= 1024 and step < len(units) - 1:
        size /= 1024
        step += 1
    if step == 0:
        text = '%d' % size              # plain byte count, no suffix
    elif size < 9.95:
        text = '%.1f%s' % (size, units[step])   # one decimal for small values
    else:
        text = '%.0f%s' % (size, units[step])
    print('Size: ' , text)
# Report the size of the pickle file produced earlier in the notebook.
print_size('./data.pickle')

Size:  3.9G


## Serialize to Feather format

In [4]:
# Flatten the sparse matrix into (row, column, value) triplets so it can be
# stored by table-oriented formats.
coo = X.tocoo(copy=False)
df = (
    pd.DataFrame(
        {'index': coo.row, 'col': coo.col, 'data': coo.data},
        columns=['index', 'col', 'data'],  # fix the column order explicitly
    )
    .sort_values(by=['index', 'col'])      # order by row, then column
    .reset_index(drop=True)                # discard the pre-sort positions
)

In [13]:
%%time 
# Write the triplet DataFrame to 'data.feather' (timed by the %%time magic above).
feather.write_dataframe(df, 'data.feather')

CPU times: user 3.95 s, sys: 18.9 s, total: 22.8 s
Wall time: 52.5 s


In [14]:
# Report the size of the feather file for comparison with the other formats.
print_size('./data.feather')

Size:  5.2G


In [15]:
%%time 
# Read 'data.feather' back into a DataFrame to time the load path.
pdf = feather.read_dataframe('data.feather')

CPU times: user 3.91 s, sys: 13.5 s, total: 17.4 s
Wall time: 1min 4s


## Serialize using bcolz

In [16]:
%%time
# Build a bcolz ctable from the triplet DataFrame, using 'data.bcolz' as its
# root directory (the directory whose size is measured in a later cell).
ct = bcolz.ctable.fromdataframe(df, rootdir='data.bcolz')

CPU times: user 7.18 s, sys: 30.7 s, total: 37.9 s
Wall time: 16.6 s


In [17]:
# print size
def print_dir_size(f):
    """Print the total size of the directory tree at ``f`` in readable form.

    File sizes are summed with ``os.walk`` / ``os.lstat`` instead of parsing
    the output of a shell ``du -sh`` call, so the helper works for paths
    containing spaces and does not depend on IPython shell magics or ``du``.
    If ``f`` is a regular file, its own size is reported.

    Note: this sums file lengths (apparent size).  ``du`` reports allocated
    disk blocks, so its figure can differ slightly for the same directory.

    Parameters
    ----------
    f : str
        Path to an existing directory (or file).

    Raises
    ------
    OSError
        If ``f`` does not exist or cannot be inspected.
    """
    if os.path.isdir(f):
        total = 0
        for root, _subdirs, names in os.walk(f):
            for name in names:
                # lstat so that symlinks are measured, not followed
                total += os.lstat(os.path.join(root, name)).st_size
    else:
        total = os.lstat(f).st_size  # raises OSError when the path is missing
    size = float(total)
    units = ['', 'K', 'M', 'G', 'T', 'P']
    step = 0
    # Scale down by 1024 until the value fits the current unit.
    while size >= 1024 and step < len(units) - 1:
        size /= 1024
        step += 1
    if step == 0:
        text = '%d' % size              # plain byte count, no suffix
    elif size < 9.95:
        text = '%.1f%s' % (size, units[step])   # one decimal for small values
    else:
        text = '%.0f%s' % (size, units[step])
    print('Size: ' , text)
# Report the total size of the bcolz root directory.
print_dir_size('data.bcolz')


Size:  2.6G


In [22]:
%%time
# Repeat the conversion into a second root directory ('data_.bcolz') to time
# it again; note that this rebinds ct to the new table.
ct = bcolz.ctable.fromdataframe(df, rootdir='data_.bcolz')

CPU times: user 5.94 s, sys: 5.2 s, total: 11.1 s
Wall time: 10 s
