In [1]:
import pandas as pd
from scipy.sparse import random
import numpy as np
import pickle
import bcolz
import feather
import os

In [2]:
# Benchmark input: a random sparse matrix with `n` rows and `features` columns,
# generated by scipy.sparse.random and stored in CSR format.
n = 1000000
features = 500
# density=0.1 asks for roughly 10% non-zero entries; the fixed random_state
# keeps the generated matrix identical across runs.
X = random(n, features, density=0.1, format='csr', random_state=4)

## Serialize to Pickle

In [3]:
%%time 
with open('data.pickle', 'wb') as f:
    pickle.dump(X, f)
    

CPU times: user 400 ms, sys: 810 ms, total: 1.21 s
Wall time: 1.86 s


In [4]:
%%time
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)

CPU times: user 206 ms, sys: 508 ms, total: 714 ms
Wall time: 713 ms


In [5]:
# Sanity check: the unpickled matrix should report the same (n, features) shape as X.
data.shape

(1000000, 500)

In [6]:
def _format_size(num_bytes):
    """Render a byte count with a binary-unit suffix, e.g. 2048 -> '2.0K'.

    Counts below 1024 are returned as a plain integer string. Larger values
    are scaled by powers of 1024 and shown with one decimal below 10 and no
    decimals otherwise, similar to the human-readable output of ``ls -h``.
    """
    size = float(num_bytes)
    unit = ''
    for next_unit in ('K', 'M', 'G', 'T', 'P'):
        if size < 1024:
            break
        size /= 1024
        unit = next_unit
    if not unit:
        return str(num_bytes)
    template = '{:.1f}{}' if size < 10 else '{:.0f}{}'
    return template.format(size, unit)


def print_size(f):
    """Print the size of the regular file at path ``f`` as ``Size:  <size>``.

    The size is read with ``os.path.getsize`` instead of parsing ``ls -lh``
    output, so paths containing spaces work and the result does not depend
    on the column layout of ``ls``.

    Parameters
    ----------
    f : str
        Path to a regular file.

    Raises
    ------
    OSError
        If ``f`` does not exist (``FileNotFoundError``) or is a directory
        (``IsADirectoryError``). A directory's own size is not the size of
        its contents, so it is rejected rather than reported misleadingly.
    """
    if os.path.isdir(f):
        raise IsADirectoryError(
            '{!r} is a directory; print_size only measures regular files'.format(f))
    print('Size: ', _format_size(os.path.getsize(f)))
# Report the on-disk size of the pickle file written above.
print_size('./data.pickle')    

Size:  576M


## Serialize to Feather format

In [7]:
# Flatten the sparse matrix into (row, column, value) triplets: one DataFrame
# row per stored entry, with columns ordered 'index', 'col', 'data'.
coo = X.tocoo(copy=False)
df = (
    pd.DataFrame(
        {'index': coo.row, 'col': coo.col, 'data': coo.data},
        columns=['index', 'col', 'data'],
    )
    # Order entries by row, then column, and discard the pre-sort index labels.
    .sort_values(['index', 'col'])
    .reset_index(drop=True)
)

In [8]:
%%time 
# Write the (index, col, data) triplet frame to 'data.feather'; %%time reports the cost of the write.
feather.write_dataframe(df, 'data.feather')


CPU times: user 308 ms, sys: 540 ms, total: 849 ms
Wall time: 2.17 s


In [9]:
# Report the on-disk size of the Feather file written above.
print_size('data.feather')    



Size:  763M


### Write as a sparse DataFrame

In [10]:
# Convert the triplet frame to pandas' sparse representation to compare
# serialization cost and size against the plain frame.
# NOTE(review): DataFrame.to_sparse() is deprecated in newer pandas releases and
# later removed — confirm the pandas version this notebook is pinned to.
sdf = df.to_sparse()

In [11]:
%%time 
# Same Feather write as for `df`, but starting from the sparse-converted frame `sdf`.
feather.write_dataframe(sdf, 'data_sparse.feather')

CPU times: user 30.7 s, sys: 1.85 s, total: 32.6 s
Wall time: 34 s


In [13]:
# Report the on-disk size of the Feather file written from the sparse frame.
print_size('data_sparse.feather')    

Size:  763M


## Serialize using bcolz

In [15]:
%%time
# Build a bcolz ctable from the triplet frame, passing 'data.bcolz' as its rootdir;
# that path is measured as a directory with `du` further down.
ct = bcolz.ctable.fromdataframe(df, rootdir='data.bcolz')

CPU times: user 899 ms, sys: 1.13 s, total: 2.03 s
Wall time: 1.59 s


In [17]:
# print size
def print_dir_size(f):
    s = !du -sh {f}
    print('Size: ' , s[0].split()[0])
print_dir_size('data.bcolz') 


Size:  402M


In [None]:
%% time
ct = bcolz.ctable.fromdataframe(sdf, rootdir='data_sparse.bcolz') #NOTE:  this takes for ever

In [None]:
# 'data_sparse.bcolz' is a bcolz rootdir (a directory), so measure it with
# print_dir_size, exactly as is done for 'data.bcolz'; print_size is for single files.
print_dir_size('data_sparse.bcolz')

