# Alternatives to pickle for driving file size down (and avoiding LFS)

In [23]:
import msgpack
import numpy as np
import pandas as pd
import pickle
import sys

# Check pickle structure...

In [60]:
# Load the precomputed data from disk.
# NOTE: unpickling can execute arbitrary code, so 'precomputed.pkl' must come
# from a trusted source.
data = pd.read_pickle('precomputed.pkl')

In [61]:
# Check what kind of object the pickle deserialized to.
type(data)

dict

In [6]:
# List the top-level entries stored in the dict.
data.keys()

dict_keys(['properties', 'version', 'matrix', 'abscissa'])

In [10]:
# Inspect the 'properties' entry.
data['properties']

{'charge': array([ 0.,  1.,  2., ...,  0.,  1., -1.]),
 'size': array([32., 32., 32., ..., 70., 70., 70.])}

In [12]:
# Shape of the 'charge' array inside 'properties'.
data['properties']['charge'].shape

(1877,)

In [13]:
# Shape of the 'size' array inside 'properties'.
data['properties']['size'].shape

(1877,)

In [14]:
# Version string stored alongside the data.
data['version']

'3.00'

In [16]:
# Shape of the 'matrix' entry; this is the large array the experiments below try to shrink.
data['matrix'].shape

(15996, 1877)

In [18]:
# Shape of the 'abscissa' entry.
data['abscissa'].shape

(15996,)

In [31]:
# Confirm the container type of the 'matrix' entry.
type(data['matrix'])

numpy.ndarray

In [33]:
# Keep a short handle to the matrix; the cells below refer to it by this name.
matrix = data['matrix']

# CSV (bad!)

In [32]:
# Dump the matrix as comma-separated text so its size on disk can be checked.
csv_path = 'matrix.csv'
np.savetxt(csv_path, data['matrix'], delimiter=',')

Result: roughly 740 MB.

# Sparse representation?

In [34]:
# NOTE(review): with no `sep` argument, ndarray.tofile writes raw binary and
# stores neither shape nor dtype, so the output is not a text file despite the
# '.txt' name, and it is a dense dump rather than a sparse representation.
matrix.tofile('testmatrix.txt')

In [40]:
from scipy.sparse import csc_matrix, save_npz, load_npz

In [36]:
# Convert the dense array to compressed sparse column (CSC) format.
sparse_matrix = csc_matrix(matrix)

In [39]:
# Write the sparse matrix to an .npz file to compare its size on disk.
save_npz('sparse_matrix.npz', sparse_matrix)

162MB!

In [41]:
# Read the file back to verify the round trip.
mm = load_npz('sparse_matrix.npz')

In [42]:
# Check which sparse format the loaded object comes back in.
type(mm)

scipy.sparse.csc.csc_matrix

In [45]:
# Expand the sparse matrix back to dense form for comparison with the original.
# NOTE(review): todense() yields a numpy.matrix rather than an ndarray;
# mm.toarray() would return a plain ndarray if downstream code needs one.
matriz = mm.todense()

In [46]:
# Check the type of the densified result.
type(matriz)

numpy.matrixlib.defmatrix.matrix

In [49]:
# Verify the sparse round trip reproduced the original matrix exactly.
# array_equal also compares shapes, so a shape mismatch reports False instead
# of being hidden by broadcasting in an elementwise `==`.
np.array_equal(matriz, matrix)

True

# Dataframe?

In [51]:
# Wrap the matrix in a DataFrame so pandas writers can be tried on it.
df = pd.DataFrame(matrix)

In [58]:
# Store the DataFrame in an HDF5 file under the key 'df'.
# NOTE(review): no complevel/complib is passed, so presumably no compression
# is applied — confirm against the pandas defaults before ruling HDF5 out.
df.to_hdf('test.h5', key='df')

Result: 240 MB, the same as the pickle.

# Split into multiple files?

In [62]:
# Cut the matrix into 4 chunks along the first axis.
# np.array_split is used rather than np.split because np.split raises a
# ValueError when the rows do not divide evenly; for evenly divisible input
# both produce the same chunks.
tmp = np.array_split(matrix, 4)

In [65]:
# Number of chunks produced by the split.
len(tmp)

4

In [68]:
# Persist every chunk as its own pickled DataFrame, numbering the files from 1.
for part_number, chunk in enumerate(tmp, start=1):
    chunk_path = 'matrix_split_' + str(part_number) + '.pkl'
    pd.DataFrame(chunk).to_pickle(chunk_path)

Files created (60 MB each):
- matrix_split_1.pkl
- matrix_split_2.pkl
- matrix_split_3.pkl
- matrix_split_4.pkl

In [84]:
# Load the chopped-up matrix.
# Read every part first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration, and seeding the
# result with an empty DataFrame is unnecessary.
# NOTE: read_pickle can execute arbitrary code; only load files you created.
dataframe = pd.concat(
    [pd.read_pickle('matrix_split_' + str(index) + '.pkl') for index in range(1, 5)],
    ignore_index=True,
)

In [85]:
# Pull the underlying NumPy array out of the reassembled DataFrame.
loaded_matrix = dataframe.values

In [86]:
# Shape of the reassembled matrix.
loaded_matrix.shape

(15996, 1877)

In [87]:
# Shape of the original matrix, for comparison with the reassembled one.
matrix.shape

(15996, 1877)

In [88]:
# Verify the split/reload round trip reproduced the original matrix exactly.
# array_equal also compares shapes, so a shape mismatch reports False instead
# of being hidden by broadcasting in an elementwise `==`.
np.array_equal(matrix, loaded_matrix)

True

Success!