In [1]:
import os
from pickle import dump, load

import numpy as np
import pandas as pd

In [2]:
# Dense integer matrix with 10000 rows and 4000 columns, initially all zeros.
sparse = np.zeros(shape=(10000, 4000), dtype=int)

In [3]:
# Number of random (row, column) positions to draw.
n = 1000000
# One sorted array of random indices per axis of `sparse` (rows first, then
# columns); the two arrays are drawn and sorted independently of each other.
idx = tuple(np.sort(np.random.randint(0, dim, n)) for dim in sparse.shape)

In [4]:
# Write a random integer from 1 to 5 (upper bound 6 is exclusive) at each
# of the n drawn positions.
sparse[idx] = np.random.randint(low=1, high=6, size=n)

In [5]:
# Display the dense matrix; NumPy abbreviates the repr of a large array.
sparse

array([[2, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [3, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 5, 1],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5]])

In [6]:
# Sum of all entries of the dense matrix.
sparse.sum()

41661

In [7]:
class DCV:
    """
    Dict of Common Values is a sparse matrix representation that is
    useful for integer valued matrices with a relatively small number
    of unique entries, e.g. user-score matrices.

    The sparse form is a nested dict ``{value: {column: [rows]}}`` that
    lists, for every distinct nonzero value, the columns it appears in
    and the rows of each of those columns where it occurs.

    Attributes
    ----------
    sparse : dict or None
        Sparse representation of the dense form, created by
        accumulating common values of its (row, column, value) triplets.
        ``None`` until `to_sparse` or `load_sparse` has been called.
    shape : tuple or None
        Number of rows and columns of the dense matrix.
    dtype : data-type or None
        Data type of the dense matrix.
    """
    def __init__(self):
        self.__sparse = None
        self.__shape = None
        self.__dtype = None

    @property
    def sparse(self):
        """dict or None: nested ``{value: {column: [rows]}}`` mapping."""
        return self.__sparse

    @property
    def shape(self):
        """tuple or None: shape of the dense matrix."""
        return self.__shape

    @property
    def dtype(self):
        """data-type or None: dtype of the dense matrix."""
        return self.__dtype

    def to_sparse(self, x):
        """
        Build the sparse representation of a dense matrix.

        Parameters
        ----------
        x : array_like
            Two-dimensional dense matrix. Zero entries are not stored.

        Returns
        -------
        DCV
            This instance, so calls can be chained.

        Raises
        ------
        ValueError
            If `x` is not two-dimensional.
        """
        x = np.asarray(x)
        if x.ndim != 2:
            raise ValueError('`x` must be a 2-dimensional matrix.')

        rows, cols = np.nonzero(x)
        # Read the values from the argument itself, never from a global.
        vals = x[rows, cols]

        # Order the triplets by value, then column, then row so that dict
        # keys and row lists come out in ascending order.
        order = np.lexsort((rows, cols, vals))

        # `.tolist()` yields native Python scalars, keeping indices integral
        # regardless of the value dtype and keeping the pickle portable.
        by_value = {}
        for val, col, row in zip(vals[order].tolist(),
                                 cols[order].tolist(),
                                 rows[order].tolist()):
            by_value.setdefault(val, {}).setdefault(col, []).append(row)

        self.__shape = x.shape
        self.__dtype = x.dtype
        self.__sparse = by_value

        return self

    def to_dense(self):
        """
        Rebuild the dense matrix from the sparse representation.

        Returns
        -------
        numpy.ndarray
            Array of shape `shape` and dtype `dtype`.

        Raises
        ------
        AttributeError
            If no sparse representation has been created or loaded yet.
        """
        if self.__sparse is None:
            raise AttributeError('Use either `to_sparse` or `load_sparse` '
                                 'before calling `to_dense`.')

        output = np.zeros(self.__shape, dtype=self.__dtype)
        for val, loc in self.__sparse.items():
            for col, rows in loc.items():
                # Assign every row of this column in one vectorised step.
                output[rows, col] = val

        return output

    def save_sparse(self, filename):
        """
        Pickle the shape, dtype and sparse representation to a file.

        Parameters
        ----------
        filename : str or path-like
            Destination path; a leading ``~`` is expanded to the user's
            home directory.

        Raises
        ------
        AttributeError
            If no sparse representation has been created or loaded yet.
        """
        if self.__sparse is None:
            raise AttributeError('''Use either `to_sparse` to create sparse representation 
                                 or `load_sparse` to load existing before saving.''')

        # open() does not expand `~` on its own.
        with open(os.path.expanduser(filename), 'wb') as f:
            dump({'shape':self.__shape,'dtype':self.__dtype,'sparse':self.__sparse}, f)

    def load_sparse(self, filename):
        """
        Load a representation previously written by `save_sparse`.

        Parameters
        ----------
        filename : str or path-like
            Source path; a leading ``~`` is expanded to the user's home
            directory.

        Returns
        -------
        DCV
            This instance, so calls can be chained.
        """
        with open(os.path.expanduser(filename), 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only load
            # files that come from a trusted source.
            tmp = load(f)

        self.__shape = tmp['shape']
        self.__dtype = tmp['dtype']
        self.__sparse = tmp['sparse']

        return self

In [8]:
# Build the Dict-of-Common-Values representation of the dense matrix.
cv = DCV()
cv.to_sparse(x=sparse)

<__main__.DCV at 0x114477518>

In [9]:
# Rows that hold the value 1 in column 1. The matrix is filled with random
# data, so this value/column pair may be absent; fall back to an empty list
# instead of raising KeyError.
cv.sparse.get(1, {}).get(1, [])

[3]

In [10]:
# Pickle the sparse representation to disk.
cv.save_sparse(filename='~/Desktop/sparse')

In [11]:
# Rebuild the dense matrix from the sparse representation and display it.
cv.to_dense()

array([[2, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [3, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 5, 1],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5]])

In [12]:
# Number of entries where the rebuilt matrix differs from the original.
# Counting mismatches is used instead of summing signed differences,
# because positive and negative errors could cancel to 0.
np.count_nonzero(sparse != cv.to_dense())

0

In [13]:
# Start from a fresh instance and restore the representation saved earlier.
cv = DCV()
cv.load_sparse(filename='~/Desktop/sparse')

<__main__.DCV at 0x114477eb8>

In [14]:
# Rebuild the dense matrix from the representation loaded from disk.
cv.to_dense()

array([[2, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [3, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 5, 1],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5]])

In [15]:
# Number of entries where the matrix rebuilt from the loaded file differs
# from the original. Counting mismatches is used instead of summing signed
# differences, because positive and negative errors could cancel to 0.
np.count_nonzero(sparse != cv.to_dense())

0