[[1, 0, 0, 0, 2],
 [0, 3, 0, 4, 0],
 [0, 0, 0, 0, 0]
 [5, 0, 0, 6, 0]]

In [3]:
import numpy as np

x = [[1, 0, 0, 0, 2],
     [0, 3, 0, 4, 0],
     [0, 0, 0, 0, 0],
     [5, 0, 0, 6, 0]]
x = np.asarray(x)
print(x)

[[1 0 0 0 2]
 [0 3 0 4 0]
 [0 0 0 0 0]
 [5 0 0 6 0]]


In [16]:
rows = [0, 0, 1, 1, 3, 3]
cols = [0, 4, 1, 3, 0, 3]
data = [1, 2, 3, 4, 5, 6]

In [19]:
csr_matrix((data, (rows, cols))).todense()

matrix([[1, 0, 0, 0, 2],
        [0, 3, 0, 4, 0],
        [0, 0, 0, 0, 0],
        [5, 0, 0, 6, 0]], dtype=int64)

## dok matrix

In [5]:
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import dok_matrix

dok = dok_matrix(x)

In [8]:
dok.keys()

dict_keys([(0, 0), (0, 4), (1, 1), (1, 3), (3, 0), (3, 3)])

In [9]:
dok.values()

dict_values([1, 2, 3, 4, 5, 6])

In [10]:
for key, value in dok.items():
    print('{} = {}'.format(key, value))

(0, 0) = 1
(0, 4) = 2
(1, 1) = 3
(1, 3) = 4
(3, 0) = 5
(3, 3) = 6


In [11]:
dok.nnz

6

In [13]:
print(dok[0,3])
print(dok[0,4])

0
2


## coo matrix

In [14]:
coo = coo_matrix(x)

(row, column, value) list 

연산 불가. 저장 및 확인

### csr matrix

compressed sparse row (CSR) 

In [49]:
csr = csr_matrix(x)

In [50]:
print(csr.indices)

[0 4 1 3 0 3]


In [51]:
print(csr.indptr) # [0 2 4 4 6]

[0 2 4 4 6]


In [52]:
len(csr.indptr) # num of rows + 1

5

In [53]:
print(csr.data) # [1 2 3 4 5 6]

[1 2 3 4 5 6]


In [26]:
csr_rows, csr_cols = csr.nonzero()
print(csr_rows) # array([0, 0, 1, 1, 3, 3], dtype=int32)
print(csr_cols) # array([0, 4, 1, 3, 0, 3], dtype=int32)

[0 0 1 1 3 3]
[0 4 1 3 0 3]


In [27]:
for i, (b, e) in enumerate(zip(csr.indptr, csr.indptr[1:])):
    for idx in range(b, e):
        j = csr.indices[idx]
        d = csr.data[idx]
        print('({}, {}) = {}'.format(i, j, d))

(0, 0) = 1
(0, 4) = 2
(1, 1) = 3
(1, 3) = 4
(3, 0) = 5
(3, 3) = 6


## csc matrix

compressed sparse column (CSC) 

In [28]:
csc = csc_matrix(x)

In [33]:
len(csc.indptr) # num of columns + 1

6

In [29]:
print(csc.indices) # [0 3 1 1 3 0]
print(csc.indptr)
print(csc.data)

[0 3 1 1 3 0]
[0 2 3 3 5 6]
[1 5 3 4 6 2]


In [30]:
for j, (b, e) in enumerate(zip(csc.indptr, csc.indptr[1:])):
    for idx in range(b, e):
        i = csc.indices[idx]
        d = csc.data[idx]
        print('({}, {}) = {}'.format(i, j, d))

(0, 0) = 1
(3, 0) = 5
(1, 1) = 3
(1, 3) = 4
(3, 3) = 6
(0, 4) = 2


## IO

In [42]:
from scipy.io import mmwrite
from scipy.io import mmread

In [44]:
mmwrite('csr.mtx', csr)

In [57]:
from scipy.sparse import csr_matrix

def my_read(path):
    with open(path) as f:
        # skip head
        for _ in range(3):
            next(f)
        rows = []
        cols = []
        data = []
        for line in f:
            elements = line.split()
            i = int(elements[0])
            j = int(elements[1])
            d = float(elements[2])
            rows.append(i)
            cols.append(j)
            data.append(d)
    return csr_matrix((data, (rows, cols)), dtype=np.int)

my_read('csr.mtx').todense()

matrix([[0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 2],
        [0, 0, 3, 0, 4, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 5, 0, 0, 6, 0]])

In [45]:
with open('csr.mtx') as f:
    for _ in range(5):
        print(next(f).strip())

%%MatrixMarket matrix coordinate integer general
%
4 5 6
1 1 1
1 5 2


In [46]:
loaded = mmread('csr.mtx')
print(type(loaded))

<class 'scipy.sparse.coo.coo_matrix'>


In [47]:
loaded = mmread('csr.mtx').tocsr()
print(type(loaded))

<class 'scipy.sparse.csr.csr_matrix'>


## Avoid for slicing

class csr_matrix():
    # ...

    def _get_submatrix(self, row_slice, col_slice):
        """Return a submatrix of this matrix (new matrix is created)."""

        def process_slice(sl, num):
            # ...
            return i0, i1

        M,N = self.shape
        i0, i1 = process_slice(row_slice, M)
        j0, j1 = process_slice(col_slice, N)

        indptr, indices, data = get_csr_submatrix(
            M, N, self.indptr, self.indices, self.data, i0, i1, j0, j1)

        shape = (i1 - i0, j1 - j0)
        return self.__class__((data, indices, indptr), shape=shape,
                              dtype=self.dtype, copy=False)

In [58]:
for i in range(csr.shape[0]):
    row = csr[i,:]
    # ...
    

In [41]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin

pairwise_distances(X, Y=None, metric='euclidean')

- cityblock : manhattan_distances
- cosine : 1 - cosine similarity
- euclidean : L2 

In [34]:
import numpy as np
np.dot?

In [59]:
csr_zero = csr.copy()

In [60]:
csr_zero[:,3] = 0



In [61]:
len(csr_zero.data)

8

In [66]:
csr.data

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [62]:
csr_zero.data

array([1, 0, 2, 3, 0, 0, 5, 0], dtype=int64)

In [63]:
rows, cols = csr_zero.nonzero()

In [64]:
cols

array([0, 4, 1, 0], dtype=int32)

In [65]:
rows

array([0, 0, 1, 3], dtype=int32)

In [70]:
def remove_column(x, idx):
    rows, cols = x.nonzero()
    data = x.data
    
    rows_, cols_, data_ = [], [], []
    for r, c, d in zip(rows, cols, data):
        if c == idx:
            continue
        rows_.append(r)
        cols_.append(c)
        data_.append(d)
    return x.__class__((data_, (rows_, cols_)), shape=x.shape)

def remove_column_2(x_csr, idx):
    indices = x_csr.indices
    indptr = x_csr.indptr
    data = x_csr.data
    
    indices_, indptr_, data_ = [], [], []
    n_skips = 0
    for b, e in zip(indptr, indptr[1:]):
        for idx in range(b, e):
            j = indices[idx]
            if j == idx:
                n_skips += 1
                
    
    
    

csr_remove = remove_column(csr, 3)

(6,)
(6,)
(6,)


In [71]:
csr_remove.nnz

4