## vector
- spark에서 type field를 통해 vector 종류 식별
    - dense vector (1)
    - sparse vector (0)

### dense vector

In [8]:
import numpy as np
# Build a dense vector as a plain NumPy array and inspect its value and type.
dv = np.array([1.0, 2.1, 3])
print(dv)
print(type(dv))

[1.  2.1 3. ]
<class 'numpy.ndarray'>


In [7]:
from pyspark.mllib.linalg import Vectors

# Build the same values as a Spark MLlib dense vector; the int 3 is shown
# as 3.0 in the printed output.
dvmllib = Vectors.dense([1.0, 2.1, 3])
print(dvmllib)
print(type(dvmllib))

[1.0,2.1,3.0]
<class 'pyspark.mllib.linalg.DenseVector'>


In [10]:
# Second dense vector, built with the same mllib Vectors factory as dvmllib.
dvlml = Vectors.dense([1.0, 2.1, 3])
print(dvlml)

[1.0,2.1,3.0]


In [16]:
# A DenseVector can be iterated element by element; print the elements on
# one line, each followed by a space.
for e in dvlml:
    print(e, end=' ')

1.0 2.1 3.0 

### sparse vector
- 실제 값이 없는 요소(값이 '0'인 요소)를 저장하지 않고, 값이 있는 위치와 그 값만 저장하는 vector

- 1차원 dense vector
    - [160, 69, 24]
- sparse vector
    - 3: 벡터 크기(컬럼 개수), [0,1,2]: 값이 있는 컬럼의 인덱스, [160.0, 69.0, 24.0]: 실제 값
    - (3,[0,1,2],[160.0, 69.0, 24.0])

In [19]:
# Sparse vector of size 3 with values 1.0 and 3.0 at indices 1 and 2;
# toArray() expands it to a dense array in which index 0 is 0.
sv1 = Vectors.sparse(3, [1, 2], [1.0, 3.0])
print(sv1)
print(sv1.toArray())

(3,[1,2],[1.0,3.0])
[0. 1. 3.]


In [23]:
import numpy as np
import scipy.sparse as sps
# Non-zero entries are given as parallel (row, col, value) triplets.
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
# Assemble a 3x3 compressed-sparse-column matrix and show it in dense form.
mtx = sps.csc_matrix((data, (row, col)), shape=(3, 3))
print(mtx.todense())

[[1 0 2]
 [0 0 3]
 [4 5 6]]


In [29]:
from pyspark.mllib.linalg import Matrix, Matrices
# 6x4 dense matrix. The flat value list is read column by column: each line
# below holds the six values of one column, as the toArray() output confirms.
dm = Matrices.dense(
    6, 4,
    [1, 2, 0, 0, 0, 0,
     0, 3, 0, 4, 0, 0,
     0, 0, 5, 6, 7, 0,
     0, 0, 0, 0, 0, 8],
)
dm.toArray()

array([[1., 0., 0., 0.],
       [2., 3., 0., 0.],
       [0., 0., 5., 0.],
       [0., 4., 6., 0.],
       [0., 0., 7., 0.],
       [0., 0., 0., 8.]])

In [30]:
# Convert the dense matrix to its sparse form; the cell output shows the
# resulting SparseMatrix holding only the eight non-zero values.
dm.toSparse()

SparseMatrix(6, 4, [0, 2, 4, 7, 8], [0, 1, 1, 3, 2, 3, 4, 5], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], False)

In [31]:
# Display dm again: it is still a DenseMatrix, i.e. the earlier toSparse()
# call returned a new object rather than changing dm.
dm

DenseMatrix(6, 4, [1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, ..., 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0], False)

In [34]:
# 3x2 sparse matrix in CSC form: column pointers [0, 1, 3], row indices
# [0, 2, 1] and values [9, 6, 8] yield the entries (0,0)=9, (2,1)=6, (1,1)=8.
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
print(sm)

3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0


In [33]:
# Expand the sparse matrix into its dense equivalent (missing entries are
# shown as 0.) and print it.
d = sm.toDense()
print(d)

DenseMatrix([[9., 0.],
             [0., 8.],
             [0., 6.]])


In [36]:
# Three rows of three floats each, distributed through the Spark context.
# NOTE(review): `spark` is not defined in the visible cells - presumably an
# existing SparkSession; confirm it is created earlier in the notebook.
p = [
    [1.0, 2.0, 3.0],
    [1.1, 2.1, 3.1],
    [1.2, 2.2, 3.2],
]
my = spark.sparkContext.parallelize(p)

In [38]:
# Collect the distributed data back into a local Python list of rows.
my.collect()

[[1.0, 2.0, 3.0], [1.1, 2.1, 3.1], [1.2, 2.2, 3.2]]

In [40]:
from pyspark.mllib.linalg.distributed import RowMatrix
# Wrap the distributed row lists in a RowMatrix: each inner list becomes
# one row of the matrix.
rm = RowMatrix(my)
print(type(rm))

<class 'pyspark.mllib.linalg.distributed.RowMatrix'>


In [41]:
# Collect the matrix rows locally; each row comes back as a DenseVector.
rm.rows.collect()

[DenseVector([1.0, 2.0, 3.0]),
 DenseVector([1.1, 2.1, 3.1]),
 DenseVector([1.2, 2.2, 3.2])]

## Labeled Point
- label, features로 구성
    - label: supervised learning에서 '구분 값'(정답)으로 사용, data type: double
    - features: 입력 값으로 사용하는 vector (dense 또는 sparse)

In [42]:
from pyspark.mllib.regression import LabeledPoint
# A LabeledPoint pairs a label (1.0) with a feature vector, here given as
# a plain Python list.
print(LabeledPoint(1.0, [1.0, 2.0, 3.0]))

(1.0,[1.0,2.0,3.0])


In [44]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

# The features can also be a sparse vector, here built from an
# {index: value} dict; the integer label 1992 is printed as 1992.0.
print(
    LabeledPoint(
        1992,
        Vectors.sparse(10, {0: 3.0, 1: 5.5, 2: 10.0}),
    )
)

(1992.0,(10,[0,1,2],[3.0,5.5,10.0]))


In [45]:
# Re-display the mllib dense vector created in the dense-vector section.
dvmllib

DenseVector([1.0, 2.1, 3.0])

In [46]:
from pyspark.mllib.regression import LabeledPoint
# An existing mllib DenseVector can be used directly as the features.
LabeledPoint(1.0, dvmllib)

LabeledPoint(1.0, [1.0,2.1,3.0])

In [47]:
# Check which vector class dvlml is: despite its name it was created with
# pyspark.mllib's Vectors, so it is an mllib DenseVector.
type(dvlml)

pyspark.mllib.linalg.DenseVector

In [49]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# dvlml is already an mllib DenseVector (see the type check in the previous
# cell), so it can be passed to LabeledPoint as-is. A Vectors.fromML(dvlml)
# conversion would only be needed if it were a pyspark.ml vector.
LabeledPoint(1.0, dvlml)

LabeledPoint(1.0, [1.0,2.1,3.0])