From e4bd0c02df49b07ed0ee3687c3ac8e44868c857a Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 17 Dec 2014 01:51:48 -0800 Subject: [PATCH] [SPARK-4409] Modified horzcat and vertcat --- .../apache/spark/mllib/linalg/Matrices.scala | 172 +++++++++++++----- .../spark/mllib/linalg/MatricesSuite.scala | 34 ++-- .../spark/mllib/util/TestingUtils.scala | 6 +- 3 files changed, 145 insertions(+), 67 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index b02e756b05db6..e18046e37225d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -603,7 +603,7 @@ object Matrices { /** * Horizontally concatenate a sequence of matrices. The returned matrix will be in the format * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in - * a dense matrix. + * a sparse matrix. * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were horizontally concatenated */ @@ -621,17 +621,42 @@ object Matrices { mat match { case sparse: SparseMatrix => isSparse = true case dense: DenseMatrix => isDense = true + case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " + + s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}") } numCols += mat.numCols } require(rowsMatch, "The number of rows of the matrices in this sequence, don't match!") - if (isSparse && !isDense) { + if (!isSparse && isDense) { + new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray).toArray) + } else { val allColPtrs: Array[(Int, Int)] = Array((0, 0)) ++ matrices.zipWithIndex.flatMap { case (mat, ind) => - val ptr = mat.asInstanceOf[SparseMatrix].colPtrs - ptr.slice(1, ptr.length).map(p => (ind, p)) - } + mat match { + case spMat: SparseMatrix => + val ptr = spMat.colPtrs + ptr.slice(1, ptr.length).map(p => (ind, p)) + case dnMat: DenseMatrix => + val colSize = dnMat.numCols + var j = 0 + val rowSize = dnMat.numRows + val ptr = new ArrayBuffer[(Int, Int)](colSize) + var nnz = 0 + val vals = dnMat.values + while (j < colSize) { + var i = j * rowSize + val indEnd = (j + 1) * rowSize + while (i < indEnd) { + if (vals(i) != 0.0) nnz += 1 + i += 1 + } + j += 1 + ptr.append((ind, nnz)) + } + ptr + } + } var counter = 0 var lastIndex = 0 var lastPtr = 0 @@ -643,21 +668,36 @@ object Matrices { lastPtr = p counter + p } + val valsAndIndices: Array[(Int, Double)] = matrices.flatMap { + case spMat: SparseMatrix => + spMat.rowIndices.zip(spMat.values) + case dnMat: DenseMatrix => + val colSize = dnMat.numCols + var j = 0 + val rowSize = dnMat.numRows + val data = new ArrayBuffer[(Int, Double)]() + val vals = dnMat.values + while (j < colSize) { + val indStart = j * rowSize + var i = 0 + while (i < rowSize) { + val index = indStart + i + if (vals(index) != 0.0) data.append((i, vals(index))) + i += 1 + } + j += 1 + } + data + } new SparseMatrix(numRows, numCols, adjustedPtrs, - matrices.flatMap(_.asInstanceOf[SparseMatrix].rowIndices).toArray, - matrices.flatMap(_.asInstanceOf[SparseMatrix].values).toArray) - } else if (!isSparse && !isDense) { - throw new IllegalArgumentException("The supplied matrices are neither in SparseMatrix or" + - " DenseMatrix format!") - }else { - new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray).toArray) + valsAndIndices.map(_._1), valsAndIndices.map(_._2)) } } /** * Vertically concatenate a sequence of matrices. The returned matrix will be in the format * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in - * a dense matrix. + * a sparse matrix. * @param matrices array of matrices * @return a single `Matrix` composed of the matrices that were vertically concatenated */ @@ -680,27 +720,58 @@ object Matrices { case dense: DenseMatrix => isDense = true valsLength += dense.values.length + case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " + + s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}") } numRows += mat.numRows } require(colsMatch, "The number of rows of the matrices in this sequence, don't match!") - if (isSparse && !isDense) { - val matMap = matrices.zipWithIndex.map(d => (d._2, d._1.asInstanceOf[SparseMatrix])).toMap - // (matrixInd, colInd, colStart, colEnd, numRows) - val allColPtrs: Seq[(Int, Int, Int, Int, Int)] = - matMap.flatMap { case (ind, mat) => - val ptr = mat.colPtrs - var colStart = 0 - var j = 0 - ptr.slice(1, ptr.length).map { p => - j += 1 - val oldColStart = colStart - colStart = p - (j - 1, ind, oldColStart, p, mat.numRows) - } - }.toSeq + if (!isSparse && isDense) { + val matData = matrices.zipWithIndex.flatMap { case (mat, ind) => + val values = mat.toArray + for (j <- 0 until numCols) yield (j, ind, + values.slice(j * mat.numRows, (j + 1) * mat.numRows)) + }.sortBy(x => (x._1, x._2)) + new DenseMatrix(numRows, numCols, matData.flatMap(_._3).toArray) + } else { + val matMap = matrices.zipWithIndex.map(d => (d._2, d._1)).toMap + // (colInd, matrixInd, colStart, colEnd, numRows) + val allColPtrs: Seq[(Int, Int, Int, Int, Int)] = matMap.flatMap { case (ind, mat) => + mat match { + case spMat: SparseMatrix => + val ptr = spMat.colPtrs + var colStart = 0 + var j = 0 + ptr.slice(1, ptr.length).map { p => + j += 1 + val oldColStart = colStart + colStart = p + (j - 1, ind, oldColStart, p, spMat.numRows) + } + case dnMat: DenseMatrix => + val colSize = dnMat.numCols + var j = 0 + val rowSize = dnMat.numRows + val ptr = new ArrayBuffer[(Int, Int, Int, Int, Int)](colSize) + var nnz = 0 + val vals = dnMat.values + var colStart = 0 + while (j < colSize) { + var i = j * rowSize + val indEnd = (j + 1) * rowSize + while (i < indEnd) { + if (vals(i) != 0.0) nnz += 1 + i += 1 + } + ptr.append((j, ind, colStart, nnz, dnMat.numRows)) + j += 1 + colStart = nnz + } + ptr + } + }.toSeq val values = new ArrayBuffer[Double](valsLength) val rowInd = new ArrayBuffer[Int](valsLength) val newColPtrs = new Array[Int](numCols) @@ -712,31 +783,38 @@ object Matrices { var startRow = 0 sortedPtrs.foreach { case (colIdx, matrixInd, colStart, colEnd, nRows) => val selectedMatrix = matMap(matrixInd) - val selectedValues = selectedMatrix.values.slice(colStart, colEnd) - val selectedRowIdx = selectedMatrix.rowIndices.slice(colStart, colEnd) - val len = selectedValues.length - newColPtrs(colIdx) += len - var i = 0 - while (i < len) { - values.append(selectedValues(i)) - rowInd.append(selectedRowIdx(i) + startRow) - i += 1 + selectedMatrix match { + case spMat: SparseMatrix => + val selectedValues = spMat.values + val selectedRowIdx = spMat.rowIndices + val len = colEnd - colStart + newColPtrs(colIdx) += len + var i = colStart + while (i < colEnd) { + values.append(selectedValues(i)) + rowInd.append(selectedRowIdx(i) + startRow) + i += 1 + } + case dnMat: DenseMatrix => + val selectedValues = dnMat.values + val len = colEnd - colStart + newColPtrs(colIdx) += len + val indStart = colIdx * nRows + var i = 0 + while (i < nRows) { + val v = selectedValues(indStart + i) + if (v != 0) { + values.append(v) + rowInd.append(i + startRow) + } + i += 1 + } } startRow += nRows } } val adjustedPtrs = newColPtrs.scanLeft(0)(_ + _) new SparseMatrix(numRows, numCols, adjustedPtrs, rowInd.toArray, values.toArray) - } else if (!isSparse && !isDense) { - throw new IllegalArgumentException("The supplied matrices are neither in SparseMatrix or" + - " DenseMatrix format!") - }else { - val matData = matrices.zipWithIndex.flatMap { case (mat, ind) => - val values = mat.toArray - for (j <- 0 until numCols) yield (j, ind, - values.slice(j * mat.numRows, (j + 1) * mat.numRows)) - }.sortBy(x => (x._1, x._2)) - new DenseMatrix(numRows, numCols, matData.flatMap(_._3).toArray) } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index 6526bd614fe40..c9a2b1f0454b5 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -152,21 +152,21 @@ class MatricesSuite extends FunSuite { val spMat3 = Matrices.speye(2) val spHorz = Matrices.horzcat(Array(spMat1, spMat2)) + val spHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) + val spHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) val deHorz1 = Matrices.horzcat(Array(deMat1, deMat2)) - val deHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) - val deHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) assert(deHorz1.numRows === 3) - assert(deHorz2.numRows === 3) - assert(deHorz3.numRows === 3) + assert(spHorz2.numRows === 3) + assert(spHorz3.numRows === 3) assert(spHorz.numRows === 3) assert(deHorz1.numCols === 5) - assert(deHorz2.numCols === 5) - assert(deHorz3.numCols === 5) + assert(spHorz2.numCols === 5) + assert(spHorz3.numCols === 5) assert(spHorz.numCols === 5) - assert(deHorz1 === deHorz2) - assert(deHorz2 === deHorz3) + assert(deHorz1.toBreeze.toDenseMatrix === spHorz2.toBreeze.toDenseMatrix) + assert(spHorz2.toBreeze === spHorz3.toBreeze) assert(spHorz(0, 0) === 1.0) assert(spHorz(2, 1) === 5.0) assert(spHorz(0, 2) === 1.0) @@ -177,7 +177,7 @@ class MatricesSuite extends FunSuite { assert(deHorz1(0, 0) === 1.0) assert(deHorz1(2, 1) === 5.0) assert(deHorz1(0, 2) === 1.0) - assert(deHorz1(1, 2) === 0.0) + assert(deHorz1(1, 2) == 0.0) assert(deHorz1(1, 3) === 1.0) assert(deHorz1(2, 4) === 1.0) assert(deHorz1(1, 4) === 0.0) @@ -192,20 +192,20 @@ class MatricesSuite extends FunSuite { val spVert = Matrices.vertcat(Array(spMat1, spMat3)) val deVert1 = Matrices.vertcat(Array(deMat1, deMat3)) - val deVert2 = Matrices.vertcat(Array(spMat1, deMat3)) - val deVert3 = Matrices.vertcat(Array(deMat1, spMat3)) + val spVert2 = Matrices.vertcat(Array(spMat1, deMat3)) + val spVert3 = Matrices.vertcat(Array(deMat1, spMat3)) assert(deVert1.numRows === 5) - assert(deVert2.numRows === 5) - assert(deVert3.numRows === 5) + assert(spVert2.numRows === 5) + assert(spVert3.numRows === 5) assert(spVert.numRows === 5) assert(deVert1.numCols === 2) - assert(deVert2.numCols === 2) - assert(deVert3.numCols === 2) + assert(spVert2.numCols === 2) + assert(spVert3.numCols === 2) assert(spVert.numCols === 2) - assert(deVert1 === deVert2) - assert(deVert2 === deVert3) + assert(deVert1.toBreeze.toDenseMatrix === spVert2.toBreeze.toDenseMatrix) + assert(spVert2.toBreeze === spVert3.toBreeze) assert(spVert(0, 0) === 1.0) assert(spVert(2, 1) === 5.0) assert(spVert(3, 0) === 1.0) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala index 30b906aaa3ba4..e957fa5d25f4c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala @@ -178,17 +178,17 @@ object TestingUtils { implicit class MatrixWithAlmostEquals(val x: Matrix) { /** - * When the difference of two vectors are within eps, returns true; otherwise, returns false. + * When the difference of two matrices are within eps, returns true; otherwise, returns false. */ def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps) /** - * When the difference of two vectors are within eps, returns false; otherwise, returns true. + * When the difference of two matrices are within eps, returns false; otherwise, returns true. */ def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps) /** - * Throws exception when the difference of two vectors are NOT within eps; + * Throws exception when the difference of two matrices are NOT within eps; * otherwise, returns true. */ def ~==(r: CompareMatrixRightSide): Boolean = {