Commit
Merge branch 'master' of https://github.com/tgaloppo/spark
  Adds predict() method
tgaloppo committed Dec 16, 2014
2 parents 2df336b + b99ecc4 commit c3b8ce0
Showing 3 changed files with 42 additions and 1 deletion.
@@ -47,5 +47,9 @@ object DenseGmmEM {
println("weight=%f mu=%s sigma=\n%s\n" format
(clusters.weight(i), clusters.mu(i), clusters.sigma(i)))
}
val (responsibility_matrix, cluster_labels) = clusters.predict(data)
for(x <- cluster_labels.collect()){
print(" " + x)
}
}
}
@@ -17,6 +17,7 @@

package org.apache.spark.mllib.clustering

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vector

@@ -38,4 +39,12 @@ class GaussianMixtureModel(

  /** Number of Gaussians in mixture */
  def k: Int = weight.length

  /** Maps given points to their cluster indices. */
  def predict(points: RDD[Vector]): (RDD[Array[Double]], RDD[Int]) = {
    val responsibility_matrix = new GaussianMixtureModelEM()
      .predictClusters(points, mu, sigma, weight, k)
    val cluster_labels = responsibility_matrix.map(r => r.indexOf(r.max))
    (responsibility_matrix, cluster_labels)
  }
}
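For illustration only, the following minimal sketch shows how the new predict() method above could be exercised against a hand-built model; it is not part of this commit. It assumes the GaussianMixtureModel constructor takes the weight, mu, and sigma arrays in that order (as suggested by the fields predict() reads), and the object name PredictUsageSketch is hypothetical.

// Minimal usage sketch (not part of this commit); assumes the constructor
// order (weight, mu, sigma) implied by the fields used in predict() above.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.clustering.GaussianMixtureModel

object PredictUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PredictUsageSketch").setMaster("local"))

    // Two 1-D components centered at -1 and +1, unit variance, equal weight.
    val model = new GaussianMixtureModel(
      Array(0.5, 0.5),
      Array(Vectors.dense(-1.0), Vectors.dense(1.0)),
      Array(Matrices.dense(1, 1, Array(1.0)), Matrices.dense(1, 1, Array(1.0))))

    val data = sc.parallelize(Seq(-1.2, -0.8, 0.9, 1.1).map(x => Vectors.dense(x)))
    val (responsibilities, labels) = model.predict(data)

    // Pair each point with the index of its most responsible component.
    data.zip(labels).collect().foreach { case (x, c) => println(s"$x -> cluster $c") }
    sc.stop()
  }
}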
@@ -21,7 +21,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix}
import breeze.linalg.Transpose

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
import org.apache.spark.{Accumulator, AccumulatorParam, SparkContext}
import org.apache.spark.SparkContext.DoubleAccumulatorParam
@@ -208,6 +208,34 @@ class GaussianMixtureModelEM private (
    cov
  }

  /**
   * Given the input vectors, return the membership value of each vector
   * to all mixture components.
   */
  def predictClusters(
      points: RDD[Vector],
      mu: Array[Vector],
      sigma: Array[Matrix],
      weight: Array[Double],
      k: Int): RDD[Array[Double]] = {
    val ctx = points.sparkContext
    val dists = ctx.broadcast((0 until k).map(i =>
      new MultivariateGaussian(mu(i).toBreeze.toDenseVector, sigma(i).toBreeze.toDenseMatrix)).toArray)
    val weights = ctx.broadcast((0 until k).map(i => weight(i)).toArray)
    points.map(x => compute_log_likelihood(x.toBreeze.toDenseVector, dists.value, weights.value, k))
  }

  /**
   * Compute the normalized membership values (responsibilities) of a point
   * with respect to each of the k mixture components.
   */
  def compute_log_likelihood(
      pt: DenseDoubleVector,
      dists: Array[MultivariateGaussian],
      weights: Array[Double],
      k: Int): Array[Double] = {
    val p = (0 until k).map(i => eps + weights(i) * dists(i).pdf(pt)).toArray
    val pSum = p.sum
    for (i <- 0 until k) {
      p(i) /= pSum
    }
    p
  }

  /** AccumulatorParam for Dense Breeze Vectors */
  private object DenseDoubleVectorAccumulatorParam extends AccumulatorParam[DenseDoubleVector] {
    def zero(initialVector: DenseDoubleVector): DenseDoubleVector = {
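As a reading aid, the sketch below reproduces in plain Scala (no Spark) the normalization performed by predictClusters and compute_log_likelihood above: each point's weighted component densities, plus a small eps floor, are divided by their sum, giving per-component membership probabilities that sum to one. The object name ResponsibilitySketch, the 1-D pdf, and the eps value are illustrative stand-ins, not part of the commit.

// Plain-Scala sketch (not part of this commit) of the normalization in
// compute_log_likelihood above: responsibility_i = (eps + w_i * pdf_i(x)) / sum_j (eps + w_j * pdf_j(x)).
object ResponsibilitySketch {
  // Small floor standing in for the eps field used above (its value is not shown in this diff).
  private val eps = 1e-10

  // Density of a 1-D Gaussian with mean mu and variance sigma2,
  // a stand-in for MultivariateGaussian.pdf.
  def pdf(x: Double, mu: Double, sigma2: Double): Double =
    math.exp(-(x - mu) * (x - mu) / (2.0 * sigma2)) / math.sqrt(2.0 * math.Pi * sigma2)

  def main(args: Array[String]): Unit = {
    val weights = Array(0.5, 0.5)
    val mus     = Array(-1.0, 1.0)
    val vars    = Array(1.0, 1.0)
    val x = 0.25

    val weighted = (0 until 2).map(i => eps + weights(i) * pdf(x, mus(i), vars(i)))
    val gamma = weighted.map(_ / weighted.sum)

    // gamma sums to 1; the index of the larger entry is the predicted cluster label.
    println(gamma.mkString(" "))
    println("label = " + gamma.indexOf(gamma.max))
  }
}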
