MLUtils: changed privacy of EPSILON from [util] to [mllib]
GaussianMixtureEM: Renamed from GaussianMixtureModelEM; corrected formatting issues

GaussianMixtureModel: Renamed predictLabels() to predict()

Others: Modifications based on rename of GaussianMixtureEM
tgaloppo committed Dec 22, 2014
1 parent 709e4bf commit aaa8f25
Showing 5 changed files with 16 additions and 19 deletions.
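For orientation, the sketch below strings the renamed pieces together the way the DenseGmmEM example in this commit does. The class and method names (GaussianMixtureEM, predict) come from the diff; the application name, input path, and parameter values are illustrative assumptions, not part of the commit.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixtureEM
import org.apache.spark.mllib.linalg.Vectors

object GmmRenameSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GmmRenameSketch"))

    // Whitespace-delimited feature vectors; the path is a placeholder.
    val data = sc.textFile("data/mllib/gmm_data.txt").map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    // GaussianMixtureModelEM is now GaussianMixtureEM.
    val gmm = new GaussianMixtureEM()
      .setK(2)
      .setConvergenceTol(0.01)
      .setMaxIterations(100)
      .run(data)

    // predictLabels() on the fitted model is now predict().
    gmm.predict(data).take(10).foreach(println)

    sc.stop()
  }
}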
@@ -18,7 +18,7 @@
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixtureModelEM
import org.apache.spark.mllib.clustering.GaussianMixtureEM
import org.apache.spark.mllib.linalg.Vectors

/**
@@ -46,7 +46,7 @@ object DenseGmmEM {
Vectors.dense(line.trim.split(' ').map(_.toDouble))
}.cache()

val clusters = new GaussianMixtureModelEM()
val clusters = new GaussianMixtureEM()
.setK(k)
.setConvergenceTol(convergenceTol)
.setMaxIterations(maxIterations)
@@ -58,7 +58,7 @@ object DenseGmmEM {
}

println("Cluster labels (first <= 100):")
val clusterLabels = clusters.predictLabels(data)
val clusterLabels = clusters.predict(data)
clusterLabels.take(100).foreach { x =>
print(" " + x)
}
@@ -23,6 +23,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix,
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils

/**
* This class performs expectation maximization for multivariate Gaussian
@@ -41,16 +42,14 @@ import org.apache.spark.mllib.stat.impl.MultivariateGaussian
* is considered to have occurred.
* @param maxIterations The maximum number of iterations to perform
*/
class GaussianMixtureModelEM private (
class GaussianMixtureEM private (
private var k: Int,
private var convergenceTol: Double,
private var maxIterations: Int) extends Serializable {

/** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
def this() = this(2, 0.01, 100)



// number of samples per cluster to use when initializing Gaussians
private val nSamples = 5

@@ -190,8 +189,6 @@ class GaussianMixtureModelEM private (

// companion class to provide zero constructor for ExpectationSum
private object ExpectationSum {
private val eps = math.pow(2.0, -52)

def zero(k: Int, d: Int): ExpectationSum = {
new ExpectationSum(0.0, Array.fill(k)(0.0),
Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d)))
@@ -203,7 +200,9 @@
weights: Array[Double],
dists: Array[MultivariateGaussian])
(sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = {
val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(x) }
val p = weights.zip(dists).map {
case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x)
}
val pSum = p.sum
sums.logLikelihood += math.log(pSum)
val xxt = x * new Transpose(x)
@@ -22,6 +22,7 @@ import breeze.linalg.{DenseVector => BreezeVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrix, Vector}
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils

/**
* Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
@@ -43,7 +44,7 @@ class GaussianMixtureModel(
def k: Int = weight.length

/** Maps given points to their cluster indices. */
def predictLabels(points: RDD[Vector]): RDD[Int] = {
def predict(points: RDD[Vector]): RDD[Int] = {
val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k)
responsibilityMatrix.map(r => r.indexOf(r.max))
}
@@ -70,11 +71,6 @@
}
}

// We use "eps" as the minimum likelihood density for any given point
// in every cluster; this prevents any divide by zero conditions for
// outlier points.
private val eps = math.pow(2.0, -52)

/**
* Compute the partial assignments for each vector
*/
@@ -83,7 +79,9 @@
dists: Array[MultivariateGaussian],
weights: Array[Double],
k: Int): Array[Double] = {
val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(pt) }
val p = weights.zip(dists).map {
case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
}
val pSum = p.sum
for (i <- 0 until k) {
p(i) /= pSum
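To make the role of the EPSILON guard concrete, here is a small self-contained sketch of the soft-assignment step used by computePartialAssignments above. The densities array stands in for the MultivariateGaussian.pdf values, and the constant is computed the same way as MLUtils.EPSILON; everything else is an illustrative assumption.

object SoftAssignmentSketch {
  // Machine epsilon for Double, computed as in MLUtils.EPSILON.
  val EPSILON: Double = {
    var eps = 1.0
    while ((1.0 + (eps / 2.0)) != 1.0) eps /= 2.0
    eps
  }

  // Normalized cluster responsibilities for one point. Adding EPSILON keeps
  // the normalizer strictly positive even when every weighted density
  // underflows to zero for an outlier, so the division below cannot blow up.
  def responsibilities(weights: Array[Double], densities: Array[Double]): Array[Double] = {
    val p = weights.zip(densities).map { case (w, d) => EPSILON + w * d }
    val pSum = p.sum
    p.map(_ / pSum)
  }

  def main(args: Array[String]): Unit = {
    // Two clusters; both densities have underflowed for a far-away point.
    println(responsibilities(Array(0.6, 0.4), Array(0.0, 0.0)).mkString(", "))  // 0.5, 0.5
  }
}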
@@ -38,7 +38,7 @@ import org.apache.spark.streaming.dstream.DStream
*/
object MLUtils {

private[util] lazy val EPSILON = {
private[mllib] lazy val EPSILON = {
var eps = 1.0
while ((1.0 + (eps / 2.0)) != 1.0) {
eps /= 2.0
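As a sanity check on the now mllib-visible EPSILON, the loop above converges to the machine epsilon of Double, which is exactly the math.pow(2.0, -52) value hard-coded by the two eps fields this commit removes. A standalone check, assuming nothing beyond the standard library:

object EpsilonCheck {
  def main(args: Array[String]): Unit = {
    // Same computation as MLUtils.EPSILON.
    var eps = 1.0
    while ((1.0 + (eps / 2.0)) != 1.0) {
      eps /= 2.0
    }
    // Both print true: the loop lands on 2^-52 ≈ 2.220446049250313E-16.
    println(eps == math.pow(2.0, -52))
    println(eps == java.lang.Math.ulp(1.0))
  }
}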
@@ -36,7 +36,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext
val Emu = Vectors.dense(5.0, 10.0)
val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0))

val gmm = new GaussianMixtureModelEM().setK(1).run(data)
val gmm = new GaussianMixtureEM().setK(1).run(data)

assert(gmm.weight(0) ~== Ew absTol 1E-5)
assert(gmm.mu(0) ~== Emu absTol 1E-5)
@@ -63,7 +63,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext
val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))

val gmm = new GaussianMixtureModelEM()
val gmm = new GaussianMixtureEM()
.setK(2)
.setInitialModel(initialGmm)
.run(data)