MLUtils: changed privacy of EPSILON from [util] to [mllib]
GaussianMixtureEM: Renamed from GaussianMixtureModelEM; corrected formatting issues

GaussianMixtureModel: Renamed predictLabels() to predict()

Others: Modifications based on rename of GaussianMixtureEM
tgaloppo committed Dec 22, 2014
1 parent 709e4bf commit aaa8f25
Showing 5 changed files with 16 additions and 19 deletions.
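For orientation, the sketch below strings the renamed pieces together the way the DenseGmmEM example in this commit does. The class and method names (GaussianMixtureEM, predict) come from the diff; the application name, input path, and parameter values are illustrative assumptions, not part of the commit.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixtureEM
import org.apache.spark.mllib.linalg.Vectors

object GmmRenameSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GmmRenameSketch"))

    // Whitespace-delimited feature vectors; the path is a placeholder.
    val data = sc.textFile("data/mllib/gmm_data.txt").map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    // GaussianMixtureModelEM is now GaussianMixtureEM.
    val gmm = new GaussianMixtureEM()
      .setK(2)
      .setConvergenceTol(0.01)
      .setMaxIterations(100)
      .run(data)

    // predictLabels() on the fitted model is now predict().
    gmm.predict(data).take(10).foreach(println)

    sc.stop()
  }
}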
@@ -18,7 +18,7 @@
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixtureModelEM
import org.apache.spark.mllib.clustering.GaussianMixtureEM
import org.apache.spark.mllib.linalg.Vectors

/**
@@ -46,7 +46,7 @@ object DenseGmmEM {
Vectors.dense(line.trim.split(' ').map(_.toDouble))
}.cache()

val clusters = new GaussianMixtureModelEM()
val clusters = new GaussianMixtureEM()
.setK(k)
.setConvergenceTol(convergenceTol)
.setMaxIterations(maxIterations)
@@ -58,7 +58,7 @@ object DenseGmmEM {
}

println("Cluster labels (first <= 100):")
val clusterLabels = clusters.predictLabels(data)
val clusterLabels = clusters.predict(data)
clusterLabels.take(100).foreach { x =>
print(" " + x)
}
@@ -23,6 +23,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix,
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils

/**
* This class performs expectation maximization for multivariate Gaussian
@@ -41,16 +42,14 @@ import org.apache.spark.mllib.stat.impl.MultivariateGaussian
* is considered to have occurred.
* @param maxIterations The maximum number of iterations to perform
*/
class GaussianMixtureModelEM private (
class GaussianMixtureEM private (
private var k: Int,
private var convergenceTol: Double,
private var maxIterations: Int) extends Serializable {

/** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
def this() = this(2, 0.01, 100)



// number of samples per cluster to use when initializing Gaussians
private val nSamples = 5

@@ -190,8 +189,6 @@ class GaussianMixtureModelEM private (

// companion class to provide zero constructor for ExpectationSum
private object ExpectationSum {
private val eps = math.pow(2.0, -52)

def zero(k: Int, d: Int): ExpectationSum = {
new ExpectationSum(0.0, Array.fill(k)(0.0),
Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d)))
@@ -203,7 +200,9 @@
weights: Array[Double],
dists: Array[MultivariateGaussian])
(sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = {
val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(x) }
val p = weights.zip(dists).map {
case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x)
}
val pSum = p.sum
sums.logLikelihood += math.log(pSum)
val xxt = x * new Transpose(x)
@@ -22,6 +22,7 @@ import breeze.linalg.{DenseVector => BreezeVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrix, Vector}
import org.apache.spark.mllib.stat.impl.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils

/**
* Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
@@ -43,7 +44,7 @@ class GaussianMixtureModel(
def k: Int = weight.length

/** Maps given points to their cluster indices. */
def predictLabels(points: RDD[Vector]): RDD[Int] = {
def predict(points: RDD[Vector]): RDD[Int] = {
val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k)
responsibilityMatrix.map(r => r.indexOf(r.max))
}
@@ -70,11 +71,6 @@
}
}

// We use "eps" as the minimum likelihood density for any given point
// in every cluster; this prevents any divide by zero conditions for
// outlier points.
private val eps = math.pow(2.0, -52)

/**
* Compute the partial assignments for each vector
*/
@@ -83,7 +79,9 @@
dists: Array[MultivariateGaussian],
weights: Array[Double],
k: Int): Array[Double] = {
val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(pt) }
val p = weights.zip(dists).map {
case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
}
val pSum = p.sum
for (i <- 0 until k) {
p(i) /= pSum
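To make the role of the EPSILON guard concrete, here is a small self-contained sketch of the soft-assignment step used by computePartialAssignments above. The densities array stands in for the MultivariateGaussian.pdf values, and the constant is computed the same way as MLUtils.EPSILON; everything else is an illustrative assumption.

object SoftAssignmentSketch {
  // Machine epsilon for Double, computed as in MLUtils.EPSILON.
  val EPSILON: Double = {
    var eps = 1.0
    while ((1.0 + (eps / 2.0)) != 1.0) eps /= 2.0
    eps
  }

  // Normalized cluster responsibilities for one point. Adding EPSILON keeps
  // the normalizer strictly positive even when every weighted density
  // underflows to zero for an outlier, so the division below cannot blow up.
  def responsibilities(weights: Array[Double], densities: Array[Double]): Array[Double] = {
    val p = weights.zip(densities).map { case (w, d) => EPSILON + w * d }
    val pSum = p.sum
    p.map(_ / pSum)
  }

  def main(args: Array[String]): Unit = {
    // Two clusters; both densities have underflowed for a far-away point.
    println(responsibilities(Array(0.6, 0.4), Array(0.0, 0.0)).mkString(", "))  // 0.5, 0.5
  }
}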
@@ -38,7 +38,7 @@ import org.apache.spark.streaming.dstream.DStream
*/
object MLUtils {

private[util] lazy val EPSILON = {
private[mllib] lazy val EPSILON = {
var eps = 1.0
while ((1.0 + (eps / 2.0)) != 1.0) {
eps /= 2.0
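As a sanity check on the now mllib-visible EPSILON, the loop above converges to the machine epsilon of Double, which is exactly the math.pow(2.0, -52) value hard-coded by the two eps fields this commit removes. A standalone check, assuming nothing beyond the standard library:

object EpsilonCheck {
  def main(args: Array[String]): Unit = {
    // Same computation as MLUtils.EPSILON.
    var eps = 1.0
    while ((1.0 + (eps / 2.0)) != 1.0) {
      eps /= 2.0
    }
    // Both print true: the loop lands on 2^-52 ≈ 2.220446049250313E-16.
    println(eps == math.pow(2.0, -52))
    println(eps == java.lang.Math.ulp(1.0))
  }
}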
@@ -36,7 +36,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext
val Emu = Vectors.dense(5.0, 10.0)
val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0))

val gmm = new GaussianMixtureModelEM().setK(1).run(data)
val gmm = new GaussianMixtureEM().setK(1).run(data)

assert(gmm.weight(0) ~== Ew absTol 1E-5)
assert(gmm.mu(0) ~== Emu absTol 1E-5)
@@ -63,7 +63,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext
val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))

val gmm = new GaussianMixtureModelEM()
val gmm = new GaussianMixtureEM()
.setK(2)
.setInitialModel(initialGmm)
.run(data)