Skip to content

Commit

Permalink
Added additional train() method to companion object for cluster count…
Browse files Browse the repository at this point in the history
… and tolerance parameters.

Modified cluster initialization strategy to use an initial covariance matrix derived from the sample points used to initialize the mean.
  • Loading branch information
tgaloppo committed Dec 3, 2014
1 parent 676e523 commit 8aaa17d
Showing 1 changed file with 31 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ object GMMExpectationMaximization {
/**
* Trains a GMM using the given parameters
*
* @param data training points stores as RDD[Vector]
* @param data training points stored as RDD[Vector]
* @param k the number of Gaussians in the mixture
* @param maxIterations the maximum number of iterations to perform
* @param delta change in log-likelihood at which convergence is considered achieved
Expand All @@ -47,7 +47,7 @@ object GMMExpectationMaximization {
/**
* Trains a GMM using the given parameters
*
* @param data training points stores as RDD[Vector]
* @param data training points stored as RDD[Vector]
* @param k the number of Gaussians in the mixture
* @param maxIterations the maximum number of iterations to perform
*/
Expand All @@ -58,7 +58,18 @@ object GMMExpectationMaximization {
/**
* Trains a GMM using the given parameters
*
* @param data training points stores as RDD[Vector]
* @param data training points stored as RDD[Vector]
* @param k the number of Gaussians in the mixture
* @param delta change in log-likelihood at which convergence is considered achieved
*/
def train(data: RDD[Vector], k: Int, delta: Double): GaussianMixtureModel = {
  // Build a fresh estimator carrying only the requested mixture size and
  // convergence threshold; settings not passed here are left as the
  // constructor initialised them.
  val estimator = new GMMExpectationMaximization().setK(k).setDelta(delta)
  estimator.run(data)
}

/**
* Trains a GMM using the given parameters
*
* @param data training points stored as RDD[Vector]
* @param k the number of Gaussians in the mixture
*/
def train(data: RDD[Vector], k: Int): GaussianMixtureModel = {
Expand Down Expand Up @@ -127,10 +138,12 @@ class GMMExpectationMaximization private (

// C will be array of (weight, mean, covariance) tuples
// we start with uniform weights, a random mean from the data, and
// identity matrices for covariance
// diagonal covariance matrices using component variances
// derived from the samples
var C = (0 until k).map(i => (1.0/k,
vec_mean(samples.slice(i * nSamples, (i + 1) * nSamples)),
BreezeMatrix.eye[Double](d))).toArray
init_cov(samples.slice(i * nSamples, (i + 1) * nSamples)))
).toArray

val acc_w = new Array[Accumulator[Double]](k)
val acc_mu = new Array[Accumulator[DenseDoubleVector]](k)
Expand Down Expand Up @@ -216,6 +229,19 @@ class GMMExpectationMaximization private (
v / x.length.asInstanceOf[Double]
}

/**
 * Build an initial covariance matrix from a set of sample vectors.
 *
 * The result starts as an identity matrix whose diagonal is then
 * overwritten: entry (j, j) is the sum of squared deviations of
 * component j from the sample mean, divided by the number of samples
 * (i.e. the biased variance, dividing by n rather than n - 1).
 * Off-diagonal entries are left as produced by `BreezeMatrix.eye`.
 *
 * The dimension is taken from the first vector, so `x` is expected to be
 * non-empty.
 *
 * @param x sample vectors used to seed one mixture component
 * @return square matrix with the per-component variances on its diagonal
 */
private def init_cov(x : Array[DenseDoubleVector]) : DenseDoubleMatrix = {
  val center = vec_mean(x)
  // Running element-wise sum of squared deviations from the mean.
  val sumSq = BreezeVector.zeros[Double](x(0).length)
  x.foreach { sample => sumSq += ((sample - center) :^ 2.0) }
  val cov = BreezeMatrix.eye[Double](sumSq.length)
  // Replace each diagonal entry with the biased variance of that component.
  for (j <- 0 until sumSq.length) {
    cov(j, j) = sumSq(j) / x.length
  }
  cov
}

/** AccumulatorParam for Dense Breeze Vectors */
private object DenseDoubleVectorAccumulatorParam extends AccumulatorParam[DenseDoubleVector] {
def zero(initialVector : DenseDoubleVector) : DenseDoubleVector = {
Expand Down

0 comments on commit 8aaa17d

Please sign in to comment.