remove main from KMeans and add DenseKMeans as an example

pdeyhim · Apr 28, 2014 · fe23dcb · fe23dcb
1 parent 67f4448
commit fe23dcb
Show file tree

Hide file tree

Showing 2 changed files with 109 additions and 22 deletions.
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import org.apache.log4j.{Level, Logger}
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.linalg.Vectors
+
+/**
+ * An example k-means app. Run with
+ * {{{
+ * ./bin/spark-example org.apache.spark.examples.mllib.DenseKMeans [options] <input>
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object DenseKMeans extends App {
+
+  object InitializationMode extends Enumeration {
+    type InitializationMode = Value
+    val Random, Parallel = Value
+  }
+
+  import InitializationMode._
+
+  case class Params(
+      input: String = null,
+      k: Int = -1,
+      numIterations: Int = 10,
+      initializationMode: InitializationMode = Parallel)
+
+  val defaultParams = Params()
+
+  val parser = new OptionParser[Params]("DenseKMeans") {
+    head("DenseKMeans: an example k-means app for dense data.")
+    opt[Int]('k', "k")
+      .required()
+      .text(s"number of clusters, required")
+      .action((x, c) => c.copy(k = x))
+    opt[Int]("numIterations")
+      .text(s"number of iterations, default; ${defaultParams.numIterations}")
+      .action((x, c) => c.copy(numIterations = x))
+    opt[String]("initMode")
+      .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
+      s"default: ${defaultParams.initializationMode}")
+      .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
+    arg[String]("<input>")
+      .text("input paths to examples")
+      .required()
+      .action((x, c) => c.copy(input = x))
+  }
+
+  parser.parse(args, defaultParams).map { params =>
+    run(params)
+  }.getOrElse {
+    sys.exit(1)
+  }
+
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"DenseKMeans with $params")
+    val sc = new SparkContext(conf)
+
+    Logger.getRootLogger.setLevel(Level.WARN)
+
+    val examples = sc.textFile(params.input).map { line =>
+      Vectors.dense(line.split(' ').map(_.toDouble))
+    }.cache()
+
+    val numExamples = examples.count()
+
+    println(s"numExamples = $numExamples.")
+
+    val initMode = params.initializationMode match {
+      case Random =>
+        KMeans.RANDOM
+      case Parallel =>
+        KMeans.K_MEANS_PARALLEL
+    }
+
+    val model = new KMeans()
+      .setInitializationMode(initMode)
+      .setK(params.k)
+      .setMaxIterations(params.numIterations)
+      .run(examples)
+
+    val cost = model.computeCost(examples)
+
+    println(s"Total cost = $cost.")
+
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -396,28 +396,6 @@ object KMeans {
       v2: BreezeVectorWithNorm): Double = {
     MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
   }
-
-  @Experimental
-  def main(args: Array[String]) {
-    if (args.length < 4) {
-      println("Usage: KMeans <master> <input_file> <k> <max_iterations> [<runs>]")
-      System.exit(1)
-    }
-    val (master, inputFile, k, iters) = (args(0), args(1), args(2).toInt, args(3).toInt)
-    val runs = if (args.length >= 5) args(4).toInt else 1
-    val sc = new SparkContext(master, "KMeans")
-    val data = sc.textFile(inputFile)
-      .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
-      .cache()
-    val model = KMeans.train(data, k, iters, runs)
-    val cost = model.computeCost(data)
-    println("Cluster centers:")
-    for (c <- model.clusterCenters) {
-      println("  " + c)
-    }
-    println("Cost: " + cost)
-    System.exit(0)
-  }
 }
 
 /**