
Commit

Merge branch 'master' of github.com:apache/spark into improve_create
Davies Liu committed Jun 3, 2015
2 parents 6ea5925 + f1646e1 commit 8c3f10d
Showing 87 changed files with 1,859 additions and 1,020 deletions.
1 change: 1 addition & 0 deletions .rat-excludes
@@ -82,3 +82,4 @@ local-1426633911242/*
local-1430917381534/*
DESCRIPTION
NAMESPACE
test_support/*
4 changes: 2 additions & 2 deletions core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -30,8 +30,8 @@ private[spark] abstract class SparkFunSuite extends FunSuite with Logging {
* Log the suite name and the test name before and after each test.
*
* Subclasses should never override this method. If they wish to run
* custom code before and after each test, they should should mix in
* the {{org.scalatest.BeforeAndAfter}} trait instead.
* custom code before and after each test, they should mix in the
* {{org.scalatest.BeforeAndAfter}} trait instead.
*/
final protected override def withFixture(test: NoArgTest): Outcome = {
val testName = test.text
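For context, the comment fixed above points subclasses at `org.scalatest.BeforeAndAfter` instead of overriding `withFixture`. A minimal sketch of that pattern, assuming a hypothetical suite placed in the `org.apache.spark` package (the suite name and test body are illustrative):

```scala
package org.apache.spark

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfter

// Hypothetical suite: per-test setup/teardown lives in before/after blocks,
// leaving the final withFixture in SparkFunSuite untouched.
class ExampleSuite extends SparkFunSuite with BeforeAndAfter {

  private var buffer: ArrayBuffer[Int] = _

  before {
    buffer = ArrayBuffer(1, 2, 3) // runs before each test
  }

  after {
    buffer.clear() // runs after each test
  }

  test("buffer is reset for every test") {
    assert(buffer.sum === 6)
  }
}
```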
8 changes: 4 additions & 4 deletions docs/ml-guide.md
@@ -207,7 +207,7 @@ val model1 = lr.fit(training.toDF)
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
println("Model 1 was fit using parameters: " + model1.fittingParamMap)
println("Model 1 was fit using parameters: " + model1.parent.extractParamMap)

// We may alternatively specify parameters using a ParamMap,
// which supports several methods for specifying parameters.
@@ -222,7 +222,7 @@ val paramMapCombined = paramMap ++ paramMap2
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
val model2 = lr.fit(training.toDF, paramMapCombined)
println("Model 2 was fit using parameters: " + model2.fittingParamMap)
println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)

// Prepare test data.
val test = sc.parallelize(Seq(
@@ -289,7 +289,7 @@ LogisticRegressionModel model1 = lr.fit(training);
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
System.out.println("Model 1 was fit using parameters: " + model1.fittingParamMap());
System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

// We may alternatively specify parameters using a ParamMap.
ParamMap paramMap = new ParamMap();
@@ -305,7 +305,7 @@ ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
System.out.println("Model 2 was fit using parameters: " + model2.fittingParamMap());
System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

// Prepare test documents.
List<LabeledPoint> localTest = Lists.newArrayList(
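The hunks above replace the removed `fittingParamMap` accessor with `parent.extractParamMap`. A minimal Scala sketch of the updated pattern, assuming `training` is a DataFrame of labeled data as in the surrounding guide:

```scala
import org.apache.spark.ml.classification.LogisticRegression

// Assumes `training` is a DataFrame of labeled data, as in the guide's example.
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.01)

val model = lr.fit(training)

// A fitted model keeps a reference to the estimator that produced it, so the
// parameters used during fit() are read back through the parent estimator.
println("Model was fit using parameters: " + model.parent.extractParamMap)
```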
6 changes: 3 additions & 3 deletions docs/mllib-clustering.md
@@ -249,11 +249,11 @@ public class GaussianMixtureExample {
GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

// Save and load GaussianMixtureModel
gmm.save(sc, "myGMMModel")
GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc, "myGMMModel")
gmm.save(sc.sc(), "myGMMModel");
GaussianMixtureModel sameModel = GaussianMixtureModel.load(sc.sc(), "myGMMModel");
// Output the parameters of the mixture model
for(int j=0; j<gmm.k(); j++) {
System.out.println("weight=%f\nmu=%s\nsigma=\n%s\n",
System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
}
}
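In the Java snippet above, `sc` is a `JavaSparkContext`, so `save`/`load` need the underlying Scala context via `sc.sc()`. For comparison, a sketch of the same save/load round trip in Scala, assuming `sc` is a `SparkContext` and `parsedData` is an `RDD[Vector]` as in the guide:

```scala
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}

// Assumes `sc: SparkContext` and `parsedData: RDD[Vector]` already exist.
val gmm = new GaussianMixture().setK(2).run(parsedData)

// In Scala no unwrapping is needed: `sc` is already a SparkContext.
gmm.save(sc, "myGMMModel")
val sameModel = GaussianMixtureModel.load(sc, "myGMMModel")

// Output the parameters of each mixture component.
for (j <- 0 until gmm.k) {
  println(s"weight=${gmm.weights(j)}\nmu=${gmm.gaussians(j).mu}\nsigma=\n${gmm.gaussians(j).sigma}\n")
}
```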
24 changes: 10 additions & 14 deletions docs/mllib-linear-methods.md
@@ -163,11 +163,8 @@ object, and make predictions with the resulting model to compute the training
error.

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils

// Load training data in LIBSVM format.
@@ -231,15 +228,13 @@ calling `.rdd()` on your `JavaRDD` object. A self-contained application example
that is equivalent to the provided example in Scala is given below:

{% highlight java %}
import java.util.Random;

import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.*;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.linalg.Vector;

import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.SparkConf;
@@ -282,8 +277,8 @@ public class SVMClassifier {
System.out.println("Area under ROC = " + auROC);

// Save and load model
model.save(sc.sc(), "myModelPath");
SVMModel sameModel = SVMModel.load(sc.sc(), "myModelPath");
model.save(sc, "myModelPath");
SVMModel sameModel = SVMModel.load(sc, "myModelPath");
}
}
{% endhighlight %}
@@ -315,15 +310,12 @@ a dependency.
</div>

<div data-lang="python" markdown="1">
The following example shows how to load a sample dataset, build Logistic Regression model,
The following example shows how to load a sample dataset, build SVM model,
and make predictions with the resulting model to compute the training error.

Note that the Python API does not yet support model save/load but will in the future.

{% highlight python %}
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# Load and parse the data
def parsePoint(line):
@@ -334,12 +326,16 @@ data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithSGD.train(parsedData)
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath")
sameModel = SVMModel.load(sc, "myModelPath")
{% endhighlight %}
</div>
</div>
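Since the Scala hunk above only shows the trimmed import list, here is a minimal end-to-end sketch of the pattern this section describes (train an SVM, compute area under ROC, then save and reload), assuming `sc` is an existing `SparkContext` and the standard sample LIBSVM file is available:

```scala
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils

// Assumes `sc: SparkContext` already exists.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

// Train an SVM with SGD.
val numIterations = 100
val model = SVMWithSGD.train(data, numIterations)

// Score the training set and compute area under the ROC curve.
val scoreAndLabels = data.map { point =>
  (model.predict(point.features), point.label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println("Area under ROC = " + metrics.areaUnderROC())

// Save and reload the model, mirroring the save/load calls touched above.
model.save(sc, "myModelPath")
val sameModel = SVMModel.load(sc, "myModelPath")
```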
2 changes: 1 addition & 1 deletion docs/mllib-naive-bayes.md
@@ -53,7 +53,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)

val model = NaiveBayes.train(training, lambda = 1.0, model = "multinomial")
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
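The rename above matters because `NaiveBayes.train` takes the variant by name. A short sketch with the corrected parameter, assuming `training` is an `RDD[LabeledPoint]` as in the guide (the Bernoulli variant is shown for contrast and is available in this release):

```scala
import org.apache.spark.mllib.classification.NaiveBayes

// Assumes `training: RDD[LabeledPoint]` already exists, as in the guide's example.
// The keyword argument is `modelType`, not `model`.
val multinomialModel = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
val bernoulliModel = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")
```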
96 changes: 96 additions & 0 deletions examples/src/main/python/ml/cross_validator.py
@@ -0,0 +1,96 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Row, SQLContext

"""
A simple example demonstrating model selection using CrossValidator.
This example also demonstrates how Pipelines are Estimators.
Run with:
bin/spark-submit examples/src/main/python/ml/cross_validator.py
"""

if __name__ == "__main__":
sc = SparkContext(appName="CrossValidatorExample")
sqlContext = SQLContext(sc)

# Prepare training documents, which are labeled.
LabeledDocument = Row("id", "text", "label")
training = sc.parallelize([(0, "a b c d e spark", 1.0),
(1, "b d", 0.0),
(2, "spark f g h", 1.0),
(3, "hadoop mapreduce", 0.0),
(4, "b spark who", 1.0),
(5, "g d a y", 0.0),
(6, "spark fly", 1.0),
(7, "was mapreduce", 0.0),
(8, "e spark program", 1.0),
(9, "a e c l", 0.0),
(10, "spark compile", 1.0),
(11, "hadoop software", 0.0)
]) \
.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
.addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
.addGrid(lr.regParam, [0.1, 0.01]) \
.build()

crossval = CrossValidator(estimator=pipeline,
estimatorParamMaps=paramGrid,
evaluator=BinaryClassificationEvaluator(),
numFolds=2) # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
Document = Row("id", "text")
test = sc.parallelize([(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
(7L, "apache hadoop")]) \
.map(lambda x: Document(*x)).toDF()

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
print(row)

sc.stop()
4 changes: 2 additions & 2 deletions examples/src/main/python/ml/simple_params_example.py
@@ -41,8 +41,8 @@

# prepare training data.
# We create an RDD of LabeledPoints and convert them into a DataFrame.
# Spark DataFrames can automatically infer the schema from named tuples
# and LabeledPoint implements __reduce__ to behave like a named tuple.
# A LabeledPoint is an Object with two fields named label and features
# and Spark SQL identifies these fields and creates the schema appropriately.
training = sc.parallelize([
LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),