Merge branch 'master' of github.com:apache/spark into viz-emphasize-rdd
Andrew Or committed May 6, 2015
2 parents 565801f + 7740996 commit f23e15b
Showing 137 changed files with 35,960 additions and 573 deletions.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -13,6 +13,7 @@ exportMethods("cache",
"collect",
"columns",
"count",
"describe",
"distinct",
"dtypes",
"except",
37 changes: 37 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
})

#' describe
#'
#' Computes statistics for numeric columns.
#' If no columns are given, this function computes statistics for all numeric columns.
#'
#' @param x A DataFrame to compute statistics on.
#' @param col A string giving the name of a column to summarize.
#' @param ... Additional column names.
#' @return A DataFrame containing the computed statistics.
#' @rdname describe
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' describe(df)
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
setMethod("describe",
signature(x = "DataFrame", col = "character"),
function(x, col, ...) {
colList <- list(col, ...)
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
dataFrame(sdf)
})

#' @rdname describe
setMethod("describe",
signature(x = "DataFrame"),
function(x) {
colList <- as.list(c(columns(x)))
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
dataFrame(sdf)
})
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
#' @export
setGeneric("columns", function(x) {standardGeneric("columns") })

#' @rdname describe
#' @export
setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

#' @rdname schema
#' @export
setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
11 changes: 11 additions & 0 deletions R/pkg/inst/tests/test_sparkSQL.R
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
expect_true(count(parquetDF) == count(df)*2)
})

test_that("describe() on a DataFrame", {
df <- jsonFile(sqlCtx, jsonPath)
stats <- describe(df, "age")
expect_true(collect(stats)[1, "summary"] == "count")
expect_true(collect(stats)[2, "age"] == 24.5)
expect_true(collect(stats)[3, "age"] == 5.5)
stats <- describe(df)
expect_true(collect(stats)[4, "name"] == "Andy")
expect_true(collect(stats)[5, "age"] == 30.0)
})

unlink(parquetPath)
unlink(jsonPath)
@@ -143,9 +143,7 @@ function renderDagViz(forJob) {
resizeSvg(svg);
}

/*
* Render the RDD DAG visualization on the stage page.
*/
/* Render the RDD DAG visualization on the stage page. */
function renderDagVizForStage(svgContainer) {
var metadata = metadataContainer().select(".stage-metadata");
var dot = metadata.select(".dot-file").text();
8 changes: 5 additions & 3 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -407,15 +407,17 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient

if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")

// "_jobProgressListener" should be set up before creating SparkEnv because when creating
// "SparkEnv", some messages will be posted to "listenerBus" and we should not miss them.
_jobProgressListener = new JobProgressListener(_conf)
listenerBus.addListener(jobProgressListener)

// Create the Spark execution environment (cache, map output tracker, etc)
_env = createSparkEnv(_conf, isLocal, listenerBus)
SparkEnv.set(_env)

_metadataCleaner = new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup, _conf)

_jobProgressListener = new JobProgressListener(_conf)
listenerBus.addListener(jobProgressListener)

_statusTracker = new SparkStatusTracker(this)

_progressBar =
41 changes: 41 additions & 0 deletions core/src/main/scala/org/apache/spark/annotation/Private.java
@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.annotation;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

/**
* A class that is considered private to the internals of Spark -- there is a high likelihood
* that it will be changed in future versions of Spark.
*
* This should be used only when the standard Scala / Java means of protecting classes are
* insufficient. In particular, Java has no equivalent of private[spark], so we use this annotation
* in its place.
*
* NOTE: If there exists a Scaladoc comment that immediately precedes this annotation, the first
* line of the comment must be ":: Private ::" with no trailing blank line. This is because
* of the known issue that Scaladoc displays only either the annotation or the comment, whichever
* comes first.
*/
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE, ElementType.FIELD, ElementType.METHOD, ElementType.PARAMETER,
ElementType.CONSTRUCTOR, ElementType.LOCAL_VARIABLE, ElementType.PACKAGE})
public @interface Private {}
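
For illustration only, here is a minimal sketch of how the annotation and the ":: Private ::" Scaladoc convention described above would be combined on a Scala class; InternalHelper is an invented name, not something added by this commit:

import org.apache.spark.annotation.Private

/**
 * :: Private ::
 * A helper that is internal to Spark and may change or be removed in any release.
 */
@Private
class InternalHelper {
  def doWork(): Unit = { }
}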
@@ -125,6 +125,13 @@ class KryoSerializer(conf: SparkConf)
override def newInstance(): SerializerInstance = {
new KryoSerializerInstance(this)
}

private[spark] override lazy val supportsRelocationOfSerializedObjects: Boolean = {
// If auto-reset is disabled, then Kryo may store references to duplicate occurrences of objects
// in the stream rather than writing those objects' serialized bytes, breaking relocation. See
// https://groups.google.com/d/msg/kryo-users/6ZUSyfjjtdo/FhGG1KHDXPgJ for more details.
newInstance().asInstanceOf[KryoSerializerInstance].getAutoReset()
}
}

private[spark]
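For context, a registrator along the following lines is presumably what KryoTest.RegistratorWithoutAutoReset (used by the new test suite below, but not shown in this diff) does: disabling auto-reset makes getAutoReset(), and therefore supportsRelocationOfSerializedObjects, return false. It would be wired in through the existing spark.kryo.registrator setting, as the test does. This is a sketch, not the actual test helper:

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.serializer.KryoRegistrator

// With auto-reset disabled, Kryo may emit back-references to objects written earlier in
// the stream, so serialized records can no longer be safely reordered (relocated).
class NoAutoResetRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = {
    kryo.setAutoReset(false)
  }
}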
35 changes: 34 additions & 1 deletion core/src/main/scala/org/apache/spark/serializer/Serializer.scala
@@ -23,7 +23,7 @@ import java.nio.ByteBuffer
import scala.reflect.ClassTag

import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.annotation.{DeveloperApi, Private}
import org.apache.spark.util.{Utils, ByteBufferInputStream, NextIterator}

/**
@@ -63,6 +63,39 @@ abstract class Serializer {

/** Creates a new [[SerializerInstance]]. */
def newInstance(): SerializerInstance

/**
* :: Private ::
* Returns true if this serializer supports relocation of its serialized objects and false
* otherwise. This should return true if and only if reordering the bytes of serialized objects
* in serialization stream output is equivalent to having re-ordered those elements prior to
* serializing them. More specifically, the following should hold if a serializer supports
* relocation:
*
* {{{
* serOut.open()
* position = 0
* serOut.write(obj1)
* serOut.flush()
* position = # of bytes written to stream so far
* obj1Bytes = output[0:position-1]
* serOut.write(obj2)
* serOut.flush()
* position2 = # of bytes written to stream so far
* obj2Bytes = output[position:position2-1]
* serIn.open([obj2Bytes] concatenate [obj1Bytes]) should return (obj2, obj1)
* }}}
*
* In general, this property should hold for serializers that are stateless and that do not
* write special metadata at the beginning or end of the serialization stream.
*
* This API is private to Spark; this method should not be overridden in third-party subclasses
* or called in user code and is subject to removal in future Spark releases.
*
* See SPARK-7311 for more details.
*/
@Private
private[spark] def supportsRelocationOfSerializedObjects: Boolean = false
}


@@ -131,8 +131,7 @@ private[spark] class ExternalSorter[K, V, C](
private val kvChunkSize = conf.getInt("spark.shuffle.sort.kvChunkSize", 1 << 22) // 4 MB
private val useSerializedPairBuffer =
!ordering.isDefined && conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true) &&
ser.isInstanceOf[KryoSerializer] &&
serInstance.asInstanceOf[KryoSerializerInstance].getAutoReset
ser.supportsRelocationOfSerializedObjects

// Data structures to store in-memory objects before we spill. Depending on whether we have an
// Aggregator set, we either put objects into an AppendOnlyMap where we combine them, or we
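For context, the serialized pair buffer above is only used when all three conditions hold: no ordering is defined, spark.shuffle.sort.serializeMapOutputs is true (the default), and the configured serializer reports that it supports relocation. The following configuration sketch satisfies the serializer-side condition, assuming Kryo's default auto-reset behavior:

import org.apache.spark.SparkConf

// Kryo with auto-reset enabled (the default) reports
// supportsRelocationOfSerializedObjects = true, so ExternalSorter may pick the
// serialized pair buffer for map output.
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.shuffle.sort.serializeMapOutputs", "true")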
@@ -17,9 +17,11 @@

package org.apache.spark.broadcast

import scala.concurrent.duration._
import scala.util.Random

import org.scalatest.{Assertions, FunSuite}
import org.scalatest.concurrent.Eventually._

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkEnv}
import org.apache.spark.io.SnappyCompressionCodec
@@ -307,7 +309,17 @@ class BroadcastSuite extends FunSuite with LocalSparkContext {
removeFromDriver: Boolean) {

sc = if (distributed) {
new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", broadcastConf)
val _sc =
new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", broadcastConf)
// Wait until all slaves are up
eventually(timeout(10.seconds), interval(10.milliseconds)) {
_sc.jobProgressListener.synchronized {
val numBlockManagers = _sc.jobProgressListener.blockManagerIds.size
assert(numBlockManagers == numSlaves + 1,
s"Expect ${numSlaves + 1} block managers, but was ${numBlockManagers}")
}
}
_sc
} else {
new SparkContext("local", "test", broadcastConf)
}
@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import scala.util.Random

import org.scalatest.{Assertions, FunSuite}

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoTest.RegistratorWithoutAutoReset

/**
* Tests to ensure that [[Serializer]] implementations obey the API contracts for methods that
* describe properties of the serialized stream, such as
* [[Serializer.supportsRelocationOfSerializedObjects]].
*/
class SerializerPropertiesSuite extends FunSuite {

import SerializerPropertiesSuite._

test("JavaSerializer does not support relocation") {
// Per a comment on the SPARK-4550 JIRA ticket, Java serialization appears to write out the
// full class name the first time an object is written to an output stream, but subsequent
// references to the class write a more compact identifier; this prevents relocation.
val ser = new JavaSerializer(new SparkConf())
testSupportsRelocationOfSerializedObjects(ser, generateRandomItem)
}

test("KryoSerializer supports relocation when auto-reset is enabled") {
val ser = new KryoSerializer(new SparkConf)
assert(ser.newInstance().asInstanceOf[KryoSerializerInstance].getAutoReset())
testSupportsRelocationOfSerializedObjects(ser, generateRandomItem)
}

test("KryoSerializer does not support relocation when auto-reset is disabled") {
val conf = new SparkConf().set("spark.kryo.registrator",
classOf[RegistratorWithoutAutoReset].getName)
val ser = new KryoSerializer(conf)
assert(!ser.newInstance().asInstanceOf[KryoSerializerInstance].getAutoReset())
testSupportsRelocationOfSerializedObjects(ser, generateRandomItem)
}

}

object SerializerPropertiesSuite extends Assertions {

def generateRandomItem(rand: Random): Any = {
val randomFunctions: Seq[() => Any] = Seq(
() => rand.nextInt(),
() => rand.nextString(rand.nextInt(10)),
() => rand.nextDouble(),
() => rand.nextBoolean(),
() => (rand.nextInt(), rand.nextString(rand.nextInt(10))),
() => MyCaseClass(rand.nextInt(), rand.nextString(rand.nextInt(10))),
() => {
val x = MyCaseClass(rand.nextInt(), rand.nextString(rand.nextInt(10)))
(x, x)
}
)
randomFunctions(rand.nextInt(randomFunctions.size)).apply()
}

def testSupportsRelocationOfSerializedObjects(
serializer: Serializer,
generateRandomItem: Random => Any): Unit = {
if (!serializer.supportsRelocationOfSerializedObjects) {
return
}
val NUM_TRIALS = 5
val rand = new Random(42)
for (_ <- 1 to NUM_TRIALS) {
val items = {
// Make sure that we have duplicate occurrences of the same object in the stream:
val randomItems = Seq.fill(10)(generateRandomItem(rand))
randomItems ++ randomItems.take(5)
}
val baos = new ByteArrayOutputStream()
val serStream = serializer.newInstance().serializeStream(baos)
def serializeItem(item: Any): Array[Byte] = {
val itemStartOffset = baos.toByteArray.length
serStream.writeObject(item)
serStream.flush()
val itemEndOffset = baos.toByteArray.length
baos.toByteArray.slice(itemStartOffset, itemEndOffset).clone()
}
val itemsAndSerializedItems: Seq[(Any, Array[Byte])] = {
val serItems = items.map {
item => (item, serializeItem(item))
}
serStream.close()
rand.shuffle(serItems)
}
val reorderedSerializedData: Array[Byte] = itemsAndSerializedItems.flatMap(_._2).toArray
val deserializedItemsStream = serializer.newInstance().deserializeStream(
new ByteArrayInputStream(reorderedSerializedData))
assert(deserializedItemsStream.asIterator.toSeq === itemsAndSerializedItems.map(_._1))
deserializedItemsStream.close()
}
}
}

private case class MyCaseClass(foo: Int, bar: String)
@@ -52,10 +52,12 @@ private[ml] trait CrossValidatorParams extends Params {
def getEstimatorParamMaps: Array[ParamMap] = $(estimatorParamMaps)

/**
* param for the evaluator for selection
* param for the evaluator used to select hyper-parameters that maximize the cross-validated
* metric
* @group param
*/
val evaluator: Param[Evaluator] = new Param(this, "evaluator", "evaluator for selection")
val evaluator: Param[Evaluator] = new Param(this, "evaluator",
"evaluator used to select hyper-parameters that maximize the cross-validated metric")

/** @group getParam */
def getEvaluator: Evaluator = $(evaluator)
@@ -120,6 +122,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorParams
trainingDataset.unpersist()
var i = 0
while (i < numModels) {
// TODO: duplicate evaluator to take extra params from input
val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)))
logDebug(s"Got metric $metric for model trained with ${epm(i)}.")
metrics(i) += metric
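As a usage sketch only (the estimator, evaluator, and parameter grid here are illustrative and not part of this commit), the evaluator parameter documented above is set like any other Param, and its cross-validated metric is what CrossValidator maximizes when selecting the best model:

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// The metric produced by the evaluator drives the choice of the best ParamMap.
val lr = new LogisticRegression()
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .build()
val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(new BinaryClassificationEvaluator())
  .setEstimatorParamMaps(grid)
  .setNumFolds(3)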